{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5995203836930456, "eval_steps": 250, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003996802557953637, "grad_norm": 27253.678894810444, "learning_rate": 0.0, "loss": 88.7727, "num_input_tokens_seen": 173048, "step": 1 }, { "epoch": 0.0003996802557953637, "eval_websight_new_IoU": 0.007802221458405256, "eval_websight_new_MAE_all": 0.22719886153936386, "eval_websight_new_MAE_h": 0.1250949464738369, "eval_websight_new_MAE_w": 0.26685621589422226, "eval_websight_new_MAE_x": 0.23003952950239182, "eval_websight_new_MAE_y": 0.28680478781461716, "eval_websight_new_NUM_probability": 2.108378305276659e-09, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 42.4260139465332, "eval_websight_new_loss_ce": 5.392822742462158, "eval_websight_new_loss_xval": 39.01171875, "eval_websight_new_runtime": 64.218, "eval_websight_new_samples_per_second": 0.779, "eval_websight_new_steps_per_second": 0.031, "num_input_tokens_seen": 173048, "step": 1 }, { "epoch": 0.0003996802557953637, "eval_seeclick_IoU": 0.013260291889309883, "eval_seeclick_MAE_all": 0.3815549612045288, "eval_seeclick_MAE_h": 0.40597620606422424, "eval_seeclick_MAE_w": 0.4406091570854187, "eval_seeclick_MAE_x": 0.3167571872472763, "eval_seeclick_MAE_y": 0.3628772497177124, "eval_seeclick_NUM_probability": 2.7780518996323167e-09, "eval_seeclick_inside_bbox": 0.015625, "eval_seeclick_loss": 46.768917083740234, "eval_seeclick_loss_ce": 6.483319997787476, "eval_seeclick_loss_xval": 39.03125, "eval_seeclick_runtime": 84.0912, "eval_seeclick_samples_per_second": 0.595, "eval_seeclick_steps_per_second": 0.024, "num_input_tokens_seen": 173048, "step": 1 }, { "epoch": 0.0003996802557953637, "eval_icons_IoU": 9.602530917618424e-05, "eval_icons_MAE_all": 0.24617066234350204, "eval_icons_MAE_h": 0.16503974795341492, "eval_icons_MAE_w": 0.14773830771446228, "eval_icons_MAE_x": 0.3416582942008972, "eval_icons_MAE_y": 0.3302464038133621, "eval_icons_NUM_probability": 4.84057904870383e-10, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 22.6812686920166, "eval_icons_loss_ce": 5.316617250442505, "eval_icons_loss_xval": 18.2421875, "eval_icons_runtime": 82.5017, "eval_icons_samples_per_second": 0.606, "eval_icons_steps_per_second": 0.024, "num_input_tokens_seen": 173048, "step": 1 }, { "epoch": 0.0003996802557953637, "loss": 22.062753677368164, "loss_ce": 5.414316177368164, "loss_xval": 16.625, "num_input_tokens_seen": 173048, "step": 1 }, { "epoch": 0.0007993605115907274, "grad_norm": 41154.27591780116, "learning_rate": 6.276845846337281e-07, "loss": 59.1779, "num_input_tokens_seen": 346016, "step": 2 }, { "epoch": 0.0007993605115907274, "loss": 79.56485748291016, "loss_ce": 5.002355575561523, "loss_xval": 74.5, "num_input_tokens_seen": 346016, "step": 2 }, { "epoch": 0.001199040767386091, "grad_norm": 32361.98582055134, "learning_rate": 9.94856528925194e-07, "loss": 59.2298, "num_input_tokens_seen": 519088, "step": 3 }, { "epoch": 0.001199040767386091, "loss": 53.204307556152344, "loss_ce": 5.1574320793151855, "loss_xval": 48.0, "num_input_tokens_seen": 519088, "step": 3 }, { "epoch": 0.0015987210231814548, "grad_norm": 18680.271876014704, "learning_rate": 1.2553691692674561e-06, "loss": 41.3081, "num_input_tokens_seen": 691920, "step": 4 }, { "epoch": 0.0015987210231814548, "loss": 39.82504653930664, "loss_ce": 4.950046539306641, "loss_xval": 35.0, "num_input_tokens_seen": 691920, "step": 4 }, { "epoch": 0.0019984012789768186, "grad_norm": 5449.561908994174, "learning_rate": 1.4574384717887574e-06, "loss": 27.6026, "num_input_tokens_seen": 864928, "step": 5 }, { "epoch": 0.0019984012789768186, "loss": 28.755176544189453, "loss_ce": 4.9895524978637695, "loss_xval": 23.75, "num_input_tokens_seen": 864928, "step": 5 }, { "epoch": 0.002398081534772182, "grad_norm": 3825.7071201245612, "learning_rate": 1.622541113558922e-06, "loss": 22.1129, "num_input_tokens_seen": 1037544, "step": 6 }, { "epoch": 0.002398081534772182, "loss": 20.575275421142578, "loss_ce": 5.247150421142578, "loss_xval": 15.3125, "num_input_tokens_seen": 1037544, "step": 6 }, { "epoch": 0.002797761790567546, "grad_norm": 3619.914754340841, "learning_rate": 1.762133408171179e-06, "loss": 21.0579, "num_input_tokens_seen": 1209928, "step": 7 }, { "epoch": 0.002797761790567546, "loss": 20.604846954345703, "loss_ce": 5.347036361694336, "loss_xval": 15.25, "num_input_tokens_seen": 1209928, "step": 7 }, { "epoch": 0.0031974420463629096, "grad_norm": 2391.231418170818, "learning_rate": 1.8830537539011838e-06, "loss": 19.8215, "num_input_tokens_seen": 1382680, "step": 8 }, { "epoch": 0.0031974420463629096, "loss": 20.428314208984375, "loss_ce": 5.27987813949585, "loss_xval": 15.125, "num_input_tokens_seen": 1382680, "step": 8 }, { "epoch": 0.0035971223021582736, "grad_norm": 2056.06429238663, "learning_rate": 1.989713057850388e-06, "loss": 17.948, "num_input_tokens_seen": 1555768, "step": 9 }, { "epoch": 0.0035971223021582736, "loss": 18.224111557006836, "loss_ce": 5.302236557006836, "loss_xval": 12.9375, "num_input_tokens_seen": 1555768, "step": 9 }, { "epoch": 0.003996802557953637, "grad_norm": 689.953934673361, "learning_rate": 2.085123056422486e-06, "loss": 15.8543, "num_input_tokens_seen": 1729384, "step": 10 }, { "epoch": 0.003996802557953637, "loss": 15.537797927856445, "loss_ce": 5.276078701019287, "loss_xval": 10.25, "num_input_tokens_seen": 1729384, "step": 10 }, { "epoch": 0.004396482813749001, "grad_norm": 785.0266162674142, "learning_rate": 2.1714318986131375e-06, "loss": 15.001, "num_input_tokens_seen": 1902192, "step": 11 }, { "epoch": 0.004396482813749001, "loss": 14.884502410888672, "loss_ce": 5.197001934051514, "loss_xval": 9.6875, "num_input_tokens_seen": 1902192, "step": 11 }, { "epoch": 0.004796163069544364, "grad_norm": 2016.36977346388, "learning_rate": 2.25022569819265e-06, "loss": 16.9301, "num_input_tokens_seen": 2075288, "step": 12 }, { "epoch": 0.004796163069544364, "loss": 16.57305145263672, "loss_ce": 5.272271156311035, "loss_xval": 11.3125, "num_input_tokens_seen": 2075288, "step": 12 }, { "epoch": 0.005195843325339729, "grad_norm": 3023.4049224405285, "learning_rate": 2.3227089674435414e-06, "loss": 18.3376, "num_input_tokens_seen": 2247968, "step": 13 }, { "epoch": 0.005195843325339729, "loss": 18.748249053955078, "loss_ce": 5.170123100280762, "loss_xval": 13.5625, "num_input_tokens_seen": 2247968, "step": 13 }, { "epoch": 0.005595523581135092, "grad_norm": 3434.7471291290594, "learning_rate": 2.389817992804907e-06, "loss": 19.9897, "num_input_tokens_seen": 2420728, "step": 14 }, { "epoch": 0.005595523581135092, "loss": 20.42713165283203, "loss_ce": 5.231818675994873, "loss_xval": 15.1875, "num_input_tokens_seen": 2420728, "step": 14 }, { "epoch": 0.005995203836930456, "grad_norm": 2501.863798675274, "learning_rate": 2.4522950007139517e-06, "loss": 16.8772, "num_input_tokens_seen": 2593888, "step": 15 }, { "epoch": 0.005995203836930456, "loss": 16.648954391479492, "loss_ce": 5.109891891479492, "loss_xval": 11.5625, "num_input_tokens_seen": 2593888, "step": 15 }, { "epoch": 0.006394884092725819, "grad_norm": 1750.9020613468717, "learning_rate": 2.5107383385349122e-06, "loss": 14.1557, "num_input_tokens_seen": 2766600, "step": 16 }, { "epoch": 0.006394884092725819, "loss": 13.727005958557129, "loss_ce": 5.047318458557129, "loss_xval": 8.6875, "num_input_tokens_seen": 2766600, "step": 16 }, { "epoch": 0.006794564348521183, "grad_norm": 227.32613482390494, "learning_rate": 2.5656374157160176e-06, "loss": 12.464, "num_input_tokens_seen": 2939256, "step": 17 }, { "epoch": 0.006794564348521183, "loss": 12.111137390136719, "loss_ce": 5.044731140136719, "loss_xval": 7.0625, "num_input_tokens_seen": 2939256, "step": 17 }, { "epoch": 0.007194244604316547, "grad_norm": 1252.224814993924, "learning_rate": 2.6173976424841156e-06, "loss": 13.1703, "num_input_tokens_seen": 3112192, "step": 18 }, { "epoch": 0.007194244604316547, "loss": 12.594108581542969, "loss_ce": 4.973014831542969, "loss_xval": 7.625, "num_input_tokens_seen": 3112192, "step": 18 }, { "epoch": 0.007593924860111911, "grad_norm": 2028.8518634432514, "learning_rate": 2.6663586168300222e-06, "loss": 14.8689, "num_input_tokens_seen": 3282040, "step": 19 }, { "epoch": 0.007593924860111911, "loss": 14.035215377807617, "loss_ce": 4.902403354644775, "loss_xval": 9.125, "num_input_tokens_seen": 3282040, "step": 19 }, { "epoch": 0.007993605115907274, "grad_norm": 2398.1141656415566, "learning_rate": 2.712807641056214e-06, "loss": 15.648, "num_input_tokens_seen": 3455080, "step": 20 }, { "epoch": 0.007993605115907274, "loss": 15.920930862426758, "loss_ce": 4.975618839263916, "loss_xval": 10.9375, "num_input_tokens_seen": 3455080, "step": 20 }, { "epoch": 0.008393285371702638, "grad_norm": 2410.923655837602, "learning_rate": 2.756989937096373e-06, "loss": 15.3804, "num_input_tokens_seen": 3628016, "step": 21 }, { "epoch": 0.008393285371702638, "loss": 15.361221313476562, "loss_ce": 4.939347267150879, "loss_xval": 10.4375, "num_input_tokens_seen": 3628016, "step": 21 }, { "epoch": 0.008792965627498001, "grad_norm": 1750.9570099084196, "learning_rate": 2.799116483246866e-06, "loss": 13.9054, "num_input_tokens_seen": 3797864, "step": 22 }, { "epoch": 0.008792965627498001, "loss": 13.663543701171875, "loss_ce": 4.839325904846191, "loss_xval": 8.8125, "num_input_tokens_seen": 3797864, "step": 22 }, { "epoch": 0.009192645883293365, "grad_norm": 1467.4142596254105, "learning_rate": 2.8393701074525802e-06, "loss": 12.8355, "num_input_tokens_seen": 3970640, "step": 23 }, { "epoch": 0.009192645883293365, "loss": 12.972114562988281, "loss_ce": 4.925239562988281, "loss_xval": 8.0625, "num_input_tokens_seen": 3970640, "step": 23 }, { "epoch": 0.009592326139088728, "grad_norm": 573.4862114776328, "learning_rate": 2.8779102828263783e-06, "loss": 12.2069, "num_input_tokens_seen": 4143328, "step": 24 }, { "epoch": 0.009592326139088728, "loss": 12.610280990600586, "loss_ce": 4.926686763763428, "loss_xval": 7.6875, "num_input_tokens_seen": 4143328, "step": 24 }, { "epoch": 0.009992006394884092, "grad_norm": 279.5844578074333, "learning_rate": 2.914876943577515e-06, "loss": 11.6072, "num_input_tokens_seen": 4315720, "step": 25 }, { "epoch": 0.009992006394884092, "loss": 12.251864433288574, "loss_ce": 4.790926933288574, "loss_xval": 7.46875, "num_input_tokens_seen": 4315720, "step": 25 }, { "epoch": 0.010391686650679457, "grad_norm": 745.4356271224037, "learning_rate": 2.9503935520772694e-06, "loss": 11.3671, "num_input_tokens_seen": 4489000, "step": 26 }, { "epoch": 0.010391686650679457, "loss": 11.730391502380371, "loss_ce": 4.703047275543213, "loss_xval": 7.03125, "num_input_tokens_seen": 4489000, "step": 26 }, { "epoch": 0.01079136690647482, "grad_norm": 1260.0413305023137, "learning_rate": 2.9845695867755812e-06, "loss": 12.7243, "num_input_tokens_seen": 4661752, "step": 27 }, { "epoch": 0.01079136690647482, "loss": 12.388510704040527, "loss_ce": 4.7322611808776855, "loss_xval": 7.65625, "num_input_tokens_seen": 4661752, "step": 27 }, { "epoch": 0.011191047162270184, "grad_norm": 1537.0491298782129, "learning_rate": 3.017502577438635e-06, "loss": 12.7051, "num_input_tokens_seen": 4834536, "step": 28 }, { "epoch": 0.011191047162270184, "loss": 13.197071075439453, "loss_ce": 4.650196075439453, "loss_xval": 8.5625, "num_input_tokens_seen": 4834536, "step": 28 }, { "epoch": 0.011590727418065548, "grad_norm": 1481.2643398938644, "learning_rate": 3.0492797830851952e-06, "loss": 12.6727, "num_input_tokens_seen": 5007272, "step": 29 }, { "epoch": 0.011590727418065548, "loss": 11.455208778381348, "loss_ce": 4.564583778381348, "loss_xval": 6.875, "num_input_tokens_seen": 5007272, "step": 29 }, { "epoch": 0.011990407673860911, "grad_norm": 1043.4564438418174, "learning_rate": 3.079979585347679e-06, "loss": 11.1842, "num_input_tokens_seen": 5178384, "step": 30 }, { "epoch": 0.011990407673860911, "loss": 11.960172653198242, "loss_ce": 4.397672653198242, "loss_xval": 7.5625, "num_input_tokens_seen": 5178384, "step": 30 }, { "epoch": 0.012390087929656275, "grad_norm": 792.8424890447506, "learning_rate": 3.1096726532791336e-06, "loss": 10.8114, "num_input_tokens_seen": 5350952, "step": 31 }, { "epoch": 0.012390087929656275, "loss": 10.764305114746094, "loss_ce": 4.383445739746094, "loss_xval": 6.375, "num_input_tokens_seen": 5350952, "step": 31 }, { "epoch": 0.012789768185451638, "grad_norm": 145.9379236072859, "learning_rate": 3.13842292316864e-06, "loss": 10.5285, "num_input_tokens_seen": 5524152, "step": 32 }, { "epoch": 0.012789768185451638, "loss": 11.266403198242188, "loss_ce": 4.449997901916504, "loss_xval": 6.8125, "num_input_tokens_seen": 5524152, "step": 32 }, { "epoch": 0.013189448441247002, "grad_norm": 261.9315218162029, "learning_rate": 3.1662884275383315e-06, "loss": 9.8499, "num_input_tokens_seen": 5697032, "step": 33 }, { "epoch": 0.013189448441247002, "loss": 9.855939865112305, "loss_ce": 4.297346591949463, "loss_xval": 5.5625, "num_input_tokens_seen": 5697032, "step": 33 }, { "epoch": 0.013589128697042365, "grad_norm": 403.77946842148157, "learning_rate": 3.1933220003497456e-06, "loss": 9.9114, "num_input_tokens_seen": 5869808, "step": 34 }, { "epoch": 0.013589128697042365, "loss": 10.173905372619629, "loss_ce": 4.193436622619629, "loss_xval": 5.96875, "num_input_tokens_seen": 5869808, "step": 34 }, { "epoch": 0.013988808952837729, "grad_norm": 1035.8631252474581, "learning_rate": 3.2195718799599367e-06, "loss": 10.7195, "num_input_tokens_seen": 6043328, "step": 35 }, { "epoch": 0.013988808952837729, "loss": 10.887596130371094, "loss_ce": 4.102439880371094, "loss_xval": 6.78125, "num_input_tokens_seen": 6043328, "step": 35 }, { "epoch": 0.014388489208633094, "grad_norm": 1149.9950193906377, "learning_rate": 3.245082227117844e-06, "loss": 10.5161, "num_input_tokens_seen": 6216344, "step": 36 }, { "epoch": 0.014388489208633094, "loss": 10.513891220092773, "loss_ce": 3.982640504837036, "loss_xval": 6.53125, "num_input_tokens_seen": 6216344, "step": 36 }, { "epoch": 0.014788169464428458, "grad_norm": 889.0168898022567, "learning_rate": 3.2698935719735842e-06, "loss": 10.6335, "num_input_tokens_seen": 6389384, "step": 37 }, { "epoch": 0.014788169464428458, "loss": 10.846721649169922, "loss_ce": 3.956096649169922, "loss_xval": 6.875, "num_input_tokens_seen": 6389384, "step": 37 }, { "epoch": 0.015187849720223821, "grad_norm": 962.657302962428, "learning_rate": 3.29404320146375e-06, "loss": 9.9492, "num_input_tokens_seen": 6562488, "step": 38 }, { "epoch": 0.015187849720223821, "loss": 10.308053970336914, "loss_ce": 4.020944595336914, "loss_xval": 6.28125, "num_input_tokens_seen": 6562488, "step": 38 }, { "epoch": 0.015587529976019185, "grad_norm": 194.46637819778587, "learning_rate": 3.3175654963687346e-06, "loss": 9.1389, "num_input_tokens_seen": 6735512, "step": 39 }, { "epoch": 0.015587529976019185, "loss": 9.079648971557617, "loss_ce": 3.8530867099761963, "loss_xval": 5.21875, "num_input_tokens_seen": 6735512, "step": 39 }, { "epoch": 0.01598721023181455, "grad_norm": 483.308185338452, "learning_rate": 3.340492225689942e-06, "loss": 9.9182, "num_input_tokens_seen": 6908304, "step": 40 }, { "epoch": 0.01598721023181455, "loss": 9.45429515838623, "loss_ce": 3.7667951583862305, "loss_xval": 5.6875, "num_input_tokens_seen": 6908304, "step": 40 }, { "epoch": 0.016386890487609912, "grad_norm": 664.1523962678483, "learning_rate": 3.3628528046722993e-06, "loss": 9.4767, "num_input_tokens_seen": 7081120, "step": 41 }, { "epoch": 0.016386890487609912, "loss": 9.607213020324707, "loss_ce": 3.615025043487549, "loss_xval": 6.0, "num_input_tokens_seen": 7081120, "step": 41 }, { "epoch": 0.016786570743405275, "grad_norm": 903.1201640859815, "learning_rate": 3.3846745217301015e-06, "loss": 10.2732, "num_input_tokens_seen": 7253696, "step": 42 }, { "epoch": 0.016786570743405275, "loss": 10.472900390625, "loss_ce": 3.750244617462158, "loss_xval": 6.71875, "num_input_tokens_seen": 7253696, "step": 42 }, { "epoch": 0.01718625099920064, "grad_norm": 871.80664974741, "learning_rate": 3.4059827386678244e-06, "loss": 10.2289, "num_input_tokens_seen": 7426136, "step": 43 }, { "epoch": 0.01718625099920064, "loss": 10.360870361328125, "loss_ce": 3.575714588165283, "loss_xval": 6.78125, "num_input_tokens_seen": 7426136, "step": 43 }, { "epoch": 0.017585931254996003, "grad_norm": 602.4666850744355, "learning_rate": 3.4268010678805934e-06, "loss": 9.0809, "num_input_tokens_seen": 7599104, "step": 44 }, { "epoch": 0.017585931254996003, "loss": 9.004478454589844, "loss_ce": 3.3775248527526855, "loss_xval": 5.625, "num_input_tokens_seen": 7599104, "step": 44 }, { "epoch": 0.017985611510791366, "grad_norm": 137.35583139064687, "learning_rate": 3.447151529639145e-06, "loss": 8.3695, "num_input_tokens_seen": 7772208, "step": 45 }, { "epoch": 0.017985611510791366, "loss": 8.689802169799805, "loss_ce": 3.4398021697998047, "loss_xval": 5.25, "num_input_tokens_seen": 7772208, "step": 45 }, { "epoch": 0.01838529176658673, "grad_norm": 478.5359664685662, "learning_rate": 3.4670546920863086e-06, "loss": 8.1446, "num_input_tokens_seen": 7944840, "step": 46 }, { "epoch": 0.01838529176658673, "loss": 7.91407585144043, "loss_ce": 3.4550914764404297, "loss_xval": 4.46875, "num_input_tokens_seen": 7944840, "step": 46 }, { "epoch": 0.018784972022382093, "grad_norm": 557.9408362107046, "learning_rate": 3.4865297961764146e-06, "loss": 8.3954, "num_input_tokens_seen": 8118024, "step": 47 }, { "epoch": 0.018784972022382093, "loss": 8.321405410766602, "loss_ce": 3.3067569732666016, "loss_xval": 5.0, "num_input_tokens_seen": 8118024, "step": 47 }, { "epoch": 0.019184652278177457, "grad_norm": 846.6195239782504, "learning_rate": 3.5055948674601067e-06, "loss": 8.578, "num_input_tokens_seen": 8290688, "step": 48 }, { "epoch": 0.019184652278177457, "loss": 9.086427688598633, "loss_ce": 3.121584892272949, "loss_xval": 5.96875, "num_input_tokens_seen": 8290688, "step": 48 }, { "epoch": 0.01958433253397282, "grad_norm": 590.9741857230598, "learning_rate": 3.524266816342358e-06, "loss": 7.7949, "num_input_tokens_seen": 8463320, "step": 49 }, { "epoch": 0.01958433253397282, "loss": 7.50385046005249, "loss_ce": 3.1073663234710693, "loss_xval": 4.40625, "num_input_tokens_seen": 8463320, "step": 49 }, { "epoch": 0.019984012789768184, "grad_norm": 261.24703891283673, "learning_rate": 3.542561528211243e-06, "loss": 7.8687, "num_input_tokens_seen": 8636560, "step": 50 }, { "epoch": 0.019984012789768184, "loss": 7.583156108856201, "loss_ce": 3.1534688472747803, "loss_xval": 4.4375, "num_input_tokens_seen": 8636560, "step": 50 }, { "epoch": 0.02038369304556355, "grad_norm": 322.9570931125352, "learning_rate": 3.5604939446412112e-06, "loss": 7.6471, "num_input_tokens_seen": 8809720, "step": 51 }, { "epoch": 0.02038369304556355, "eval_websight_new_IoU": 0.030084313824772835, "eval_websight_new_MAE_all": 0.1372687742114067, "eval_websight_new_MAE_h": 0.10130885243415833, "eval_websight_new_MAE_w": 0.14034898951649666, "eval_websight_new_MAE_x": 0.08298783376812935, "eval_websight_new_MAE_y": 0.2244294062256813, "eval_websight_new_NUM_probability": 1.1027492252679849e-08, "eval_websight_new_inside_bbox": 0.03125, "eval_websight_new_loss": 7.040191650390625, "eval_websight_new_loss_ce": 3.3154985904693604, "eval_websight_new_loss_xval": 3.492431640625, "eval_websight_new_runtime": 55.9794, "eval_websight_new_samples_per_second": 0.893, "eval_websight_new_steps_per_second": 0.036, "num_input_tokens_seen": 8809720, "step": 51 }, { "epoch": 0.02038369304556355, "eval_seeclick_IoU": 0.025128517299890518, "eval_seeclick_MAE_all": 0.20313503593206406, "eval_seeclick_MAE_h": 0.13093940913677216, "eval_seeclick_MAE_w": 0.2555273696780205, "eval_seeclick_MAE_x": 0.21082086116075516, "eval_seeclick_MAE_y": 0.21525250375270844, "eval_seeclick_NUM_probability": 1.2050559661247462e-08, "eval_seeclick_inside_bbox": 0.07465277798473835, "eval_seeclick_loss": 10.326189994812012, "eval_seeclick_loss_ce": 3.7456430196762085, "eval_seeclick_loss_xval": 7.052734375, "eval_seeclick_runtime": 82.3558, "eval_seeclick_samples_per_second": 0.607, "eval_seeclick_steps_per_second": 0.024, "num_input_tokens_seen": 8809720, "step": 51 }, { "epoch": 0.02038369304556355, "eval_icons_IoU": 0.0, "eval_icons_MAE_all": 0.12515902519226074, "eval_icons_MAE_h": 0.027381721884012222, "eval_icons_MAE_w": 0.05067274160683155, "eval_icons_MAE_x": 0.18365809321403503, "eval_icons_MAE_y": 0.23892351984977722, "eval_icons_NUM_probability": 1.5372147998959917e-08, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 7.085067272186279, "eval_icons_loss_ce": 3.12102210521698, "eval_icons_loss_xval": 3.6298828125, "eval_icons_runtime": 89.2513, "eval_icons_samples_per_second": 0.56, "eval_icons_steps_per_second": 0.022, "num_input_tokens_seen": 8809720, "step": 51 }, { "epoch": 0.02038369304556355, "loss": 5.90113639831543, "loss_ce": 3.1316046714782715, "loss_xval": 2.765625, "num_input_tokens_seen": 8809720, "step": 51 }, { "epoch": 0.020783373301358914, "grad_norm": 591.8732945437766, "learning_rate": 3.5780781367109973e-06, "loss": 7.3836, "num_input_tokens_seen": 8982736, "step": 52 }, { "epoch": 0.020783373301358914, "loss": 7.991217136383057, "loss_ce": 3.0302796363830566, "loss_xval": 4.96875, "num_input_tokens_seen": 8982736, "step": 52 }, { "epoch": 0.021183053557154278, "grad_norm": 626.8464467894951, "learning_rate": 3.5953273713375363e-06, "loss": 7.577, "num_input_tokens_seen": 9155480, "step": 53 }, { "epoch": 0.021183053557154278, "loss": 7.484159469604492, "loss_ce": 2.9724409580230713, "loss_xval": 4.5, "num_input_tokens_seen": 9155480, "step": 53 }, { "epoch": 0.02158273381294964, "grad_norm": 256.7768068798319, "learning_rate": 3.6122541714093096e-06, "loss": 6.6296, "num_input_tokens_seen": 9328328, "step": 54 }, { "epoch": 0.02158273381294964, "loss": 6.372915744781494, "loss_ce": 2.915884494781494, "loss_xval": 3.453125, "num_input_tokens_seen": 9328328, "step": 54 }, { "epoch": 0.021982414068745005, "grad_norm": 211.77074553660972, "learning_rate": 3.628870370401895e-06, "loss": 6.8308, "num_input_tokens_seen": 9501408, "step": 55 }, { "epoch": 0.021982414068745005, "loss": 6.57294225692749, "loss_ce": 2.8893485069274902, "loss_xval": 3.6875, "num_input_tokens_seen": 9501408, "step": 55 }, { "epoch": 0.02238209432454037, "grad_norm": 192.3988438237558, "learning_rate": 3.645187162072364e-06, "loss": 7.7137, "num_input_tokens_seen": 9674048, "step": 56 }, { "epoch": 0.02238209432454037, "loss": 7.810283660888672, "loss_ce": 2.751690149307251, "loss_xval": 5.0625, "num_input_tokens_seen": 9674048, "step": 56 }, { "epoch": 0.022781774580335732, "grad_norm": 659.8576946924561, "learning_rate": 3.6612151457552162e-06, "loss": 6.6531, "num_input_tokens_seen": 9847000, "step": 57 }, { "epoch": 0.022781774580335732, "loss": 5.987558364868164, "loss_ce": 2.647714614868164, "loss_xval": 3.34375, "num_input_tokens_seen": 9847000, "step": 57 }, { "epoch": 0.023181454836131096, "grad_norm": 664.7341132161741, "learning_rate": 3.6769643677189227e-06, "loss": 7.0286, "num_input_tokens_seen": 10019880, "step": 58 }, { "epoch": 0.023181454836131096, "loss": 7.144340515136719, "loss_ce": 2.720512628555298, "loss_xval": 4.4375, "num_input_tokens_seen": 10019880, "step": 58 }, { "epoch": 0.02358113509192646, "grad_norm": 212.02522543694363, "learning_rate": 3.692444358987175e-06, "loss": 7.0622, "num_input_tokens_seen": 10192832, "step": 59 }, { "epoch": 0.02358113509192646, "loss": 6.224069595336914, "loss_ce": 2.544382333755493, "loss_xval": 3.6875, "num_input_tokens_seen": 10192832, "step": 59 }, { "epoch": 0.023980815347721823, "grad_norm": 165.01048096537636, "learning_rate": 3.707664169981407e-06, "loss": 6.2673, "num_input_tokens_seen": 10365984, "step": 60 }, { "epoch": 0.023980815347721823, "loss": 6.832554817199707, "loss_ce": 2.652867555618286, "loss_xval": 4.1875, "num_input_tokens_seen": 10365984, "step": 60 }, { "epoch": 0.024380495603517186, "grad_norm": 483.00531361229116, "learning_rate": 3.7226324022999028e-06, "loss": 6.6506, "num_input_tokens_seen": 10538928, "step": 61 }, { "epoch": 0.024380495603517186, "loss": 7.050806045532227, "loss_ce": 2.4297122955322266, "loss_xval": 4.625, "num_input_tokens_seen": 10538928, "step": 61 }, { "epoch": 0.02478017585931255, "grad_norm": 572.7726598458358, "learning_rate": 3.737357237912862e-06, "loss": 6.9383, "num_input_tokens_seen": 10711776, "step": 62 }, { "epoch": 0.02478017585931255, "loss": 6.301916122436523, "loss_ce": 2.4972286224365234, "loss_xval": 3.8125, "num_input_tokens_seen": 10711776, "step": 62 }, { "epoch": 0.025179856115107913, "grad_norm": 199.8374697737695, "learning_rate": 3.751846466021567e-06, "loss": 6.6134, "num_input_tokens_seen": 10884920, "step": 63 }, { "epoch": 0.025179856115107913, "loss": 6.5081939697265625, "loss_ce": 2.4437410831451416, "loss_xval": 4.0625, "num_input_tokens_seen": 10884920, "step": 63 }, { "epoch": 0.025579536370903277, "grad_norm": 179.37701762975175, "learning_rate": 3.7661075078023677e-06, "loss": 6.0898, "num_input_tokens_seen": 11057376, "step": 64 }, { "epoch": 0.025579536370903277, "loss": 5.734729290008545, "loss_ce": 2.328479290008545, "loss_xval": 3.40625, "num_input_tokens_seen": 11057376, "step": 64 }, { "epoch": 0.02597921662669864, "grad_norm": 190.80330379121816, "learning_rate": 3.7801474392322986e-06, "loss": 5.5901, "num_input_tokens_seen": 11229816, "step": 65 }, { "epoch": 0.02597921662669864, "loss": 5.417404651641846, "loss_ce": 2.1576390266418457, "loss_xval": 3.265625, "num_input_tokens_seen": 11229816, "step": 65 }, { "epoch": 0.026378896882494004, "grad_norm": 375.5172678891961, "learning_rate": 3.793973012172059e-06, "loss": 5.7052, "num_input_tokens_seen": 11402552, "step": 66 }, { "epoch": 0.026378896882494004, "loss": 5.661341667175293, "loss_ce": 2.294153928756714, "loss_xval": 3.375, "num_input_tokens_seen": 11402552, "step": 66 }, { "epoch": 0.026778577138289367, "grad_norm": 331.5482112197834, "learning_rate": 3.807590673863634e-06, "loss": 5.6927, "num_input_tokens_seen": 11575584, "step": 67 }, { "epoch": 0.026778577138289367, "loss": 5.483163833618164, "loss_ce": 2.371835708618164, "loss_xval": 3.109375, "num_input_tokens_seen": 11575584, "step": 67 }, { "epoch": 0.02717825739408473, "grad_norm": 118.69682377879631, "learning_rate": 3.8210065849834735e-06, "loss": 5.503, "num_input_tokens_seen": 11744688, "step": 68 }, { "epoch": 0.02717825739408473, "loss": 4.790616989135742, "loss_ce": 2.1099531650543213, "loss_xval": 2.6875, "num_input_tokens_seen": 11744688, "step": 68 }, { "epoch": 0.027577937649880094, "grad_norm": 209.335686573374, "learning_rate": 3.834226636377774e-06, "loss": 4.9825, "num_input_tokens_seen": 11917488, "step": 69 }, { "epoch": 0.027577937649880094, "loss": 5.30062198638916, "loss_ce": 2.11995792388916, "loss_xval": 3.1875, "num_input_tokens_seen": 11917488, "step": 69 }, { "epoch": 0.027977617905675458, "grad_norm": 183.04183416930314, "learning_rate": 3.847256464593665e-06, "loss": 5.1639, "num_input_tokens_seen": 12090624, "step": 70 }, { "epoch": 0.027977617905675458, "loss": 5.195356369018555, "loss_ce": 2.033247232437134, "loss_xval": 3.15625, "num_input_tokens_seen": 12090624, "step": 70 }, { "epoch": 0.028377298161470825, "grad_norm": 258.67109460163607, "learning_rate": 3.860101466308762e-06, "loss": 5.6155, "num_input_tokens_seen": 12263440, "step": 71 }, { "epoch": 0.028377298161470825, "loss": 5.879308223724365, "loss_ce": 2.093175172805786, "loss_xval": 3.78125, "num_input_tokens_seen": 12263440, "step": 71 }, { "epoch": 0.02877697841726619, "grad_norm": 216.22409083276455, "learning_rate": 3.872766811751572e-06, "loss": 5.8319, "num_input_tokens_seen": 12433088, "step": 72 }, { "epoch": 0.02877697841726619, "loss": 5.86053466796875, "loss_ce": 2.049011468887329, "loss_xval": 3.8125, "num_input_tokens_seen": 12433088, "step": 72 }, { "epoch": 0.029176658673061552, "grad_norm": 224.64777003998142, "learning_rate": 3.8852574571962525e-06, "loss": 5.0139, "num_input_tokens_seen": 12602080, "step": 73 }, { "epoch": 0.029176658673061552, "loss": 5.154097557067871, "loss_ce": 1.9460902214050293, "loss_xval": 3.203125, "num_input_tokens_seen": 12602080, "step": 73 }, { "epoch": 0.029576338928856916, "grad_norm": 188.87320237119758, "learning_rate": 3.897578156607312e-06, "loss": 4.8818, "num_input_tokens_seen": 12772032, "step": 74 }, { "epoch": 0.029576338928856916, "loss": 4.89943790435791, "loss_ce": 2.05178165435791, "loss_xval": 2.84375, "num_input_tokens_seen": 12772032, "step": 74 }, { "epoch": 0.02997601918465228, "grad_norm": 239.00340379876698, "learning_rate": 3.9097334725027084e-06, "loss": 4.9708, "num_input_tokens_seen": 12944640, "step": 75 }, { "epoch": 0.02997601918465228, "loss": 5.254262924194336, "loss_ce": 1.935903549194336, "loss_xval": 3.3125, "num_input_tokens_seen": 12944640, "step": 75 }, { "epoch": 0.030375699440447643, "grad_norm": 175.64097497094016, "learning_rate": 3.921727786097478e-06, "loss": 4.8671, "num_input_tokens_seen": 13117608, "step": 76 }, { "epoch": 0.030375699440447643, "loss": 4.83261251449585, "loss_ce": 1.8843704462051392, "loss_xval": 2.953125, "num_input_tokens_seen": 13117608, "step": 76 }, { "epoch": 0.030775379696243006, "grad_norm": 171.78683132868542, "learning_rate": 3.933565306784317e-06, "loss": 4.9704, "num_input_tokens_seen": 13290680, "step": 77 }, { "epoch": 0.030775379696243006, "loss": 4.89105224609375, "loss_ce": 1.9115601778030396, "loss_xval": 2.984375, "num_input_tokens_seen": 13290680, "step": 77 }, { "epoch": 0.03117505995203837, "grad_norm": 179.14747541729278, "learning_rate": 3.945250081002463e-06, "loss": 4.7534, "num_input_tokens_seen": 13464144, "step": 78 }, { "epoch": 0.03117505995203837, "loss": 4.642127513885498, "loss_ce": 1.9326547384262085, "loss_xval": 2.703125, "num_input_tokens_seen": 13464144, "step": 78 }, { "epoch": 0.03157474020783373, "grad_norm": 137.37235263944487, "learning_rate": 3.956786000541636e-06, "loss": 4.239, "num_input_tokens_seen": 13637160, "step": 79 }, { "epoch": 0.03157474020783373, "loss": 4.785009860992432, "loss_ce": 1.7830569744110107, "loss_xval": 3.0, "num_input_tokens_seen": 13637160, "step": 79 }, { "epoch": 0.0319744204636291, "grad_norm": 186.29416857398633, "learning_rate": 3.96817681032367e-06, "loss": 5.1584, "num_input_tokens_seen": 13810040, "step": 80 }, { "epoch": 0.0319744204636291, "loss": 5.067818641662598, "loss_ce": 1.8412563800811768, "loss_xval": 3.21875, "num_input_tokens_seen": 13810040, "step": 80 }, { "epoch": 0.03237410071942446, "grad_norm": 260.14001469149053, "learning_rate": 3.979426115700776e-06, "loss": 4.9267, "num_input_tokens_seen": 13982936, "step": 81 }, { "epoch": 0.03237410071942446, "loss": 4.497675895690918, "loss_ce": 1.7955272197723389, "loss_xval": 2.703125, "num_input_tokens_seen": 13982936, "step": 81 }, { "epoch": 0.032773780975219824, "grad_norm": 216.40429405771704, "learning_rate": 3.990537389306027e-06, "loss": 5.5378, "num_input_tokens_seen": 14156248, "step": 82 }, { "epoch": 0.032773780975219824, "loss": 5.3804426193237305, "loss_ce": 1.7896225452423096, "loss_xval": 3.59375, "num_input_tokens_seen": 14156248, "step": 82 }, { "epoch": 0.03317346123101519, "grad_norm": 549.5248800116727, "learning_rate": 4.001513977488632e-06, "loss": 4.9184, "num_input_tokens_seen": 14329344, "step": 83 }, { "epoch": 0.03317346123101519, "loss": 4.968637466430664, "loss_ce": 1.767465353012085, "loss_xval": 3.203125, "num_input_tokens_seen": 14329344, "step": 83 }, { "epoch": 0.03357314148681055, "grad_norm": 194.0581771356025, "learning_rate": 4.012359106363829e-06, "loss": 4.3067, "num_input_tokens_seen": 14502072, "step": 84 }, { "epoch": 0.03357314148681055, "loss": 4.593915939331055, "loss_ce": 1.6945018768310547, "loss_xval": 2.90625, "num_input_tokens_seen": 14502072, "step": 84 }, { "epoch": 0.033972821742605915, "grad_norm": 516.6266208234828, "learning_rate": 4.023075887504775e-06, "loss": 4.4029, "num_input_tokens_seen": 14675008, "step": 85 }, { "epoch": 0.033972821742605915, "loss": 4.0735392570495605, "loss_ce": 1.7053749561309814, "loss_xval": 2.375, "num_input_tokens_seen": 14675008, "step": 85 }, { "epoch": 0.03437250199840128, "grad_norm": 374.0473462885556, "learning_rate": 4.033667323301552e-06, "loss": 4.1857, "num_input_tokens_seen": 14847768, "step": 86 }, { "epoch": 0.03437250199840128, "loss": 4.331335067749023, "loss_ce": 1.7405146360397339, "loss_xval": 2.59375, "num_input_tokens_seen": 14847768, "step": 86 }, { "epoch": 0.03477218225419664, "grad_norm": 156.7332678793166, "learning_rate": 4.044136312010388e-06, "loss": 4.2331, "num_input_tokens_seen": 15020560, "step": 87 }, { "epoch": 0.03477218225419664, "loss": 3.9675381183624268, "loss_ce": 1.6716396808624268, "loss_xval": 2.296875, "num_input_tokens_seen": 15020560, "step": 87 }, { "epoch": 0.035171862509992005, "grad_norm": 319.8318187587833, "learning_rate": 4.0544856525143226e-06, "loss": 4.8582, "num_input_tokens_seen": 15193280, "step": 88 }, { "epoch": 0.035171862509992005, "loss": 4.755683422088623, "loss_ce": 1.5857617855072021, "loss_xval": 3.171875, "num_input_tokens_seen": 15193280, "step": 88 }, { "epoch": 0.03557154276578737, "grad_norm": 644.9710013537776, "learning_rate": 4.064718048814889e-06, "loss": 4.9109, "num_input_tokens_seen": 15366384, "step": 89 }, { "epoch": 0.03557154276578737, "loss": 4.91407585144043, "loss_ce": 1.6425917148590088, "loss_xval": 3.265625, "num_input_tokens_seen": 15366384, "step": 89 }, { "epoch": 0.03597122302158273, "grad_norm": 246.6542571417044, "learning_rate": 4.074836114272873e-06, "loss": 4.3901, "num_input_tokens_seen": 15539432, "step": 90 }, { "epoch": 0.03597122302158273, "loss": 4.276044845581055, "loss_ce": 1.6398143768310547, "loss_xval": 2.640625, "num_input_tokens_seen": 15539432, "step": 90 }, { "epoch": 0.036370903277378096, "grad_norm": 455.99645617059537, "learning_rate": 4.08484237561472e-06, "loss": 4.7347, "num_input_tokens_seen": 15712712, "step": 91 }, { "epoch": 0.036370903277378096, "loss": 5.283034324645996, "loss_ce": 1.6326435804367065, "loss_xval": 3.65625, "num_input_tokens_seen": 15712712, "step": 91 }, { "epoch": 0.03677058353317346, "grad_norm": 196.25249760645295, "learning_rate": 4.094739276720037e-06, "loss": 4.3173, "num_input_tokens_seen": 15885664, "step": 92 }, { "epoch": 0.03677058353317346, "loss": 4.700507640838623, "loss_ce": 1.668281078338623, "loss_xval": 3.03125, "num_input_tokens_seen": 15885664, "step": 92 }, { "epoch": 0.03717026378896882, "grad_norm": 285.7770874903587, "learning_rate": 4.1045291822043285e-06, "loss": 4.0221, "num_input_tokens_seen": 16058736, "step": 93 }, { "epoch": 0.03717026378896882, "loss": 4.253849983215332, "loss_ce": 1.6098066568374634, "loss_xval": 2.640625, "num_input_tokens_seen": 16058736, "step": 93 }, { "epoch": 0.037569944044764186, "grad_norm": 204.95844707804565, "learning_rate": 4.1142143808101425e-06, "loss": 4.2728, "num_input_tokens_seen": 16231688, "step": 94 }, { "epoch": 0.037569944044764186, "loss": 4.915648460388184, "loss_ce": 1.6168203353881836, "loss_xval": 3.296875, "num_input_tokens_seen": 16231688, "step": 94 }, { "epoch": 0.03796962430055955, "grad_norm": 156.8292452749296, "learning_rate": 4.123797088618779e-06, "loss": 4.1215, "num_input_tokens_seen": 16404472, "step": 95 }, { "epoch": 0.03796962430055955, "loss": 4.639373779296875, "loss_ce": 1.5382994413375854, "loss_xval": 3.09375, "num_input_tokens_seen": 16404472, "step": 95 }, { "epoch": 0.03836930455635491, "grad_norm": 131.9876314168009, "learning_rate": 4.133279452093834e-06, "loss": 4.1419, "num_input_tokens_seen": 16577504, "step": 96 }, { "epoch": 0.03836930455635491, "loss": 4.308347702026367, "loss_ce": 1.5310044288635254, "loss_xval": 2.78125, "num_input_tokens_seen": 16577504, "step": 96 }, { "epoch": 0.03876898481215028, "grad_norm": 110.37162259406017, "learning_rate": 4.142663550967035e-06, "loss": 3.6569, "num_input_tokens_seen": 16750808, "step": 97 }, { "epoch": 0.03876898481215028, "loss": 3.53420352935791, "loss_ce": 1.5429925918579102, "loss_xval": 1.9921875, "num_input_tokens_seen": 16750808, "step": 97 }, { "epoch": 0.03916866506794564, "grad_norm": 168.5095133634534, "learning_rate": 4.151951400976087e-06, "loss": 3.7882, "num_input_tokens_seen": 16923992, "step": 98 }, { "epoch": 0.03916866506794564, "loss": 3.392901659011841, "loss_ce": 1.4983705282211304, "loss_xval": 1.890625, "num_input_tokens_seen": 16923992, "step": 98 }, { "epoch": 0.039568345323741004, "grad_norm": 114.59340514251383, "learning_rate": 4.161144956463525e-06, "loss": 3.8205, "num_input_tokens_seen": 17096680, "step": 99 }, { "epoch": 0.039568345323741004, "loss": 3.6567561626434326, "loss_ce": 1.485857605934143, "loss_xval": 2.171875, "num_input_tokens_seen": 17096680, "step": 99 }, { "epoch": 0.03996802557953637, "grad_norm": 203.8901770985848, "learning_rate": 4.170246112844972e-06, "loss": 3.8433, "num_input_tokens_seen": 17269216, "step": 100 }, { "epoch": 0.03996802557953637, "loss": 3.3409595489501953, "loss_ce": 1.4923267364501953, "loss_xval": 1.8515625, "num_input_tokens_seen": 17269216, "step": 100 }, { "epoch": 0.04036770583533174, "grad_norm": 124.1343078329466, "learning_rate": 4.179256708954579e-06, "loss": 3.6334, "num_input_tokens_seen": 17442352, "step": 101 }, { "epoch": 0.04036770583533174, "loss": 3.501720666885376, "loss_ce": 1.488048791885376, "loss_xval": 2.015625, "num_input_tokens_seen": 17442352, "step": 101 }, { "epoch": 0.0407673860911271, "grad_norm": 196.9699571170823, "learning_rate": 4.188178529274939e-06, "loss": 3.2125, "num_input_tokens_seen": 17615376, "step": 102 }, { "epoch": 0.0407673860911271, "loss": 3.121706008911133, "loss_ce": 1.4449481964111328, "loss_xval": 1.6796875, "num_input_tokens_seen": 17615376, "step": 102 }, { "epoch": 0.041167066346922465, "grad_norm": 119.54443363473506, "learning_rate": 4.197013306058203e-06, "loss": 3.4917, "num_input_tokens_seen": 17787984, "step": 103 }, { "epoch": 0.041167066346922465, "loss": 3.2809221744537354, "loss_ce": 1.4230120182037354, "loss_xval": 1.859375, "num_input_tokens_seen": 17787984, "step": 103 }, { "epoch": 0.04156674660271783, "grad_norm": 246.81496667478712, "learning_rate": 4.205762721344725e-06, "loss": 3.7306, "num_input_tokens_seen": 17961048, "step": 104 }, { "epoch": 0.04156674660271783, "loss": 3.0534510612487793, "loss_ce": 1.4357751607894897, "loss_xval": 1.6171875, "num_input_tokens_seen": 17961048, "step": 104 }, { "epoch": 0.04196642685851319, "grad_norm": 140.73203674853707, "learning_rate": 4.21442840888513e-06, "loss": 3.6675, "num_input_tokens_seen": 18133960, "step": 105 }, { "epoch": 0.04196642685851319, "loss": 3.460153102874756, "loss_ce": 1.4249968528747559, "loss_xval": 2.03125, "num_input_tokens_seen": 18133960, "step": 105 }, { "epoch": 0.042366107114308556, "grad_norm": 133.38932388420073, "learning_rate": 4.223011955971264e-06, "loss": 3.7657, "num_input_tokens_seen": 18306920, "step": 106 }, { "epoch": 0.042366107114308556, "loss": 3.373945951461792, "loss_ce": 1.422774076461792, "loss_xval": 1.953125, "num_input_tokens_seen": 18306920, "step": 106 }, { "epoch": 0.04276578737010392, "grad_norm": 137.17503761831742, "learning_rate": 4.231514905181194e-06, "loss": 3.9627, "num_input_tokens_seen": 18479872, "step": 107 }, { "epoch": 0.04276578737010392, "loss": 3.5468502044677734, "loss_ce": 1.4228266477584839, "loss_xval": 2.125, "num_input_tokens_seen": 18479872, "step": 107 }, { "epoch": 0.04316546762589928, "grad_norm": 168.70624239265484, "learning_rate": 4.239938756043038e-06, "loss": 3.0579, "num_input_tokens_seen": 18653056, "step": 108 }, { "epoch": 0.04316546762589928, "loss": 2.9931588172912598, "loss_ce": 1.4638619422912598, "loss_xval": 1.53125, "num_input_tokens_seen": 18653056, "step": 108 }, { "epoch": 0.043565147881694646, "grad_norm": 116.92635982536697, "learning_rate": 4.248284966622114e-06, "loss": 3.3453, "num_input_tokens_seen": 18825792, "step": 109 }, { "epoch": 0.043565147881694646, "loss": 3.643871307373047, "loss_ce": 1.4241447448730469, "loss_xval": 2.21875, "num_input_tokens_seen": 18825792, "step": 109 }, { "epoch": 0.04396482813749001, "grad_norm": 211.32307173730896, "learning_rate": 4.256554955035623e-06, "loss": 3.7173, "num_input_tokens_seen": 18998800, "step": 110 }, { "epoch": 0.04396482813749001, "loss": 3.9510254859924316, "loss_ce": 1.4148926734924316, "loss_xval": 2.53125, "num_input_tokens_seen": 18998800, "step": 110 }, { "epoch": 0.04436450839328537, "grad_norm": 125.8640650841611, "learning_rate": 4.264750100898777e-06, "loss": 3.5679, "num_input_tokens_seen": 19171832, "step": 111 }, { "epoch": 0.04436450839328537, "loss": 3.722196102142334, "loss_ce": 1.392117977142334, "loss_xval": 2.328125, "num_input_tokens_seen": 19171832, "step": 111 }, { "epoch": 0.04476418864908074, "grad_norm": 159.67886896593234, "learning_rate": 4.272871746706091e-06, "loss": 3.2583, "num_input_tokens_seen": 19344784, "step": 112 }, { "epoch": 0.04476418864908074, "loss": 3.2728888988494873, "loss_ce": 1.4017952680587769, "loss_xval": 1.875, "num_input_tokens_seen": 19344784, "step": 112 }, { "epoch": 0.0451638689048761, "grad_norm": 158.9863093245826, "learning_rate": 4.280921199151268e-06, "loss": 3.9811, "num_input_tokens_seen": 19517688, "step": 113 }, { "epoch": 0.0451638689048761, "loss": 4.066771030426025, "loss_ce": 1.3548572063446045, "loss_xval": 2.71875, "num_input_tokens_seen": 19517688, "step": 113 }, { "epoch": 0.045563549160671464, "grad_norm": 132.59333871837742, "learning_rate": 4.288899730388944e-06, "loss": 3.2045, "num_input_tokens_seen": 19690880, "step": 114 }, { "epoch": 0.045563549160671464, "loss": 3.301539659500122, "loss_ce": 1.330348253250122, "loss_xval": 1.96875, "num_input_tokens_seen": 19690880, "step": 114 }, { "epoch": 0.04596322941646683, "grad_norm": 156.7543956695616, "learning_rate": 4.296808579241338e-06, "loss": 3.0619, "num_input_tokens_seen": 19863616, "step": 115 }, { "epoch": 0.04596322941646683, "loss": 3.2327518463134766, "loss_ce": 1.3567752838134766, "loss_xval": 1.875, "num_input_tokens_seen": 19863616, "step": 115 }, { "epoch": 0.04636290967226219, "grad_norm": 137.7793568340198, "learning_rate": 4.304648952352651e-06, "loss": 3.3103, "num_input_tokens_seen": 20036800, "step": 116 }, { "epoch": 0.04636290967226219, "loss": 3.387848138809204, "loss_ce": 1.331695795059204, "loss_xval": 2.0625, "num_input_tokens_seen": 20036800, "step": 116 }, { "epoch": 0.046762589928057555, "grad_norm": 170.99058754674215, "learning_rate": 4.312422025293929e-06, "loss": 3.5094, "num_input_tokens_seen": 20209848, "step": 117 }, { "epoch": 0.046762589928057555, "loss": 3.220695972442627, "loss_ce": 1.355461597442627, "loss_xval": 1.8671875, "num_input_tokens_seen": 20209848, "step": 117 }, { "epoch": 0.04716227018385292, "grad_norm": 97.06419206586995, "learning_rate": 4.320128943620903e-06, "loss": 2.9942, "num_input_tokens_seen": 20382728, "step": 118 }, { "epoch": 0.04716227018385292, "loss": 3.0818350315093994, "loss_ce": 1.3162100315093994, "loss_xval": 1.765625, "num_input_tokens_seen": 20382728, "step": 118 }, { "epoch": 0.04756195043964828, "grad_norm": 338.7848439976261, "learning_rate": 4.327770823887197e-06, "loss": 3.9238, "num_input_tokens_seen": 20555712, "step": 119 }, { "epoch": 0.04756195043964828, "loss": 4.059802055358887, "loss_ce": 1.2927122116088867, "loss_xval": 2.765625, "num_input_tokens_seen": 20555712, "step": 119 }, { "epoch": 0.047961630695443645, "grad_norm": 427.11268613455303, "learning_rate": 4.335348754615135e-06, "loss": 3.0822, "num_input_tokens_seen": 20728280, "step": 120 }, { "epoch": 0.047961630695443645, "loss": 3.1181368827819824, "loss_ce": 1.2724335193634033, "loss_xval": 1.84375, "num_input_tokens_seen": 20728280, "step": 120 }, { "epoch": 0.04836131095123901, "grad_norm": 115.90695385095029, "learning_rate": 4.342863797226275e-06, "loss": 3.0454, "num_input_tokens_seen": 20901240, "step": 121 }, { "epoch": 0.04836131095123901, "loss": 2.844395875930786, "loss_ce": 1.2389271259307861, "loss_xval": 1.609375, "num_input_tokens_seen": 20901240, "step": 121 }, { "epoch": 0.04876099120703437, "grad_norm": 266.16344074709775, "learning_rate": 4.350316986933631e-06, "loss": 3.8283, "num_input_tokens_seen": 21074032, "step": 122 }, { "epoch": 0.04876099120703437, "loss": 3.777801752090454, "loss_ce": 1.2328799962997437, "loss_xval": 2.546875, "num_input_tokens_seen": 21074032, "step": 122 }, { "epoch": 0.049160671462829736, "grad_norm": 213.32519005243836, "learning_rate": 4.3577093335974925e-06, "loss": 3.1261, "num_input_tokens_seen": 21247224, "step": 123 }, { "epoch": 0.049160671462829736, "loss": 3.3567044734954834, "loss_ce": 1.2375637292861938, "loss_xval": 2.125, "num_input_tokens_seen": 21247224, "step": 123 }, { "epoch": 0.0495603517186251, "grad_norm": 101.5577996272501, "learning_rate": 4.36504182254659e-06, "loss": 3.0516, "num_input_tokens_seen": 21420088, "step": 124 }, { "epoch": 0.0495603517186251, "loss": 3.4608333110809326, "loss_ce": 1.2235286235809326, "loss_xval": 2.234375, "num_input_tokens_seen": 21420088, "step": 124 }, { "epoch": 0.04996003197442046, "grad_norm": 198.95971382452228, "learning_rate": 4.3723154153662725e-06, "loss": 3.2876, "num_input_tokens_seen": 21593120, "step": 125 }, { "epoch": 0.04996003197442046, "loss": 3.0590901374816895, "loss_ce": 1.2397539615631104, "loss_xval": 1.8203125, "num_input_tokens_seen": 21593120, "step": 125 }, { "epoch": 0.050359712230215826, "grad_norm": 166.57226050726882, "learning_rate": 4.379531050655295e-06, "loss": 2.9623, "num_input_tokens_seen": 21765976, "step": 126 }, { "epoch": 0.050359712230215826, "loss": 2.768925666809082, "loss_ce": 1.2347460985183716, "loss_xval": 1.53125, "num_input_tokens_seen": 21765976, "step": 126 }, { "epoch": 0.05075939248601119, "grad_norm": 217.73765671883095, "learning_rate": 4.386689644752683e-06, "loss": 3.1025, "num_input_tokens_seen": 21938808, "step": 127 }, { "epoch": 0.05075939248601119, "loss": 2.747063159942627, "loss_ce": 1.227287769317627, "loss_xval": 1.5234375, "num_input_tokens_seen": 21938808, "step": 127 }, { "epoch": 0.051159072741806554, "grad_norm": 263.5829876068027, "learning_rate": 4.3937920924360965e-06, "loss": 3.3341, "num_input_tokens_seen": 22111176, "step": 128 }, { "epoch": 0.051159072741806554, "loss": 2.9254727363586426, "loss_ce": 1.2174649238586426, "loss_xval": 1.7109375, "num_input_tokens_seen": 22111176, "step": 128 }, { "epoch": 0.05155875299760192, "grad_norm": 279.6834850427316, "learning_rate": 4.4008392675930185e-06, "loss": 2.9818, "num_input_tokens_seen": 22284392, "step": 129 }, { "epoch": 0.05155875299760192, "loss": 2.40582275390625, "loss_ce": 1.191955327987671, "loss_xval": 1.2109375, "num_input_tokens_seen": 22284392, "step": 129 }, { "epoch": 0.05195843325339728, "grad_norm": 273.57321882538974, "learning_rate": 4.407832023866027e-06, "loss": 3.3854, "num_input_tokens_seen": 22457544, "step": 130 }, { "epoch": 0.05195843325339728, "loss": 3.0595662593841553, "loss_ce": 1.2060506343841553, "loss_xval": 1.8515625, "num_input_tokens_seen": 22457544, "step": 130 }, { "epoch": 0.052358113509192644, "grad_norm": 344.0145083356384, "learning_rate": 4.414771195273343e-06, "loss": 3.6465, "num_input_tokens_seen": 22630440, "step": 131 }, { "epoch": 0.052358113509192644, "loss": 4.122766971588135, "loss_ce": 1.1769661903381348, "loss_xval": 2.953125, "num_input_tokens_seen": 22630440, "step": 131 }, { "epoch": 0.05275779376498801, "grad_norm": 255.65466358822889, "learning_rate": 4.421657596805787e-06, "loss": 3.421, "num_input_tokens_seen": 22803176, "step": 132 }, { "epoch": 0.05275779376498801, "loss": 3.9519777297973633, "loss_ce": 1.2005128860473633, "loss_xval": 2.75, "num_input_tokens_seen": 22803176, "step": 132 }, { "epoch": 0.05315747402078337, "grad_norm": 267.6338664362621, "learning_rate": 4.428492025001201e-06, "loss": 3.2026, "num_input_tokens_seen": 22976304, "step": 133 }, { "epoch": 0.05315747402078337, "loss": 3.3537511825561523, "loss_ce": 1.1672275066375732, "loss_xval": 2.1875, "num_input_tokens_seen": 22976304, "step": 133 }, { "epoch": 0.053557154276578735, "grad_norm": 131.21776577343127, "learning_rate": 4.435275258497362e-06, "loss": 2.9131, "num_input_tokens_seen": 23149344, "step": 134 }, { "epoch": 0.053557154276578735, "loss": 3.031949520111084, "loss_ce": 1.194547176361084, "loss_xval": 1.8359375, "num_input_tokens_seen": 23149344, "step": 134 }, { "epoch": 0.0539568345323741, "grad_norm": 514.2271793073204, "learning_rate": 4.442008058564339e-06, "loss": 3.4785, "num_input_tokens_seen": 23322352, "step": 135 }, { "epoch": 0.0539568345323741, "loss": 3.484752655029297, "loss_ce": 1.1751822233200073, "loss_xval": 2.3125, "num_input_tokens_seen": 23322352, "step": 135 }, { "epoch": 0.05435651478816946, "grad_norm": 163.1858947105468, "learning_rate": 4.448691169617202e-06, "loss": 3.0443, "num_input_tokens_seen": 23495392, "step": 136 }, { "epoch": 0.05435651478816946, "loss": 2.978681802749634, "loss_ce": 1.1910841464996338, "loss_xval": 1.7890625, "num_input_tokens_seen": 23495392, "step": 136 }, { "epoch": 0.054756195043964825, "grad_norm": 453.2241878703702, "learning_rate": 4.455325319709954e-06, "loss": 3.0782, "num_input_tokens_seen": 23667952, "step": 137 }, { "epoch": 0.054756195043964825, "loss": 3.2870519161224365, "loss_ce": 1.1347081661224365, "loss_xval": 2.15625, "num_input_tokens_seen": 23667952, "step": 137 }, { "epoch": 0.05515587529976019, "grad_norm": 130.18513835393406, "learning_rate": 4.461911221011503e-06, "loss": 2.5414, "num_input_tokens_seen": 23840968, "step": 138 }, { "epoch": 0.05515587529976019, "loss": 2.4765074253082275, "loss_ce": 1.1742613315582275, "loss_xval": 1.3046875, "num_input_tokens_seen": 23840968, "step": 138 }, { "epoch": 0.05555555555555555, "grad_norm": 458.0524797809028, "learning_rate": 4.468449570264441e-06, "loss": 3.1261, "num_input_tokens_seen": 24013976, "step": 139 }, { "epoch": 0.05555555555555555, "loss": 3.3846635818481445, "loss_ce": 1.151753306388855, "loss_xval": 2.234375, "num_input_tokens_seen": 24013976, "step": 139 }, { "epoch": 0.055955235811350916, "grad_norm": 128.72335328013372, "learning_rate": 4.474941049227392e-06, "loss": 3.1837, "num_input_tokens_seen": 24186640, "step": 140 }, { "epoch": 0.055955235811350916, "loss": 2.599513053894043, "loss_ce": 1.1634780168533325, "loss_xval": 1.4375, "num_input_tokens_seen": 24186640, "step": 140 }, { "epoch": 0.05635491606714628, "grad_norm": 426.5509683313082, "learning_rate": 4.481386325101608e-06, "loss": 3.1009, "num_input_tokens_seen": 24360088, "step": 141 }, { "epoch": 0.05635491606714628, "loss": 2.746947765350342, "loss_ce": 1.1336662769317627, "loss_xval": 1.609375, "num_input_tokens_seen": 24360088, "step": 141 }, { "epoch": 0.05675459632294165, "grad_norm": 139.1436086661555, "learning_rate": 4.487786050942491e-06, "loss": 2.714, "num_input_tokens_seen": 24533144, "step": 142 }, { "epoch": 0.05675459632294165, "loss": 2.8101062774658203, "loss_ce": 1.1851062774658203, "loss_xval": 1.625, "num_input_tokens_seen": 24533144, "step": 142 }, { "epoch": 0.057154276578737014, "grad_norm": 259.8648252836964, "learning_rate": 4.494140866056678e-06, "loss": 3.2545, "num_input_tokens_seen": 24705472, "step": 143 }, { "epoch": 0.057154276578737014, "loss": 3.093147039413452, "loss_ce": 1.1615064144134521, "loss_xval": 1.9296875, "num_input_tokens_seen": 24705472, "step": 143 }, { "epoch": 0.05755395683453238, "grad_norm": 188.67765026700243, "learning_rate": 4.5004513963853e-06, "loss": 3.1569, "num_input_tokens_seen": 24878424, "step": 144 }, { "epoch": 0.05755395683453238, "loss": 3.5168557167053223, "loss_ce": 1.1731057167053223, "loss_xval": 2.34375, "num_input_tokens_seen": 24878424, "step": 144 }, { "epoch": 0.05795363709032774, "grad_norm": 128.55755169702456, "learning_rate": 4.506718254873952e-06, "loss": 2.8533, "num_input_tokens_seen": 25051392, "step": 145 }, { "epoch": 0.05795363709032774, "loss": 3.2004616260528564, "loss_ce": 1.1936256885528564, "loss_xval": 2.0, "num_input_tokens_seen": 25051392, "step": 145 }, { "epoch": 0.058353317346123104, "grad_norm": 178.2596345349547, "learning_rate": 4.5129420418299804e-06, "loss": 2.5044, "num_input_tokens_seen": 25221360, "step": 146 }, { "epoch": 0.058353317346123104, "loss": 2.6224756240844727, "loss_ce": 1.1561671495437622, "loss_xval": 1.46875, "num_input_tokens_seen": 25221360, "step": 146 }, { "epoch": 0.05875299760191847, "grad_norm": 109.70621446733796, "learning_rate": 4.519123345267552e-06, "loss": 2.6378, "num_input_tokens_seen": 25394160, "step": 147 }, { "epoch": 0.05875299760191847, "loss": 3.004621982574463, "loss_ce": 1.1469557285308838, "loss_xval": 1.859375, "num_input_tokens_seen": 25394160, "step": 147 }, { "epoch": 0.05915267785771383, "grad_norm": 145.08135486046965, "learning_rate": 4.52526274124104e-06, "loss": 2.9199, "num_input_tokens_seen": 25566928, "step": 148 }, { "epoch": 0.05915267785771383, "loss": 3.229592800140381, "loss_ce": 1.1158232688903809, "loss_xval": 2.109375, "num_input_tokens_seen": 25566928, "step": 148 }, { "epoch": 0.059552358113509195, "grad_norm": 92.14531111691825, "learning_rate": 4.5313607941671774e-06, "loss": 2.3757, "num_input_tokens_seen": 25739848, "step": 149 }, { "epoch": 0.059552358113509195, "loss": 2.58808970451355, "loss_ce": 1.1012732982635498, "loss_xval": 1.484375, "num_input_tokens_seen": 25739848, "step": 149 }, { "epoch": 0.05995203836930456, "grad_norm": 218.825158302032, "learning_rate": 4.537418057136436e-06, "loss": 3.0667, "num_input_tokens_seen": 25913056, "step": 150 }, { "epoch": 0.05995203836930456, "loss": 2.9010426998138428, "loss_ce": 1.0685231685638428, "loss_xval": 1.8359375, "num_input_tokens_seen": 25913056, "step": 150 }, { "epoch": 0.06035171862509992, "grad_norm": 192.8732197306815, "learning_rate": 4.54343507221407e-06, "loss": 2.8767, "num_input_tokens_seen": 26086456, "step": 151 }, { "epoch": 0.06035171862509992, "loss": 2.9303150177001953, "loss_ce": 1.0826586484909058, "loss_xval": 1.84375, "num_input_tokens_seen": 26086456, "step": 151 }, { "epoch": 0.060751398880895285, "grad_norm": 116.65432798476725, "learning_rate": 4.549412370731207e-06, "loss": 2.8269, "num_input_tokens_seen": 26259336, "step": 152 }, { "epoch": 0.060751398880895285, "loss": 2.606137752532959, "loss_ce": 1.0680519342422485, "loss_xval": 1.5390625, "num_input_tokens_seen": 26259336, "step": 152 }, { "epoch": 0.06115107913669065, "grad_norm": 129.79247278384622, "learning_rate": 4.555350473566405e-06, "loss": 2.6485, "num_input_tokens_seen": 26432320, "step": 153 }, { "epoch": 0.06115107913669065, "loss": 2.66135573387146, "loss_ce": 1.0431914329528809, "loss_xval": 1.6171875, "num_input_tokens_seen": 26432320, "step": 153 }, { "epoch": 0.06155075939248601, "grad_norm": 255.33644741357125, "learning_rate": 4.561249891418045e-06, "loss": 2.6596, "num_input_tokens_seen": 26605232, "step": 154 }, { "epoch": 0.06155075939248601, "loss": 2.2152175903320312, "loss_ce": 1.0363845825195312, "loss_xval": 1.1796875, "num_input_tokens_seen": 26605232, "step": 154 }, { "epoch": 0.061950439648281376, "grad_norm": 326.2132881570226, "learning_rate": 4.567111125067892e-06, "loss": 3.1065, "num_input_tokens_seen": 26778160, "step": 155 }, { "epoch": 0.061950439648281376, "loss": 2.977107524871826, "loss_ce": 1.0156819820404053, "loss_xval": 1.9609375, "num_input_tokens_seen": 26778160, "step": 155 }, { "epoch": 0.06235011990407674, "grad_norm": 178.37054244023707, "learning_rate": 4.572934665636191e-06, "loss": 2.8082, "num_input_tokens_seen": 26951312, "step": 156 }, { "epoch": 0.06235011990407674, "loss": 3.033820390701294, "loss_ce": 0.995246171951294, "loss_xval": 2.03125, "num_input_tokens_seen": 26951312, "step": 156 }, { "epoch": 0.0627498001598721, "grad_norm": 240.25091369114222, "learning_rate": 4.578720994828615e-06, "loss": 2.8172, "num_input_tokens_seen": 27124296, "step": 157 }, { "epoch": 0.0627498001598721, "loss": 2.7480130195617676, "loss_ce": 0.9975247383117676, "loss_xval": 1.75, "num_input_tokens_seen": 27124296, "step": 157 }, { "epoch": 0.06314948041566747, "grad_norm": 299.6531392086124, "learning_rate": 4.584470585175365e-06, "loss": 3.137, "num_input_tokens_seen": 27297296, "step": 158 }, { "epoch": 0.06314948041566747, "loss": 3.1829733848571777, "loss_ce": 0.9749656915664673, "loss_xval": 2.203125, "num_input_tokens_seen": 27297296, "step": 158 }, { "epoch": 0.06354916067146282, "grad_norm": 236.05049498447235, "learning_rate": 4.59018390026273e-06, "loss": 2.7418, "num_input_tokens_seen": 27470144, "step": 159 }, { "epoch": 0.06354916067146282, "loss": 2.5435566902160645, "loss_ce": 0.991310715675354, "loss_xval": 1.5546875, "num_input_tokens_seen": 27470144, "step": 159 }, { "epoch": 0.0639488409272582, "grad_norm": 245.1500933668583, "learning_rate": 4.595861394957398e-06, "loss": 2.7991, "num_input_tokens_seen": 27643168, "step": 160 }, { "epoch": 0.0639488409272582, "loss": 2.7963478565216064, "loss_ce": 1.0024025440216064, "loss_xval": 1.796875, "num_input_tokens_seen": 27643168, "step": 160 }, { "epoch": 0.06434852118305355, "grad_norm": 269.6347230509653, "learning_rate": 4.601503515623759e-06, "loss": 2.5151, "num_input_tokens_seen": 27816264, "step": 161 }, { "epoch": 0.06434852118305355, "loss": 2.5006017684936523, "loss_ce": 1.0118319988250732, "loss_xval": 1.4921875, "num_input_tokens_seen": 27816264, "step": 161 }, { "epoch": 0.06474820143884892, "grad_norm": 375.92850399502834, "learning_rate": 4.607110700334503e-06, "loss": 2.925, "num_input_tokens_seen": 27988768, "step": 162 }, { "epoch": 0.06474820143884892, "loss": 3.5245378017425537, "loss_ce": 0.9766863584518433, "loss_xval": 2.546875, "num_input_tokens_seen": 27988768, "step": 162 }, { "epoch": 0.06514788169464429, "grad_norm": 126.56545474069529, "learning_rate": 4.6126833790747175e-06, "loss": 3.1895, "num_input_tokens_seen": 28161192, "step": 163 }, { "epoch": 0.06514788169464429, "loss": 2.75604248046875, "loss_ce": 0.9889528155326843, "loss_xval": 1.765625, "num_input_tokens_seen": 28161192, "step": 163 }, { "epoch": 0.06554756195043965, "grad_norm": 293.2841730627708, "learning_rate": 4.618221973939755e-06, "loss": 3.0304, "num_input_tokens_seen": 28334152, "step": 164 }, { "epoch": 0.06554756195043965, "loss": 3.074063777923584, "loss_ce": 1.013517141342163, "loss_xval": 2.0625, "num_input_tokens_seen": 28334152, "step": 164 }, { "epoch": 0.06594724220623502, "grad_norm": 93.69476386555672, "learning_rate": 4.623726899327088e-06, "loss": 2.5372, "num_input_tokens_seen": 28507128, "step": 165 }, { "epoch": 0.06594724220623502, "loss": 2.73991322517395, "loss_ce": 0.991866409778595, "loss_xval": 1.75, "num_input_tokens_seen": 28507128, "step": 165 }, { "epoch": 0.06634692246203037, "grad_norm": 354.54679439550654, "learning_rate": 4.629198562122361e-06, "loss": 3.1533, "num_input_tokens_seen": 28679696, "step": 166 }, { "epoch": 0.06634692246203037, "loss": 3.6342062950134277, "loss_ce": 0.9945579171180725, "loss_xval": 2.640625, "num_input_tokens_seen": 28679696, "step": 166 }, { "epoch": 0.06674660271782575, "grad_norm": 142.92356385134568, "learning_rate": 4.63463736187985e-06, "loss": 2.6586, "num_input_tokens_seen": 28852824, "step": 167 }, { "epoch": 0.06674660271782575, "loss": 2.430410623550415, "loss_ce": 0.9899808764457703, "loss_xval": 1.4375, "num_input_tokens_seen": 28852824, "step": 167 }, { "epoch": 0.0671462829736211, "grad_norm": 454.4623707108821, "learning_rate": 4.640043690997557e-06, "loss": 3.096, "num_input_tokens_seen": 29025240, "step": 168 }, { "epoch": 0.0671462829736211, "loss": 3.01880145072937, "loss_ce": 0.9689967632293701, "loss_xval": 2.046875, "num_input_tokens_seen": 29025240, "step": 168 }, { "epoch": 0.06754596322941647, "grad_norm": 157.19975275134112, "learning_rate": 4.645417934887083e-06, "loss": 2.986, "num_input_tokens_seen": 29198016, "step": 169 }, { "epoch": 0.06754596322941647, "loss": 3.3017961978912354, "loss_ce": 0.9551164507865906, "loss_xval": 2.34375, "num_input_tokens_seen": 29198016, "step": 169 }, { "epoch": 0.06794564348521183, "grad_norm": 550.0943331937483, "learning_rate": 4.650760472138503e-06, "loss": 3.3266, "num_input_tokens_seen": 29371016, "step": 170 }, { "epoch": 0.06794564348521183, "loss": 3.0751547813415527, "loss_ce": 0.9765218496322632, "loss_xval": 2.09375, "num_input_tokens_seen": 29371016, "step": 170 }, { "epoch": 0.0683453237410072, "grad_norm": 156.04590864267016, "learning_rate": 4.65607167468041e-06, "loss": 2.5659, "num_input_tokens_seen": 29544104, "step": 171 }, { "epoch": 0.0683453237410072, "loss": 2.807107925415039, "loss_ce": 0.9848423004150391, "loss_xval": 1.8203125, "num_input_tokens_seen": 29544104, "step": 171 }, { "epoch": 0.06874500399680256, "grad_norm": 387.04024961883766, "learning_rate": 4.66135190793528e-06, "loss": 2.8385, "num_input_tokens_seen": 29717344, "step": 172 }, { "epoch": 0.06874500399680256, "loss": 2.743061065673828, "loss_ce": 0.9437446594238281, "loss_xval": 1.796875, "num_input_tokens_seen": 29717344, "step": 172 }, { "epoch": 0.06914468425259793, "grad_norm": 352.2869774460929, "learning_rate": 4.666601530970348e-06, "loss": 2.9918, "num_input_tokens_seen": 29890248, "step": 173 }, { "epoch": 0.06914468425259793, "loss": 3.1348328590393066, "loss_ce": 1.016424536705017, "loss_xval": 2.125, "num_input_tokens_seen": 29890248, "step": 173 }, { "epoch": 0.06954436450839328, "grad_norm": 373.6882053758428, "learning_rate": 4.671820896644117e-06, "loss": 2.6598, "num_input_tokens_seen": 30063288, "step": 174 }, { "epoch": 0.06954436450839328, "loss": 3.382646083831787, "loss_ce": 0.938309907913208, "loss_xval": 2.4375, "num_input_tokens_seen": 30063288, "step": 174 }, { "epoch": 0.06994404476418865, "grad_norm": 377.7377688158846, "learning_rate": 4.677010351748694e-06, "loss": 3.2057, "num_input_tokens_seen": 30236256, "step": 175 }, { "epoch": 0.06994404476418865, "loss": 3.4559521675109863, "loss_ce": 0.8954052925109863, "loss_xval": 2.5625, "num_input_tokens_seen": 30236256, "step": 175 }, { "epoch": 0.07034372501998401, "grad_norm": 498.3446583121999, "learning_rate": 4.68217023714805e-06, "loss": 3.3281, "num_input_tokens_seen": 30408968, "step": 176 }, { "epoch": 0.07034372501998401, "loss": 4.02139139175415, "loss_ce": 0.8622116446495056, "loss_xval": 3.15625, "num_input_tokens_seen": 30408968, "step": 176 }, { "epoch": 0.07074340527577938, "grad_norm": 108.56714493238096, "learning_rate": 4.687300887912368e-06, "loss": 2.1292, "num_input_tokens_seen": 30581920, "step": 177 }, { "epoch": 0.07074340527577938, "loss": 2.317440986633301, "loss_ce": 0.8511323928833008, "loss_xval": 1.46875, "num_input_tokens_seen": 30581920, "step": 177 }, { "epoch": 0.07114308553157474, "grad_norm": 256.71895728118574, "learning_rate": 4.692402633448618e-06, "loss": 2.7499, "num_input_tokens_seen": 30754880, "step": 178 }, { "epoch": 0.07114308553157474, "loss": 2.346262216567993, "loss_ce": 0.8692113757133484, "loss_xval": 1.4765625, "num_input_tokens_seen": 30754880, "step": 178 }, { "epoch": 0.07154276578737011, "grad_norm": 110.25625300038342, "learning_rate": 4.6974757976274554e-06, "loss": 2.7188, "num_input_tokens_seen": 30927928, "step": 179 }, { "epoch": 0.07154276578737011, "loss": 2.735156774520874, "loss_ce": 0.925098180770874, "loss_xval": 1.8125, "num_input_tokens_seen": 30927928, "step": 179 }, { "epoch": 0.07194244604316546, "grad_norm": 262.0838823954474, "learning_rate": 4.702520698906601e-06, "loss": 2.5287, "num_input_tokens_seen": 31100800, "step": 180 }, { "epoch": 0.07194244604316546, "loss": 2.8189706802368164, "loss_ce": 0.9381113052368164, "loss_xval": 1.8828125, "num_input_tokens_seen": 31100800, "step": 180 }, { "epoch": 0.07234212629896083, "grad_norm": 156.72323945008685, "learning_rate": 4.707537650450795e-06, "loss": 2.6636, "num_input_tokens_seen": 31274032, "step": 181 }, { "epoch": 0.07234212629896083, "loss": 2.460793972015381, "loss_ce": 0.9273467063903809, "loss_xval": 1.53125, "num_input_tokens_seen": 31274032, "step": 181 }, { "epoch": 0.07274180655475619, "grad_norm": 128.82580048107164, "learning_rate": 4.712526960248448e-06, "loss": 2.5592, "num_input_tokens_seen": 31447248, "step": 182 }, { "epoch": 0.07274180655475619, "loss": 3.2276525497436523, "loss_ce": 0.9224766492843628, "loss_xval": 2.3125, "num_input_tokens_seen": 31447248, "step": 182 }, { "epoch": 0.07314148681055156, "grad_norm": 158.11940464942924, "learning_rate": 4.717488931225096e-06, "loss": 2.8079, "num_input_tokens_seen": 31619912, "step": 183 }, { "epoch": 0.07314148681055156, "loss": 2.667755603790283, "loss_ce": 0.7932440042495728, "loss_xval": 1.875, "num_input_tokens_seen": 31619912, "step": 183 }, { "epoch": 0.07354116706634692, "grad_norm": 182.92862063987715, "learning_rate": 4.722423861353765e-06, "loss": 2.5388, "num_input_tokens_seen": 31789160, "step": 184 }, { "epoch": 0.07354116706634692, "loss": 2.344510555267334, "loss_ce": 0.750760555267334, "loss_xval": 1.59375, "num_input_tokens_seen": 31789160, "step": 184 }, { "epoch": 0.07394084732214229, "grad_norm": 219.64439189932665, "learning_rate": 4.7273320437623414e-06, "loss": 2.2814, "num_input_tokens_seen": 31961976, "step": 185 }, { "epoch": 0.07394084732214229, "loss": 2.476329803466797, "loss_ce": 0.7595328688621521, "loss_xval": 1.71875, "num_input_tokens_seen": 31961976, "step": 185 }, { "epoch": 0.07434052757793765, "grad_norm": 99.98395758538393, "learning_rate": 4.7322137668380565e-06, "loss": 2.3144, "num_input_tokens_seen": 32135048, "step": 186 }, { "epoch": 0.07434052757793765, "loss": 2.1182146072387695, "loss_ce": 0.8057146072387695, "loss_xval": 1.3125, "num_input_tokens_seen": 32135048, "step": 186 }, { "epoch": 0.07474020783373302, "grad_norm": 221.2636081842405, "learning_rate": 4.737069314329155e-06, "loss": 2.5043, "num_input_tokens_seen": 32307616, "step": 187 }, { "epoch": 0.07474020783373302, "loss": 2.7484989166259766, "loss_ce": 0.8300418853759766, "loss_xval": 1.921875, "num_input_tokens_seen": 32307616, "step": 187 }, { "epoch": 0.07513988808952837, "grad_norm": 131.12741561544465, "learning_rate": 4.7418989654438705e-06, "loss": 2.515, "num_input_tokens_seen": 32480272, "step": 188 }, { "epoch": 0.07513988808952837, "loss": 2.4454233646392822, "loss_ce": 0.8226206302642822, "loss_xval": 1.625, "num_input_tokens_seen": 32480272, "step": 188 }, { "epoch": 0.07553956834532374, "grad_norm": 180.84951677755777, "learning_rate": 4.746702994946761e-06, "loss": 2.3479, "num_input_tokens_seen": 32653328, "step": 189 }, { "epoch": 0.07553956834532374, "loss": 2.8146772384643555, "loss_ce": 0.8351851105690002, "loss_xval": 1.9765625, "num_input_tokens_seen": 32653328, "step": 189 }, { "epoch": 0.0759392486011191, "grad_norm": 160.96373059065317, "learning_rate": 4.751481673252507e-06, "loss": 2.678, "num_input_tokens_seen": 32826176, "step": 190 }, { "epoch": 0.0759392486011191, "loss": 2.8481264114379883, "loss_ce": 0.8188296556472778, "loss_xval": 2.03125, "num_input_tokens_seen": 32826176, "step": 190 }, { "epoch": 0.07633892885691447, "grad_norm": 121.61672489552205, "learning_rate": 4.756235266517256e-06, "loss": 2.3304, "num_input_tokens_seen": 32999312, "step": 191 }, { "epoch": 0.07633892885691447, "loss": 2.3364672660827637, "loss_ce": 0.7788498997688293, "loss_xval": 1.5546875, "num_input_tokens_seen": 32999312, "step": 191 }, { "epoch": 0.07673860911270983, "grad_norm": 130.32186035710663, "learning_rate": 4.7609640367275626e-06, "loss": 2.2134, "num_input_tokens_seen": 33172008, "step": 192 }, { "epoch": 0.07673860911270983, "loss": 1.7896391153335571, "loss_ce": 0.7471586465835571, "loss_xval": 1.0390625, "num_input_tokens_seen": 33172008, "step": 192 }, { "epoch": 0.0771382893685052, "grad_norm": 143.76906168760766, "learning_rate": 4.765668241787041e-06, "loss": 2.4567, "num_input_tokens_seen": 33344800, "step": 193 }, { "epoch": 0.0771382893685052, "loss": 2.387838840484619, "loss_ce": 0.82729172706604, "loss_xval": 1.5625, "num_input_tokens_seen": 33344800, "step": 193 }, { "epoch": 0.07753796962430055, "grad_norm": 127.81565220139322, "learning_rate": 4.770348135600763e-06, "loss": 2.77, "num_input_tokens_seen": 33517848, "step": 194 }, { "epoch": 0.07753796962430055, "loss": 2.490429401397705, "loss_ce": 0.7819331884384155, "loss_xval": 1.7109375, "num_input_tokens_seen": 33517848, "step": 194 }, { "epoch": 0.07793764988009592, "grad_norm": 138.22926534009375, "learning_rate": 4.775003968157493e-06, "loss": 2.1771, "num_input_tokens_seen": 33690816, "step": 195 }, { "epoch": 0.07793764988009592, "loss": 1.85850191116333, "loss_ce": 0.7606015801429749, "loss_xval": 1.1015625, "num_input_tokens_seen": 33690816, "step": 195 }, { "epoch": 0.07833733013589128, "grad_norm": 202.1418801789714, "learning_rate": 4.779635985609814e-06, "loss": 2.3614, "num_input_tokens_seen": 33864144, "step": 196 }, { "epoch": 0.07833733013589128, "loss": 2.429080009460449, "loss_ce": 0.794070303440094, "loss_xval": 1.6328125, "num_input_tokens_seen": 33864144, "step": 196 }, { "epoch": 0.07873701039168665, "grad_norm": 167.9534199119472, "learning_rate": 4.784244430352227e-06, "loss": 2.1861, "num_input_tokens_seen": 34036976, "step": 197 }, { "epoch": 0.07873701039168665, "loss": 2.099493980407715, "loss_ce": 0.7598943710327148, "loss_xval": 1.3359375, "num_input_tokens_seen": 34036976, "step": 197 }, { "epoch": 0.07913669064748201, "grad_norm": 177.86964362591564, "learning_rate": 4.788829541097253e-06, "loss": 2.3694, "num_input_tokens_seen": 34209880, "step": 198 }, { "epoch": 0.07913669064748201, "loss": 2.0491485595703125, "loss_ce": 0.7351836562156677, "loss_xval": 1.3125, "num_input_tokens_seen": 34209880, "step": 198 }, { "epoch": 0.07953637090327738, "grad_norm": 95.02167898698566, "learning_rate": 4.793391552949641e-06, "loss": 2.102, "num_input_tokens_seen": 34382608, "step": 199 }, { "epoch": 0.07953637090327738, "loss": 2.334108829498291, "loss_ce": 0.6905540227890015, "loss_xval": 1.640625, "num_input_tokens_seen": 34382608, "step": 199 }, { "epoch": 0.07993605115907274, "grad_norm": 168.91885358506565, "learning_rate": 4.797930697478699e-06, "loss": 2.0532, "num_input_tokens_seen": 34555560, "step": 200 }, { "epoch": 0.07993605115907274, "loss": 2.0277481079101562, "loss_ce": 0.6434707641601562, "loss_xval": 1.3828125, "num_input_tokens_seen": 34555560, "step": 200 }, { "epoch": 0.0803357314148681, "grad_norm": 351.41292703035435, "learning_rate": 4.802447202788829e-06, "loss": 2.9673, "num_input_tokens_seen": 34728224, "step": 201 }, { "epoch": 0.0803357314148681, "loss": 3.7609810829162598, "loss_ce": 0.6657663583755493, "loss_xval": 3.09375, "num_input_tokens_seen": 34728224, "step": 201 }, { "epoch": 0.08073541167066348, "grad_norm": 442.343019639602, "learning_rate": 4.806941293588307e-06, "loss": 2.691, "num_input_tokens_seen": 34901368, "step": 202 }, { "epoch": 0.08073541167066348, "loss": 2.485349178314209, "loss_ce": 0.7714818716049194, "loss_xval": 1.7109375, "num_input_tokens_seen": 34901368, "step": 202 }, { "epoch": 0.08113509192645883, "grad_norm": 132.89620507776843, "learning_rate": 4.8114131912563735e-06, "loss": 2.0727, "num_input_tokens_seen": 35074232, "step": 203 }, { "epoch": 0.08113509192645883, "loss": 2.1044516563415527, "loss_ce": 0.7382407784461975, "loss_xval": 1.3671875, "num_input_tokens_seen": 35074232, "step": 203 }, { "epoch": 0.0815347721822542, "grad_norm": 348.2140794906234, "learning_rate": 4.815863113908667e-06, "loss": 2.3281, "num_input_tokens_seen": 35247568, "step": 204 }, { "epoch": 0.0815347721822542, "loss": 2.4087095260620117, "loss_ce": 0.7719906568527222, "loss_xval": 1.640625, "num_input_tokens_seen": 35247568, "step": 204 }, { "epoch": 0.08193445243804956, "grad_norm": 85.7039982973391, "learning_rate": 4.8202912764610565e-06, "loss": 2.7482, "num_input_tokens_seen": 35417112, "step": 205 }, { "epoch": 0.08193445243804956, "loss": 2.6601197719573975, "loss_ce": 0.737268328666687, "loss_xval": 1.921875, "num_input_tokens_seen": 35417112, "step": 205 }, { "epoch": 0.08233413269384493, "grad_norm": 356.1569878617076, "learning_rate": 4.82469789069193e-06, "loss": 2.7038, "num_input_tokens_seen": 35589848, "step": 206 }, { "epoch": 0.08233413269384493, "loss": 2.493128538131714, "loss_ce": 0.7455699443817139, "loss_xval": 1.75, "num_input_tokens_seen": 35589848, "step": 206 }, { "epoch": 0.08273381294964029, "grad_norm": 138.50940557588262, "learning_rate": 4.829083165302968e-06, "loss": 2.0245, "num_input_tokens_seen": 35762768, "step": 207 }, { "epoch": 0.08273381294964029, "loss": 1.909895658493042, "loss_ce": 0.7660967111587524, "loss_xval": 1.140625, "num_input_tokens_seen": 35762768, "step": 207 }, { "epoch": 0.08313349320543566, "grad_norm": 525.8835783623617, "learning_rate": 4.833447305978453e-06, "loss": 2.669, "num_input_tokens_seen": 35935712, "step": 208 }, { "epoch": 0.08313349320543566, "loss": 2.8658926486968994, "loss_ce": 0.741869330406189, "loss_xval": 2.125, "num_input_tokens_seen": 35935712, "step": 208 }, { "epoch": 0.08353317346123101, "grad_norm": 117.03141921519646, "learning_rate": 4.83779051544316e-06, "loss": 1.9704, "num_input_tokens_seen": 36108680, "step": 209 }, { "epoch": 0.08353317346123101, "loss": 2.2376246452331543, "loss_ce": 0.7322536706924438, "loss_xval": 1.5078125, "num_input_tokens_seen": 36108680, "step": 209 }, { "epoch": 0.08393285371702638, "grad_norm": 319.2968263111755, "learning_rate": 4.842112993518858e-06, "loss": 2.3714, "num_input_tokens_seen": 36281832, "step": 210 }, { "epoch": 0.08393285371702638, "loss": 2.113748788833618, "loss_ce": 0.7533972263336182, "loss_xval": 1.359375, "num_input_tokens_seen": 36281832, "step": 210 }, { "epoch": 0.08433253397282174, "grad_norm": 89.919306127746, "learning_rate": 4.846414937179485e-06, "loss": 2.0618, "num_input_tokens_seen": 36454648, "step": 211 }, { "epoch": 0.08433253397282174, "loss": 2.09275484085083, "loss_ce": 0.7450986504554749, "loss_xval": 1.34375, "num_input_tokens_seen": 36454648, "step": 211 }, { "epoch": 0.08473221422861711, "grad_norm": 314.58780968384843, "learning_rate": 4.850696540604993e-06, "loss": 2.3359, "num_input_tokens_seen": 36627424, "step": 212 }, { "epoch": 0.08473221422861711, "loss": 2.438669443130493, "loss_ce": 0.7609350681304932, "loss_xval": 1.6796875, "num_input_tokens_seen": 36627424, "step": 212 }, { "epoch": 0.08513189448441247, "grad_norm": 107.93286926725172, "learning_rate": 4.854957995233956e-06, "loss": 2.1791, "num_input_tokens_seen": 36800224, "step": 213 }, { "epoch": 0.08513189448441247, "loss": 2.3825843334198, "loss_ce": 0.8034826517105103, "loss_xval": 1.578125, "num_input_tokens_seen": 36800224, "step": 213 }, { "epoch": 0.08553157474020784, "grad_norm": 214.7337618475198, "learning_rate": 4.859199489814922e-06, "loss": 1.6366, "num_input_tokens_seen": 36973008, "step": 214 }, { "epoch": 0.08553157474020784, "loss": 1.71268630027771, "loss_ce": 0.7490633726119995, "loss_xval": 0.96484375, "num_input_tokens_seen": 36973008, "step": 214 }, { "epoch": 0.0859312549960032, "grad_norm": 149.02179568380078, "learning_rate": 4.863421210456582e-06, "loss": 2.0696, "num_input_tokens_seen": 37146168, "step": 215 }, { "epoch": 0.0859312549960032, "loss": 2.094494581222534, "loss_ce": 0.7087523937225342, "loss_xval": 1.3828125, "num_input_tokens_seen": 37146168, "step": 215 }, { "epoch": 0.08633093525179857, "grad_norm": 176.86093117438182, "learning_rate": 4.867623340676766e-06, "loss": 2.1813, "num_input_tokens_seen": 37319032, "step": 216 }, { "epoch": 0.08633093525179857, "loss": 2.461796283721924, "loss_ce": 0.6553997993469238, "loss_xval": 1.8046875, "num_input_tokens_seen": 37319032, "step": 216 }, { "epoch": 0.08673061550759392, "grad_norm": 80.29115001940868, "learning_rate": 4.871806061450314e-06, "loss": 1.9519, "num_input_tokens_seen": 37490120, "step": 217 }, { "epoch": 0.08673061550759392, "loss": 2.127835273742676, "loss_ce": 0.6957064270973206, "loss_xval": 1.4296875, "num_input_tokens_seen": 37490120, "step": 217 }, { "epoch": 0.08713029576338929, "grad_norm": 129.0998061036388, "learning_rate": 4.875969551255842e-06, "loss": 2.172, "num_input_tokens_seen": 37663176, "step": 218 }, { "epoch": 0.08713029576338929, "loss": 1.8447304964065552, "loss_ce": 0.7093545198440552, "loss_xval": 1.1328125, "num_input_tokens_seen": 37663176, "step": 218 }, { "epoch": 0.08752997601918465, "grad_norm": 85.55333189294885, "learning_rate": 4.8801139861214465e-06, "loss": 2.1141, "num_input_tokens_seen": 37835776, "step": 219 }, { "epoch": 0.08752997601918465, "loss": 1.9705400466918945, "loss_ce": 0.6758623123168945, "loss_xval": 1.296875, "num_input_tokens_seen": 37835776, "step": 219 }, { "epoch": 0.08792965627498002, "grad_norm": 175.8233267376353, "learning_rate": 4.884239539669352e-06, "loss": 1.8671, "num_input_tokens_seen": 38008872, "step": 220 }, { "epoch": 0.08792965627498002, "loss": 1.9375450611114502, "loss_ce": 0.6650841236114502, "loss_xval": 1.2734375, "num_input_tokens_seen": 38008872, "step": 220 }, { "epoch": 0.08832933653077538, "grad_norm": 145.7069843294875, "learning_rate": 4.888346383159558e-06, "loss": 2.1846, "num_input_tokens_seen": 38181760, "step": 221 }, { "epoch": 0.08832933653077538, "loss": 2.246717929840088, "loss_ce": 0.6512590646743774, "loss_xval": 1.59375, "num_input_tokens_seen": 38181760, "step": 221 }, { "epoch": 0.08872901678657075, "grad_norm": 191.30221701933354, "learning_rate": 4.892434685532505e-06, "loss": 2.1738, "num_input_tokens_seen": 38355136, "step": 222 }, { "epoch": 0.08872901678657075, "loss": 2.435502052307129, "loss_ce": 0.6669473648071289, "loss_xval": 1.765625, "num_input_tokens_seen": 38355136, "step": 222 }, { "epoch": 0.0891286970423661, "grad_norm": 327.47100310412145, "learning_rate": 4.896504613450767e-06, "loss": 2.1095, "num_input_tokens_seen": 38524896, "step": 223 }, { "epoch": 0.0891286970423661, "loss": 2.2737462520599365, "loss_ce": 0.646793007850647, "loss_xval": 1.625, "num_input_tokens_seen": 38524896, "step": 223 }, { "epoch": 0.08952837729816147, "grad_norm": 134.50077594904644, "learning_rate": 4.900556331339819e-06, "loss": 1.9673, "num_input_tokens_seen": 38697752, "step": 224 }, { "epoch": 0.08952837729816147, "loss": 1.9093546867370605, "loss_ce": 0.6168742179870605, "loss_xval": 1.2890625, "num_input_tokens_seen": 38697752, "step": 224 }, { "epoch": 0.08992805755395683, "grad_norm": 266.2162340533468, "learning_rate": 4.904590001427903e-06, "loss": 2.0047, "num_input_tokens_seen": 38870744, "step": 225 }, { "epoch": 0.08992805755395683, "loss": 1.9634662866592407, "loss_ce": 0.619227945804596, "loss_xval": 1.34375, "num_input_tokens_seen": 38870744, "step": 225 }, { "epoch": 0.0903277378097522, "grad_norm": 297.0352644872155, "learning_rate": 4.908605783784996e-06, "loss": 2.0244, "num_input_tokens_seen": 39043776, "step": 226 }, { "epoch": 0.0903277378097522, "loss": 1.9414169788360596, "loss_ce": 0.63819420337677, "loss_xval": 1.3046875, "num_input_tokens_seen": 39043776, "step": 226 }, { "epoch": 0.09072741806554756, "grad_norm": 69.86260872464577, "learning_rate": 4.912603836360931e-06, "loss": 1.8326, "num_input_tokens_seen": 39216696, "step": 227 }, { "epoch": 0.09072741806554756, "loss": 1.9535454511642456, "loss_ce": 0.583916425704956, "loss_xval": 1.3671875, "num_input_tokens_seen": 39216696, "step": 227 }, { "epoch": 0.09112709832134293, "grad_norm": 188.86005864559968, "learning_rate": 4.916584315022672e-06, "loss": 1.8476, "num_input_tokens_seen": 39389624, "step": 228 }, { "epoch": 0.09112709832134293, "loss": 1.6824400424957275, "loss_ce": 0.5689146518707275, "loss_xval": 1.1171875, "num_input_tokens_seen": 39389624, "step": 228 }, { "epoch": 0.09152677857713828, "grad_norm": 104.91403215825835, "learning_rate": 4.920547373590778e-06, "loss": 1.9768, "num_input_tokens_seen": 39562616, "step": 229 }, { "epoch": 0.09152677857713828, "loss": 2.0511388778686523, "loss_ce": 0.6551427245140076, "loss_xval": 1.3984375, "num_input_tokens_seen": 39562616, "step": 229 }, { "epoch": 0.09192645883293366, "grad_norm": 138.43126034850636, "learning_rate": 4.924493163875066e-06, "loss": 1.6764, "num_input_tokens_seen": 39735632, "step": 230 }, { "epoch": 0.09192645883293366, "loss": 1.7603843212127686, "loss_ce": 0.5760581493377686, "loss_xval": 1.1875, "num_input_tokens_seen": 39735632, "step": 230 }, { "epoch": 0.09232613908872901, "grad_norm": 112.73730815561031, "learning_rate": 4.92842183570951e-06, "loss": 2.2555, "num_input_tokens_seen": 39908488, "step": 231 }, { "epoch": 0.09232613908872901, "loss": 2.1147522926330566, "loss_ce": 0.5903382301330566, "loss_xval": 1.5234375, "num_input_tokens_seen": 39908488, "step": 231 }, { "epoch": 0.09272581934452438, "grad_norm": 210.33274716994967, "learning_rate": 4.932333536986379e-06, "loss": 1.8486, "num_input_tokens_seen": 40081488, "step": 232 }, { "epoch": 0.09272581934452438, "loss": 1.7108758687973022, "loss_ce": 0.604186475276947, "loss_xval": 1.109375, "num_input_tokens_seen": 40081488, "step": 232 }, { "epoch": 0.09312549960031974, "grad_norm": 295.7627724529692, "learning_rate": 4.936228413689641e-06, "loss": 2.1929, "num_input_tokens_seen": 40254872, "step": 233 }, { "epoch": 0.09312549960031974, "loss": 2.279324531555176, "loss_ce": 0.6152620315551758, "loss_xval": 1.6640625, "num_input_tokens_seen": 40254872, "step": 233 }, { "epoch": 0.09352517985611511, "grad_norm": 94.5253042766975, "learning_rate": 4.940106609927657e-06, "loss": 1.8654, "num_input_tokens_seen": 40428056, "step": 234 }, { "epoch": 0.09352517985611511, "loss": 1.7779114246368408, "loss_ce": 0.5865051746368408, "loss_xval": 1.1875, "num_input_tokens_seen": 40428056, "step": 234 }, { "epoch": 0.09392486011191047, "grad_norm": 218.99104092091028, "learning_rate": 4.943968267965172e-06, "loss": 1.9661, "num_input_tokens_seen": 40600888, "step": 235 }, { "epoch": 0.09392486011191047, "loss": 2.0361690521240234, "loss_ce": 0.5542352199554443, "loss_xval": 1.484375, "num_input_tokens_seen": 40600888, "step": 235 }, { "epoch": 0.09432454036770584, "grad_norm": 166.17032002216988, "learning_rate": 4.947813528254631e-06, "loss": 2.1058, "num_input_tokens_seen": 40773440, "step": 236 }, { "epoch": 0.09432454036770584, "loss": 1.6222901344299316, "loss_ce": 0.5912842154502869, "loss_xval": 1.03125, "num_input_tokens_seen": 40773440, "step": 236 }, { "epoch": 0.09472422062350119, "grad_norm": 84.32664481341867, "learning_rate": 4.95164252946683e-06, "loss": 1.5917, "num_input_tokens_seen": 40946384, "step": 237 }, { "epoch": 0.09472422062350119, "loss": 1.9710441827774048, "loss_ce": 0.56601482629776, "loss_xval": 1.40625, "num_input_tokens_seen": 40946384, "step": 237 }, { "epoch": 0.09512390087929656, "grad_norm": 155.71448367542862, "learning_rate": 4.955455408520925e-06, "loss": 1.4781, "num_input_tokens_seen": 41119280, "step": 238 }, { "epoch": 0.09512390087929656, "loss": 1.3114783763885498, "loss_ce": 0.5365760326385498, "loss_xval": 0.7734375, "num_input_tokens_seen": 41119280, "step": 238 }, { "epoch": 0.09552358113509192, "grad_norm": 107.51330242252179, "learning_rate": 4.959252300613805e-06, "loss": 2.1855, "num_input_tokens_seen": 41291848, "step": 239 }, { "epoch": 0.09552358113509192, "loss": 2.2059006690979004, "loss_ce": 0.5362229347229004, "loss_xval": 1.671875, "num_input_tokens_seen": 41291848, "step": 239 }, { "epoch": 0.09592326139088729, "grad_norm": 62.6627619626531, "learning_rate": 4.963033339248863e-06, "loss": 1.7001, "num_input_tokens_seen": 41464768, "step": 240 }, { "epoch": 0.09592326139088729, "loss": 1.9659799337387085, "loss_ce": 0.5311654210090637, "loss_xval": 1.4375, "num_input_tokens_seen": 41464768, "step": 240 }, { "epoch": 0.09632294164668265, "grad_norm": 140.82973218984645, "learning_rate": 4.96679865626416e-06, "loss": 1.886, "num_input_tokens_seen": 41637768, "step": 241 }, { "epoch": 0.09632294164668265, "loss": 2.0564374923706055, "loss_ce": 0.550822377204895, "loss_xval": 1.5078125, "num_input_tokens_seen": 41637768, "step": 241 }, { "epoch": 0.09672262190247802, "grad_norm": 418.2780183282535, "learning_rate": 4.970548381860003e-06, "loss": 1.9494, "num_input_tokens_seen": 41811136, "step": 242 }, { "epoch": 0.09672262190247802, "loss": 2.3037490844726562, "loss_ce": 0.5615614652633667, "loss_xval": 1.7421875, "num_input_tokens_seen": 41811136, "step": 242 }, { "epoch": 0.09712230215827339, "grad_norm": 696.7436633406282, "learning_rate": 4.974282644625969e-06, "loss": 2.7664, "num_input_tokens_seen": 41983952, "step": 243 }, { "epoch": 0.09712230215827339, "loss": 3.1029319763183594, "loss_ce": 0.5375022888183594, "loss_xval": 2.5625, "num_input_tokens_seen": 41983952, "step": 243 }, { "epoch": 0.09752198241406874, "grad_norm": 650.7634262460341, "learning_rate": 4.978001571567359e-06, "loss": 2.7999, "num_input_tokens_seen": 42156848, "step": 244 }, { "epoch": 0.09752198241406874, "loss": 2.580700397491455, "loss_ce": 0.5709348917007446, "loss_xval": 2.015625, "num_input_tokens_seen": 42156848, "step": 244 }, { "epoch": 0.09792166266986412, "grad_norm": 88.52292597475672, "learning_rate": 4.981705288131116e-06, "loss": 1.7696, "num_input_tokens_seen": 42329736, "step": 245 }, { "epoch": 0.09792166266986412, "loss": 1.907859206199646, "loss_ce": 0.625144362449646, "loss_xval": 1.28125, "num_input_tokens_seen": 42329736, "step": 245 }, { "epoch": 0.09832134292565947, "grad_norm": 462.59926637475985, "learning_rate": 4.98539391823122e-06, "loss": 2.623, "num_input_tokens_seen": 42502616, "step": 246 }, { "epoch": 0.09832134292565947, "loss": 2.2705206871032715, "loss_ce": 0.6870246529579163, "loss_xval": 1.5859375, "num_input_tokens_seen": 42502616, "step": 246 }, { "epoch": 0.09872102318145484, "grad_norm": 219.64091631843007, "learning_rate": 4.989067584273563e-06, "loss": 2.1558, "num_input_tokens_seen": 42675480, "step": 247 }, { "epoch": 0.09872102318145484, "loss": 2.346247911453247, "loss_ce": 0.6250565648078918, "loss_xval": 1.71875, "num_input_tokens_seen": 42675480, "step": 247 }, { "epoch": 0.0991207034372502, "grad_norm": 452.2760431268779, "learning_rate": 4.992726407180318e-06, "loss": 2.4239, "num_input_tokens_seen": 42848424, "step": 248 }, { "epoch": 0.0991207034372502, "loss": 2.3382084369659424, "loss_ce": 0.6443606615066528, "loss_xval": 1.6953125, "num_input_tokens_seen": 42848424, "step": 248 }, { "epoch": 0.09952038369304557, "grad_norm": 289.58749315357915, "learning_rate": 4.996370506413826e-06, "loss": 2.1094, "num_input_tokens_seen": 43021520, "step": 249 }, { "epoch": 0.09952038369304557, "loss": 2.083463191986084, "loss_ce": 0.6484045386314392, "loss_xval": 1.4375, "num_input_tokens_seen": 43021520, "step": 249 }, { "epoch": 0.09992006394884093, "grad_norm": 305.5139012775951, "learning_rate": 5e-06, "loss": 2.3916, "num_input_tokens_seen": 43194472, "step": 250 }, { "epoch": 0.09992006394884093, "eval_websight_new_IoU": 0.02511245897039771, "eval_websight_new_MAE_all": 0.06440733931958675, "eval_websight_new_MAE_h": 0.030316845513880253, "eval_websight_new_MAE_w": 0.1007080115377903, "eval_websight_new_MAE_x": 0.058023618534207344, "eval_websight_new_MAE_y": 0.06858088076114655, "eval_websight_new_NUM_probability": 0.0004394065181259066, "eval_websight_new_inside_bbox": 0.1302083358168602, "eval_websight_new_loss": 1.9591528177261353, "eval_websight_new_loss_ce": 0.8309407234191895, "eval_websight_new_loss_xval": 0.973876953125, "eval_websight_new_runtime": 59.2945, "eval_websight_new_samples_per_second": 0.843, "eval_websight_new_steps_per_second": 0.034, "num_input_tokens_seen": 43194472, "step": 250 }, { "epoch": 0.09992006394884093, "eval_seeclick_IoU": 0.0937136560678482, "eval_seeclick_MAE_all": 0.11204610392451286, "eval_seeclick_MAE_h": 0.04166124016046524, "eval_seeclick_MAE_w": 0.16875187307596207, "eval_seeclick_MAE_x": 0.1465640515089035, "eval_seeclick_MAE_y": 0.09120727330446243, "eval_seeclick_NUM_probability": 0.00042376687633804977, "eval_seeclick_inside_bbox": 0.2517361119389534, "eval_seeclick_loss": 4.182728290557861, "eval_seeclick_loss_ce": 0.9389870762825012, "eval_seeclick_loss_xval": 3.177978515625, "eval_seeclick_runtime": 89.2398, "eval_seeclick_samples_per_second": 0.56, "eval_seeclick_steps_per_second": 0.022, "num_input_tokens_seen": 43194472, "step": 250 }, { "epoch": 0.09992006394884093, "eval_icons_IoU": 0.0013925364146416541, "eval_icons_MAE_all": 0.053750623017549515, "eval_icons_MAE_h": 0.015283203683793545, "eval_icons_MAE_w": 0.02879231609404087, "eval_icons_MAE_x": 0.09461009502410889, "eval_icons_MAE_y": 0.07631688378751278, "eval_icons_NUM_probability": 0.0005365281249396503, "eval_icons_inside_bbox": 0.02777777798473835, "eval_icons_loss": 1.4559746980667114, "eval_icons_loss_ce": 0.7883208990097046, "eval_icons_loss_xval": 0.63275146484375, "eval_icons_runtime": 83.7242, "eval_icons_samples_per_second": 0.597, "eval_icons_steps_per_second": 0.024, "num_input_tokens_seen": 43194472, "step": 250 }, { "epoch": 0.09992006394884093, "loss": 1.3925867080688477, "loss_ce": 0.7909021377563477, "loss_xval": 0.6015625, "num_input_tokens_seen": 43194472, "step": 250 }, { "epoch": 0.1003197442046363, "grad_norm": 420.7423871157765, "learning_rate": 5e-06, "loss": 2.5456, "num_input_tokens_seen": 43367312, "step": 251 }, { "epoch": 0.1003197442046363, "loss": 2.7929365634918213, "loss_ce": 0.6552413105964661, "loss_xval": 2.140625, "num_input_tokens_seen": 43367312, "step": 251 }, { "epoch": 0.10071942446043165, "grad_norm": 7157.590435326808, "learning_rate": 5e-06, "loss": 3.7782, "num_input_tokens_seen": 43540136, "step": 252 }, { "epoch": 0.10071942446043165, "loss": 3.858090877532959, "loss_ce": 0.6344579458236694, "loss_xval": 3.21875, "num_input_tokens_seen": 43540136, "step": 252 }, { "epoch": 0.10111910471622702, "grad_norm": 1371.1194141036922, "learning_rate": 5e-06, "loss": 8.4469, "num_input_tokens_seen": 43713352, "step": 253 }, { "epoch": 0.10111910471622702, "loss": 7.681779861450195, "loss_ce": 1.0880297422409058, "loss_xval": 6.59375, "num_input_tokens_seen": 43713352, "step": 253 }, { "epoch": 0.10151878497202238, "grad_norm": 389.4763597106653, "learning_rate": 5e-06, "loss": 5.2771, "num_input_tokens_seen": 43886232, "step": 254 }, { "epoch": 0.10151878497202238, "loss": 5.221000671386719, "loss_ce": 1.2112352848052979, "loss_xval": 4.0, "num_input_tokens_seen": 43886232, "step": 254 }, { "epoch": 0.10191846522781775, "grad_norm": 1409.542196888878, "learning_rate": 5e-06, "loss": 7.2904, "num_input_tokens_seen": 44055544, "step": 255 }, { "epoch": 0.10191846522781775, "loss": 7.3611297607421875, "loss_ce": 1.2146453857421875, "loss_xval": 6.15625, "num_input_tokens_seen": 44055544, "step": 255 }, { "epoch": 0.10231814548361311, "grad_norm": 324.9012802617059, "learning_rate": 5e-06, "loss": 5.3018, "num_input_tokens_seen": 44228576, "step": 256 }, { "epoch": 0.10231814548361311, "loss": 5.863863945007324, "loss_ce": 1.2427700757980347, "loss_xval": 4.625, "num_input_tokens_seen": 44228576, "step": 256 }, { "epoch": 0.10271782573940848, "grad_norm": 1193.3422190631172, "learning_rate": 5e-06, "loss": 6.202, "num_input_tokens_seen": 44401408, "step": 257 }, { "epoch": 0.10271782573940848, "loss": 6.257023811340332, "loss_ce": 1.2023365497589111, "loss_xval": 5.0625, "num_input_tokens_seen": 44401408, "step": 257 }, { "epoch": 0.10311750599520383, "grad_norm": 618.3582068326372, "learning_rate": 5e-06, "loss": 4.5653, "num_input_tokens_seen": 44574456, "step": 258 }, { "epoch": 0.10311750599520383, "loss": 4.771925449371338, "loss_ce": 1.170362949371338, "loss_xval": 3.59375, "num_input_tokens_seen": 44574456, "step": 258 }, { "epoch": 0.1035171862509992, "grad_norm": 519.3942566880261, "learning_rate": 5e-06, "loss": 4.8413, "num_input_tokens_seen": 44747248, "step": 259 }, { "epoch": 0.1035171862509992, "loss": 4.076366901397705, "loss_ce": 1.152538537979126, "loss_xval": 2.921875, "num_input_tokens_seen": 44747248, "step": 259 }, { "epoch": 0.10391686650679456, "grad_norm": 794.491302525678, "learning_rate": 5e-06, "loss": 5.0691, "num_input_tokens_seen": 44920312, "step": 260 }, { "epoch": 0.10391686650679456, "loss": 5.723645210266113, "loss_ce": 1.1572389602661133, "loss_xval": 4.5625, "num_input_tokens_seen": 44920312, "step": 260 }, { "epoch": 0.10431654676258993, "grad_norm": 233.2312194034501, "learning_rate": 5e-06, "loss": 3.5956, "num_input_tokens_seen": 45093192, "step": 261 }, { "epoch": 0.10431654676258993, "loss": 3.4732885360717773, "loss_ce": 1.1500463485717773, "loss_xval": 2.328125, "num_input_tokens_seen": 45093192, "step": 261 }, { "epoch": 0.10471622701838529, "grad_norm": 400.30987966580153, "learning_rate": 5e-06, "loss": 4.4252, "num_input_tokens_seen": 45266064, "step": 262 }, { "epoch": 0.10471622701838529, "loss": 4.673203945159912, "loss_ce": 1.167344331741333, "loss_xval": 3.5, "num_input_tokens_seen": 45266064, "step": 262 }, { "epoch": 0.10511590727418066, "grad_norm": 546.3231363919651, "learning_rate": 5e-06, "loss": 4.737, "num_input_tokens_seen": 45439016, "step": 263 }, { "epoch": 0.10511590727418066, "loss": 5.174367904663086, "loss_ce": 1.1450711488723755, "loss_xval": 4.03125, "num_input_tokens_seen": 45439016, "step": 263 }, { "epoch": 0.10551558752997602, "grad_norm": 241.3773646667893, "learning_rate": 5e-06, "loss": 3.0476, "num_input_tokens_seen": 45612712, "step": 264 }, { "epoch": 0.10551558752997602, "loss": 3.1645121574401855, "loss_ce": 1.1503520011901855, "loss_xval": 2.015625, "num_input_tokens_seen": 45612712, "step": 264 }, { "epoch": 0.10591526778577139, "grad_norm": 291.824625291368, "learning_rate": 5e-06, "loss": 3.1524, "num_input_tokens_seen": 45785736, "step": 265 }, { "epoch": 0.10591526778577139, "loss": 3.5120248794555664, "loss_ce": 1.1692512035369873, "loss_xval": 2.34375, "num_input_tokens_seen": 45785736, "step": 265 }, { "epoch": 0.10631494804156674, "grad_norm": 323.654467074144, "learning_rate": 5e-06, "loss": 2.7766, "num_input_tokens_seen": 45958904, "step": 266 }, { "epoch": 0.10631494804156674, "loss": 2.4206995964050293, "loss_ce": 1.1362760066986084, "loss_xval": 1.28125, "num_input_tokens_seen": 45958904, "step": 266 }, { "epoch": 0.10671462829736211, "grad_norm": 261.4508945724977, "learning_rate": 5e-06, "loss": 2.9974, "num_input_tokens_seen": 46132264, "step": 267 }, { "epoch": 0.10671462829736211, "loss": 2.7609076499938965, "loss_ce": 1.150556206703186, "loss_xval": 1.609375, "num_input_tokens_seen": 46132264, "step": 267 }, { "epoch": 0.10711430855315747, "grad_norm": 225.15151507021258, "learning_rate": 5e-06, "loss": 2.637, "num_input_tokens_seen": 46305184, "step": 268 }, { "epoch": 0.10711430855315747, "loss": 2.379305362701416, "loss_ce": 1.1512782573699951, "loss_xval": 1.2265625, "num_input_tokens_seen": 46305184, "step": 268 }, { "epoch": 0.10751398880895284, "grad_norm": 296.01349816516694, "learning_rate": 5e-06, "loss": 2.8834, "num_input_tokens_seen": 46478368, "step": 269 }, { "epoch": 0.10751398880895284, "loss": 3.2463014125823975, "loss_ce": 1.113977074623108, "loss_xval": 2.125, "num_input_tokens_seen": 46478368, "step": 269 }, { "epoch": 0.1079136690647482, "grad_norm": 155.6721663099127, "learning_rate": 5e-06, "loss": 3.2604, "num_input_tokens_seen": 46651192, "step": 270 }, { "epoch": 0.1079136690647482, "loss": 3.4313535690307617, "loss_ce": 1.0978577136993408, "loss_xval": 2.328125, "num_input_tokens_seen": 46651192, "step": 270 }, { "epoch": 0.10831334932054357, "grad_norm": 159.3586976784072, "learning_rate": 5e-06, "loss": 2.9097, "num_input_tokens_seen": 46823960, "step": 271 }, { "epoch": 0.10831334932054357, "loss": 2.574904441833496, "loss_ce": 1.068800926208496, "loss_xval": 1.5078125, "num_input_tokens_seen": 46823960, "step": 271 }, { "epoch": 0.10871302957633892, "grad_norm": 100.09567673766682, "learning_rate": 5e-06, "loss": 2.8126, "num_input_tokens_seen": 46996704, "step": 272 }, { "epoch": 0.10871302957633892, "loss": 3.1134049892425537, "loss_ce": 1.0472428798675537, "loss_xval": 2.0625, "num_input_tokens_seen": 46996704, "step": 272 }, { "epoch": 0.1091127098321343, "grad_norm": 180.69118269302496, "learning_rate": 5e-06, "loss": 2.3287, "num_input_tokens_seen": 47169304, "step": 273 }, { "epoch": 0.1091127098321343, "loss": 2.5099315643310547, "loss_ce": 1.0426464080810547, "loss_xval": 1.46875, "num_input_tokens_seen": 47169304, "step": 273 }, { "epoch": 0.10951239008792965, "grad_norm": 88.69240950555843, "learning_rate": 5e-06, "loss": 2.7934, "num_input_tokens_seen": 47342552, "step": 274 }, { "epoch": 0.10951239008792965, "loss": 3.1992688179016113, "loss_ce": 1.0791513919830322, "loss_xval": 2.125, "num_input_tokens_seen": 47342552, "step": 274 }, { "epoch": 0.10991207034372502, "grad_norm": 68.31354191387534, "learning_rate": 5e-06, "loss": 2.3395, "num_input_tokens_seen": 47515488, "step": 275 }, { "epoch": 0.10991207034372502, "loss": 2.1528122425079346, "loss_ce": 1.0068161487579346, "loss_xval": 1.1484375, "num_input_tokens_seen": 47515488, "step": 275 }, { "epoch": 0.11031175059952038, "grad_norm": 98.29379090415762, "learning_rate": 5e-06, "loss": 2.5491, "num_input_tokens_seen": 47687864, "step": 276 }, { "epoch": 0.11031175059952038, "loss": 2.4284067153930664, "loss_ce": 1.0075082778930664, "loss_xval": 1.421875, "num_input_tokens_seen": 47687864, "step": 276 }, { "epoch": 0.11071143085531575, "grad_norm": 208.59052885258336, "learning_rate": 5e-06, "loss": 2.4267, "num_input_tokens_seen": 47860776, "step": 277 }, { "epoch": 0.11071143085531575, "loss": 2.307206630706787, "loss_ce": 0.9878706932067871, "loss_xval": 1.3203125, "num_input_tokens_seen": 47860776, "step": 277 }, { "epoch": 0.1111111111111111, "grad_norm": 64.62028957246228, "learning_rate": 5e-06, "loss": 1.8629, "num_input_tokens_seen": 48033416, "step": 278 }, { "epoch": 0.1111111111111111, "loss": 1.5986448526382446, "loss_ce": 0.9834105372428894, "loss_xval": 0.6171875, "num_input_tokens_seen": 48033416, "step": 278 }, { "epoch": 0.11151079136690648, "grad_norm": 99.98342503248348, "learning_rate": 5e-06, "loss": 2.3711, "num_input_tokens_seen": 48206160, "step": 279 }, { "epoch": 0.11151079136690648, "loss": 2.391204357147217, "loss_ce": 0.9517512321472168, "loss_xval": 1.4375, "num_input_tokens_seen": 48206160, "step": 279 }, { "epoch": 0.11191047162270183, "grad_norm": 115.96544018388516, "learning_rate": 5e-06, "loss": 2.4155, "num_input_tokens_seen": 48379424, "step": 280 }, { "epoch": 0.11191047162270183, "loss": 2.3940885066986084, "loss_ce": 0.947799563407898, "loss_xval": 1.4453125, "num_input_tokens_seen": 48379424, "step": 280 }, { "epoch": 0.1123101518784972, "grad_norm": 104.52004608173208, "learning_rate": 5e-06, "loss": 2.3488, "num_input_tokens_seen": 48552112, "step": 281 }, { "epoch": 0.1123101518784972, "loss": 1.9197896718978882, "loss_ce": 0.9357808828353882, "loss_xval": 0.984375, "num_input_tokens_seen": 48552112, "step": 281 }, { "epoch": 0.11270983213429256, "grad_norm": 64.65365518524405, "learning_rate": 5e-06, "loss": 2.0232, "num_input_tokens_seen": 48724952, "step": 282 }, { "epoch": 0.11270983213429256, "loss": 2.230132818222046, "loss_ce": 0.9171445369720459, "loss_xval": 1.3125, "num_input_tokens_seen": 48724952, "step": 282 }, { "epoch": 0.11310951239008793, "grad_norm": 81.10524607428229, "learning_rate": 5e-06, "loss": 1.8553, "num_input_tokens_seen": 48897816, "step": 283 }, { "epoch": 0.11310951239008793, "loss": 1.7465626001358032, "loss_ce": 0.9167286157608032, "loss_xval": 0.828125, "num_input_tokens_seen": 48897816, "step": 283 }, { "epoch": 0.1135091926458833, "grad_norm": 88.08567804606139, "learning_rate": 5e-06, "loss": 2.2449, "num_input_tokens_seen": 49071056, "step": 284 }, { "epoch": 0.1135091926458833, "loss": 2.423125743865967, "loss_ce": 0.8782038688659668, "loss_xval": 1.546875, "num_input_tokens_seen": 49071056, "step": 284 }, { "epoch": 0.11390887290167866, "grad_norm": 56.69224113163489, "learning_rate": 5e-06, "loss": 2.3233, "num_input_tokens_seen": 49244160, "step": 285 }, { "epoch": 0.11390887290167866, "loss": 2.5701351165771484, "loss_ce": 0.8718929290771484, "loss_xval": 1.6953125, "num_input_tokens_seen": 49244160, "step": 285 }, { "epoch": 0.11430855315747403, "grad_norm": 117.84651651002588, "learning_rate": 5e-06, "loss": 2.0369, "num_input_tokens_seen": 49416784, "step": 286 }, { "epoch": 0.11430855315747403, "loss": 2.136676788330078, "loss_ce": 0.8520088195800781, "loss_xval": 1.28125, "num_input_tokens_seen": 49416784, "step": 286 }, { "epoch": 0.11470823341326938, "grad_norm": 81.01654553435571, "learning_rate": 5e-06, "loss": 2.112, "num_input_tokens_seen": 49589688, "step": 287 }, { "epoch": 0.11470823341326938, "loss": 1.9272973537445068, "loss_ce": 0.8110864162445068, "loss_xval": 1.1171875, "num_input_tokens_seen": 49589688, "step": 287 }, { "epoch": 0.11510791366906475, "grad_norm": 105.66559786842068, "learning_rate": 5e-06, "loss": 2.2775, "num_input_tokens_seen": 49762488, "step": 288 }, { "epoch": 0.11510791366906475, "loss": 2.5139994621276855, "loss_ce": 0.7979352474212646, "loss_xval": 1.71875, "num_input_tokens_seen": 49762488, "step": 288 }, { "epoch": 0.11550759392486011, "grad_norm": 97.41645579108356, "learning_rate": 5e-06, "loss": 2.2071, "num_input_tokens_seen": 49935664, "step": 289 }, { "epoch": 0.11550759392486011, "loss": 2.298358678817749, "loss_ce": 0.8047064542770386, "loss_xval": 1.4921875, "num_input_tokens_seen": 49935664, "step": 289 }, { "epoch": 0.11590727418065548, "grad_norm": 118.02443280432219, "learning_rate": 5e-06, "loss": 1.915, "num_input_tokens_seen": 50108424, "step": 290 }, { "epoch": 0.11590727418065548, "loss": 1.942828893661499, "loss_ce": 0.788532018661499, "loss_xval": 1.15625, "num_input_tokens_seen": 50108424, "step": 290 }, { "epoch": 0.11630695443645084, "grad_norm": 56.77553608993919, "learning_rate": 5e-06, "loss": 1.932, "num_input_tokens_seen": 50281560, "step": 291 }, { "epoch": 0.11630695443645084, "loss": 1.5962605476379395, "loss_ce": 0.7529988288879395, "loss_xval": 0.84375, "num_input_tokens_seen": 50281560, "step": 291 }, { "epoch": 0.11670663469224621, "grad_norm": 86.75564799614305, "learning_rate": 5e-06, "loss": 1.9127, "num_input_tokens_seen": 50454736, "step": 292 }, { "epoch": 0.11670663469224621, "loss": 1.680945634841919, "loss_ce": 0.752234697341919, "loss_xval": 0.9296875, "num_input_tokens_seen": 50454736, "step": 292 }, { "epoch": 0.11710631494804156, "grad_norm": 85.07023337957867, "learning_rate": 5e-06, "loss": 2.2476, "num_input_tokens_seen": 50627480, "step": 293 }, { "epoch": 0.11710631494804156, "loss": 2.6010489463806152, "loss_ce": 0.7497307062149048, "loss_xval": 1.8515625, "num_input_tokens_seen": 50627480, "step": 293 }, { "epoch": 0.11750599520383694, "grad_norm": 90.42009042380998, "learning_rate": 5e-06, "loss": 1.9658, "num_input_tokens_seen": 50800168, "step": 294 }, { "epoch": 0.11750599520383694, "loss": 1.597025752067566, "loss_ce": 0.7454632520675659, "loss_xval": 0.8515625, "num_input_tokens_seen": 50800168, "step": 294 }, { "epoch": 0.11790567545963229, "grad_norm": 82.83818839637908, "learning_rate": 5e-06, "loss": 1.7579, "num_input_tokens_seen": 50973280, "step": 295 }, { "epoch": 0.11790567545963229, "loss": 1.7075116634368896, "loss_ce": 0.7070235013961792, "loss_xval": 1.0, "num_input_tokens_seen": 50973280, "step": 295 }, { "epoch": 0.11830535571542766, "grad_norm": 171.28458309049162, "learning_rate": 5e-06, "loss": 1.87, "num_input_tokens_seen": 51146272, "step": 296 }, { "epoch": 0.11830535571542766, "loss": 1.3094793558120728, "loss_ce": 0.6739814281463623, "loss_xval": 0.63671875, "num_input_tokens_seen": 51146272, "step": 296 }, { "epoch": 0.11870503597122302, "grad_norm": 108.10320516499523, "learning_rate": 5e-06, "loss": 1.9295, "num_input_tokens_seen": 51319280, "step": 297 }, { "epoch": 0.11870503597122302, "loss": 1.6906158924102783, "loss_ce": 0.6525299549102783, "loss_xval": 1.0390625, "num_input_tokens_seen": 51319280, "step": 297 }, { "epoch": 0.11910471622701839, "grad_norm": 209.76838886070274, "learning_rate": 5e-06, "loss": 1.8786, "num_input_tokens_seen": 51492232, "step": 298 }, { "epoch": 0.11910471622701839, "loss": 1.683868646621704, "loss_ce": 0.6523745059967041, "loss_xval": 1.03125, "num_input_tokens_seen": 51492232, "step": 298 }, { "epoch": 0.11950439648281375, "grad_norm": 473.78852226640424, "learning_rate": 5e-06, "loss": 2.029, "num_input_tokens_seen": 51665016, "step": 299 }, { "epoch": 0.11950439648281375, "loss": 2.0970144271850586, "loss_ce": 0.5896900296211243, "loss_xval": 1.5078125, "num_input_tokens_seen": 51665016, "step": 299 }, { "epoch": 0.11990407673860912, "grad_norm": 358.2945807266524, "learning_rate": 5e-06, "loss": 2.0276, "num_input_tokens_seen": 51837904, "step": 300 }, { "epoch": 0.11990407673860912, "loss": 1.9197452068328857, "loss_ce": 0.6052920818328857, "loss_xval": 1.3125, "num_input_tokens_seen": 51837904, "step": 300 }, { "epoch": 0.12030375699440447, "grad_norm": 114.31359760931073, "learning_rate": 5e-06, "loss": 1.9761, "num_input_tokens_seen": 52011016, "step": 301 }, { "epoch": 0.12030375699440447, "loss": 2.001715660095215, "loss_ce": 0.6130439043045044, "loss_xval": 1.390625, "num_input_tokens_seen": 52011016, "step": 301 }, { "epoch": 0.12070343725019984, "grad_norm": 287.61305918926126, "learning_rate": 5e-06, "loss": 2.2437, "num_input_tokens_seen": 52183608, "step": 302 }, { "epoch": 0.12070343725019984, "loss": 2.397970676422119, "loss_ce": 0.5862032771110535, "loss_xval": 1.8125, "num_input_tokens_seen": 52183608, "step": 302 }, { "epoch": 0.1211031175059952, "grad_norm": 69.49781036705235, "learning_rate": 5e-06, "loss": 2.0359, "num_input_tokens_seen": 52356680, "step": 303 }, { "epoch": 0.1211031175059952, "loss": 2.0368640422821045, "loss_ce": 0.6423327922821045, "loss_xval": 1.390625, "num_input_tokens_seen": 52356680, "step": 303 }, { "epoch": 0.12150279776179057, "grad_norm": 249.25827639483862, "learning_rate": 5e-06, "loss": 1.9618, "num_input_tokens_seen": 52529992, "step": 304 }, { "epoch": 0.12150279776179057, "loss": 1.9398987293243408, "loss_ce": 0.6230041980743408, "loss_xval": 1.3203125, "num_input_tokens_seen": 52529992, "step": 304 }, { "epoch": 0.12190247801758593, "grad_norm": 93.74391986745098, "learning_rate": 5e-06, "loss": 1.6661, "num_input_tokens_seen": 52703208, "step": 305 }, { "epoch": 0.12190247801758593, "loss": 1.5010509490966797, "loss_ce": 0.6054210066795349, "loss_xval": 0.89453125, "num_input_tokens_seen": 52703208, "step": 305 }, { "epoch": 0.1223021582733813, "grad_norm": 170.71986052407345, "learning_rate": 5e-06, "loss": 1.752, "num_input_tokens_seen": 52875752, "step": 306 }, { "epoch": 0.1223021582733813, "loss": 1.5768327713012695, "loss_ce": 0.6339616775512695, "loss_xval": 0.94140625, "num_input_tokens_seen": 52875752, "step": 306 }, { "epoch": 0.12270183852917665, "grad_norm": 71.67655156139799, "learning_rate": 5e-06, "loss": 1.8598, "num_input_tokens_seen": 53048824, "step": 307 }, { "epoch": 0.12270183852917665, "loss": 1.722001552581787, "loss_ce": 0.6446090340614319, "loss_xval": 1.078125, "num_input_tokens_seen": 53048824, "step": 307 }, { "epoch": 0.12310151878497202, "grad_norm": 137.84224023687867, "learning_rate": 5e-06, "loss": 2.1068, "num_input_tokens_seen": 53221872, "step": 308 }, { "epoch": 0.12310151878497202, "loss": 2.6642374992370605, "loss_ce": 0.6627727746963501, "loss_xval": 2.0, "num_input_tokens_seen": 53221872, "step": 308 }, { "epoch": 0.12350119904076738, "grad_norm": 120.78443169609713, "learning_rate": 5e-06, "loss": 1.5313, "num_input_tokens_seen": 53394760, "step": 309 }, { "epoch": 0.12350119904076738, "loss": 1.376590609550476, "loss_ce": 0.5723915100097656, "loss_xval": 0.8046875, "num_input_tokens_seen": 53394760, "step": 309 }, { "epoch": 0.12390087929656275, "grad_norm": 149.46277474052187, "learning_rate": 5e-06, "loss": 1.587, "num_input_tokens_seen": 53568000, "step": 310 }, { "epoch": 0.12390087929656275, "loss": 1.5170848369598389, "loss_ce": 0.5693309903144836, "loss_xval": 0.94921875, "num_input_tokens_seen": 53568000, "step": 310 }, { "epoch": 0.12430055955235811, "grad_norm": 164.71563984387302, "learning_rate": 5e-06, "loss": 1.6372, "num_input_tokens_seen": 53740792, "step": 311 }, { "epoch": 0.12430055955235811, "loss": 1.6750078201293945, "loss_ce": 0.578328013420105, "loss_xval": 1.09375, "num_input_tokens_seen": 53740792, "step": 311 }, { "epoch": 0.12470023980815348, "grad_norm": 121.0108527291859, "learning_rate": 5e-06, "loss": 1.863, "num_input_tokens_seen": 53913616, "step": 312 }, { "epoch": 0.12470023980815348, "loss": 1.7013659477233887, "loss_ce": 0.5402331352233887, "loss_xval": 1.1640625, "num_input_tokens_seen": 53913616, "step": 312 }, { "epoch": 0.12509992006394885, "grad_norm": 202.9204004913828, "learning_rate": 5e-06, "loss": 1.7032, "num_input_tokens_seen": 54086312, "step": 313 }, { "epoch": 0.12509992006394885, "loss": 1.3899582624435425, "loss_ce": 0.49823465943336487, "loss_xval": 0.890625, "num_input_tokens_seen": 54086312, "step": 313 }, { "epoch": 0.1254996003197442, "grad_norm": 137.11461244391282, "learning_rate": 5e-06, "loss": 1.8513, "num_input_tokens_seen": 54259016, "step": 314 }, { "epoch": 0.1254996003197442, "loss": 1.7766376733779907, "loss_ce": 0.5368915796279907, "loss_xval": 1.2421875, "num_input_tokens_seen": 54259016, "step": 314 }, { "epoch": 0.12589928057553956, "grad_norm": 110.41993010150011, "learning_rate": 5e-06, "loss": 1.8425, "num_input_tokens_seen": 54432112, "step": 315 }, { "epoch": 0.12589928057553956, "loss": 1.9745423793792725, "loss_ce": 0.4867495894432068, "loss_xval": 1.484375, "num_input_tokens_seen": 54432112, "step": 315 }, { "epoch": 0.12629896083133493, "grad_norm": 75.64074697829528, "learning_rate": 5e-06, "loss": 1.8845, "num_input_tokens_seen": 54604632, "step": 316 }, { "epoch": 0.12629896083133493, "loss": 1.714593768119812, "loss_ce": 0.4909610152244568, "loss_xval": 1.2265625, "num_input_tokens_seen": 54604632, "step": 316 }, { "epoch": 0.1266986410871303, "grad_norm": 147.26310133007271, "learning_rate": 5e-06, "loss": 1.3572, "num_input_tokens_seen": 54777480, "step": 317 }, { "epoch": 0.1266986410871303, "loss": 1.3109509944915771, "loss_ce": 0.4650038480758667, "loss_xval": 0.84765625, "num_input_tokens_seen": 54777480, "step": 317 }, { "epoch": 0.12709832134292565, "grad_norm": 159.3543242719045, "learning_rate": 5e-06, "loss": 1.607, "num_input_tokens_seen": 54947464, "step": 318 }, { "epoch": 0.12709832134292565, "loss": 1.791764736175537, "loss_ce": 0.4758467674255371, "loss_xval": 1.3125, "num_input_tokens_seen": 54947464, "step": 318 }, { "epoch": 0.12749800159872102, "grad_norm": 53.98166505832464, "learning_rate": 5e-06, "loss": 1.5758, "num_input_tokens_seen": 55120368, "step": 319 }, { "epoch": 0.12749800159872102, "loss": 1.6858811378479004, "loss_ce": 0.45150619745254517, "loss_xval": 1.234375, "num_input_tokens_seen": 55120368, "step": 319 }, { "epoch": 0.1278976818545164, "grad_norm": 178.3612818823336, "learning_rate": 5e-06, "loss": 1.7526, "num_input_tokens_seen": 55293312, "step": 320 }, { "epoch": 0.1278976818545164, "loss": 2.1778581142425537, "loss_ce": 0.4437272548675537, "loss_xval": 1.734375, "num_input_tokens_seen": 55293312, "step": 320 }, { "epoch": 0.12829736211031176, "grad_norm": 111.81551605653638, "learning_rate": 5e-06, "loss": 1.8185, "num_input_tokens_seen": 55466064, "step": 321 }, { "epoch": 0.12829736211031176, "loss": 1.8632432222366333, "loss_ce": 0.4272081255912781, "loss_xval": 1.4375, "num_input_tokens_seen": 55466064, "step": 321 }, { "epoch": 0.1286970423661071, "grad_norm": 115.38757013402358, "learning_rate": 5e-06, "loss": 1.433, "num_input_tokens_seen": 55639048, "step": 322 }, { "epoch": 0.1286970423661071, "loss": 1.242377758026123, "loss_ce": 0.4625926613807678, "loss_xval": 0.78125, "num_input_tokens_seen": 55639048, "step": 322 }, { "epoch": 0.12909672262190247, "grad_norm": 162.44283787518899, "learning_rate": 5e-06, "loss": 1.3945, "num_input_tokens_seen": 55812136, "step": 323 }, { "epoch": 0.12909672262190247, "loss": 1.231302261352539, "loss_ce": 0.44077491760253906, "loss_xval": 0.7890625, "num_input_tokens_seen": 55812136, "step": 323 }, { "epoch": 0.12949640287769784, "grad_norm": 125.96863724523318, "learning_rate": 5e-06, "loss": 1.2947, "num_input_tokens_seen": 55984928, "step": 324 }, { "epoch": 0.12949640287769784, "loss": 1.1521010398864746, "loss_ce": 0.4384779930114746, "loss_xval": 0.71484375, "num_input_tokens_seen": 55984928, "step": 324 }, { "epoch": 0.1298960831334932, "grad_norm": 397.9317077984938, "learning_rate": 5e-06, "loss": 1.9657, "num_input_tokens_seen": 56157800, "step": 325 }, { "epoch": 0.1298960831334932, "loss": 1.3564872741699219, "loss_ce": 0.4021415710449219, "loss_xval": 0.953125, "num_input_tokens_seen": 56157800, "step": 325 }, { "epoch": 0.13029576338928858, "grad_norm": 249.17242046790918, "learning_rate": 5e-06, "loss": 1.8821, "num_input_tokens_seen": 56330728, "step": 326 }, { "epoch": 0.13029576338928858, "loss": 2.29518461227417, "loss_ce": 0.3835635483264923, "loss_xval": 1.9140625, "num_input_tokens_seen": 56330728, "step": 326 }, { "epoch": 0.13069544364508393, "grad_norm": 205.64286668663732, "learning_rate": 5e-06, "loss": 1.6595, "num_input_tokens_seen": 56503560, "step": 327 }, { "epoch": 0.13069544364508393, "loss": 2.169922351837158, "loss_ce": 0.3920902609825134, "loss_xval": 1.78125, "num_input_tokens_seen": 56503560, "step": 327 }, { "epoch": 0.1310951239008793, "grad_norm": 301.92393796778066, "learning_rate": 5e-06, "loss": 1.8751, "num_input_tokens_seen": 56676496, "step": 328 }, { "epoch": 0.1310951239008793, "loss": 1.8017125129699707, "loss_ce": 0.42768919467926025, "loss_xval": 1.375, "num_input_tokens_seen": 56676496, "step": 328 }, { "epoch": 0.13149480415667467, "grad_norm": 103.48048918281846, "learning_rate": 5e-06, "loss": 1.6365, "num_input_tokens_seen": 56849424, "step": 329 }, { "epoch": 0.13149480415667467, "loss": 1.7497020959854126, "loss_ce": 0.42157718539237976, "loss_xval": 1.328125, "num_input_tokens_seen": 56849424, "step": 329 }, { "epoch": 0.13189448441247004, "grad_norm": 171.99654308412548, "learning_rate": 5e-06, "loss": 1.9507, "num_input_tokens_seen": 57021856, "step": 330 }, { "epoch": 0.13189448441247004, "loss": 2.358372688293457, "loss_ce": 0.41257184743881226, "loss_xval": 1.9453125, "num_input_tokens_seen": 57021856, "step": 330 }, { "epoch": 0.13229416466826538, "grad_norm": 130.04743280621042, "learning_rate": 5e-06, "loss": 1.7586, "num_input_tokens_seen": 57194984, "step": 331 }, { "epoch": 0.13229416466826538, "loss": 1.5979559421539307, "loss_ce": 0.4109441637992859, "loss_xval": 1.1875, "num_input_tokens_seen": 57194984, "step": 331 }, { "epoch": 0.13269384492406075, "grad_norm": 95.61726348027165, "learning_rate": 5e-06, "loss": 1.8431, "num_input_tokens_seen": 57367960, "step": 332 }, { "epoch": 0.13269384492406075, "loss": 2.026491165161133, "loss_ce": 0.4075947403907776, "loss_xval": 1.6171875, "num_input_tokens_seen": 57367960, "step": 332 }, { "epoch": 0.13309352517985612, "grad_norm": 179.26524203097776, "learning_rate": 5e-06, "loss": 1.8041, "num_input_tokens_seen": 57540624, "step": 333 }, { "epoch": 0.13309352517985612, "loss": 1.6417033672332764, "loss_ce": 0.41160082817077637, "loss_xval": 1.2265625, "num_input_tokens_seen": 57540624, "step": 333 }, { "epoch": 0.1334932054356515, "grad_norm": 103.72564142699332, "learning_rate": 5e-06, "loss": 1.462, "num_input_tokens_seen": 57713368, "step": 334 }, { "epoch": 0.1334932054356515, "loss": 1.3674639463424683, "loss_ce": 0.37429988384246826, "loss_xval": 0.9921875, "num_input_tokens_seen": 57713368, "step": 334 }, { "epoch": 0.13389288569144683, "grad_norm": 226.10650455556296, "learning_rate": 5e-06, "loss": 1.5788, "num_input_tokens_seen": 57886048, "step": 335 }, { "epoch": 0.13389288569144683, "loss": 1.418921947479248, "loss_ce": 0.3932870626449585, "loss_xval": 1.0234375, "num_input_tokens_seen": 57886048, "step": 335 }, { "epoch": 0.1342925659472422, "grad_norm": 463.12575740719353, "learning_rate": 5e-06, "loss": 1.933, "num_input_tokens_seen": 58058832, "step": 336 }, { "epoch": 0.1342925659472422, "loss": 1.5288450717926025, "loss_ce": 0.3584350347518921, "loss_xval": 1.171875, "num_input_tokens_seen": 58058832, "step": 336 }, { "epoch": 0.13469224620303757, "grad_norm": 290.28182300021234, "learning_rate": 5e-06, "loss": 1.7178, "num_input_tokens_seen": 58231872, "step": 337 }, { "epoch": 0.13469224620303757, "loss": 1.5614473819732666, "loss_ce": 0.3695529103279114, "loss_xval": 1.1953125, "num_input_tokens_seen": 58231872, "step": 337 }, { "epoch": 0.13509192645883294, "grad_norm": 143.59463691949978, "learning_rate": 5e-06, "loss": 1.743, "num_input_tokens_seen": 58404472, "step": 338 }, { "epoch": 0.13509192645883294, "loss": 1.9069617986679077, "loss_ce": 0.38645392656326294, "loss_xval": 1.5234375, "num_input_tokens_seen": 58404472, "step": 338 }, { "epoch": 0.1354916067146283, "grad_norm": 345.1671201338629, "learning_rate": 5e-06, "loss": 1.9162, "num_input_tokens_seen": 58577064, "step": 339 }, { "epoch": 0.1354916067146283, "loss": 1.4432034492492676, "loss_ce": 0.3567776679992676, "loss_xval": 1.0859375, "num_input_tokens_seen": 58577064, "step": 339 }, { "epoch": 0.13589128697042366, "grad_norm": 139.11908462967634, "learning_rate": 5e-06, "loss": 1.5235, "num_input_tokens_seen": 58750152, "step": 340 }, { "epoch": 0.13589128697042366, "loss": 1.5724246501922607, "loss_ce": 0.3890751004219055, "loss_xval": 1.1796875, "num_input_tokens_seen": 58750152, "step": 340 }, { "epoch": 0.13629096722621903, "grad_norm": 254.4500953030448, "learning_rate": 5e-06, "loss": 1.4418, "num_input_tokens_seen": 58922968, "step": 341 }, { "epoch": 0.13629096722621903, "loss": 1.34968101978302, "loss_ce": 0.38410484790802, "loss_xval": 0.96484375, "num_input_tokens_seen": 58922968, "step": 341 }, { "epoch": 0.1366906474820144, "grad_norm": 100.59331456194423, "learning_rate": 5e-06, "loss": 1.3907, "num_input_tokens_seen": 59092360, "step": 342 }, { "epoch": 0.1366906474820144, "loss": 1.5428651571273804, "loss_ce": 0.3946716785430908, "loss_xval": 1.1484375, "num_input_tokens_seen": 59092360, "step": 342 }, { "epoch": 0.13709032773780974, "grad_norm": 171.42095258220323, "learning_rate": 5e-06, "loss": 1.9297, "num_input_tokens_seen": 59265264, "step": 343 }, { "epoch": 0.13709032773780974, "loss": 2.0073769092559814, "loss_ce": 0.3777381181716919, "loss_xval": 1.6328125, "num_input_tokens_seen": 59265264, "step": 343 }, { "epoch": 0.1374900079936051, "grad_norm": 182.6528152043041, "learning_rate": 5e-06, "loss": 1.6114, "num_input_tokens_seen": 59438328, "step": 344 }, { "epoch": 0.1374900079936051, "loss": 1.540045976638794, "loss_ce": 0.35742881894111633, "loss_xval": 1.1796875, "num_input_tokens_seen": 59438328, "step": 344 }, { "epoch": 0.13788968824940048, "grad_norm": 234.01657762932578, "learning_rate": 5e-06, "loss": 1.4521, "num_input_tokens_seen": 59611160, "step": 345 }, { "epoch": 0.13788968824940048, "loss": 1.1887354850769043, "loss_ce": 0.4055323600769043, "loss_xval": 0.78125, "num_input_tokens_seen": 59611160, "step": 345 }, { "epoch": 0.13828936850519585, "grad_norm": 135.3533298151207, "learning_rate": 5e-06, "loss": 1.5234, "num_input_tokens_seen": 59784352, "step": 346 }, { "epoch": 0.13828936850519585, "loss": 1.3803753852844238, "loss_ce": 0.4030805230140686, "loss_xval": 0.9765625, "num_input_tokens_seen": 59784352, "step": 346 }, { "epoch": 0.1386890487609912, "grad_norm": 186.93581432030666, "learning_rate": 5e-06, "loss": 1.5216, "num_input_tokens_seen": 59957304, "step": 347 }, { "epoch": 0.1386890487609912, "loss": 1.4391629695892334, "loss_ce": 0.37886011600494385, "loss_xval": 1.0625, "num_input_tokens_seen": 59957304, "step": 347 }, { "epoch": 0.13908872901678657, "grad_norm": 119.53453883928591, "learning_rate": 5e-06, "loss": 2.1058, "num_input_tokens_seen": 60130160, "step": 348 }, { "epoch": 0.13908872901678657, "loss": 2.4009146690368652, "loss_ce": 0.3730825185775757, "loss_xval": 2.03125, "num_input_tokens_seen": 60130160, "step": 348 }, { "epoch": 0.13948840927258194, "grad_norm": 82.97817397860176, "learning_rate": 5e-06, "loss": 1.5156, "num_input_tokens_seen": 60302752, "step": 349 }, { "epoch": 0.13948840927258194, "loss": 1.3989337682724, "loss_ce": 0.3671955168247223, "loss_xval": 1.03125, "num_input_tokens_seen": 60302752, "step": 349 }, { "epoch": 0.1398880895283773, "grad_norm": 178.42499123342392, "learning_rate": 5e-06, "loss": 1.2829, "num_input_tokens_seen": 60475776, "step": 350 }, { "epoch": 0.1398880895283773, "loss": 1.3339847326278687, "loss_ce": 0.38024938106536865, "loss_xval": 0.953125, "num_input_tokens_seen": 60475776, "step": 350 }, { "epoch": 0.14028776978417265, "grad_norm": 220.105087076434, "learning_rate": 5e-06, "loss": 1.8338, "num_input_tokens_seen": 60648632, "step": 351 }, { "epoch": 0.14028776978417265, "loss": 1.7666809558868408, "loss_ce": 0.39265748858451843, "loss_xval": 1.375, "num_input_tokens_seen": 60648632, "step": 351 }, { "epoch": 0.14068745003996802, "grad_norm": 117.69512479208115, "learning_rate": 5e-06, "loss": 1.5094, "num_input_tokens_seen": 60821424, "step": 352 }, { "epoch": 0.14068745003996802, "loss": 1.3380788564682007, "loss_ce": 0.3004812002182007, "loss_xval": 1.0390625, "num_input_tokens_seen": 60821424, "step": 352 }, { "epoch": 0.1410871302957634, "grad_norm": 54.734844905000614, "learning_rate": 5e-06, "loss": 1.5579, "num_input_tokens_seen": 60994352, "step": 353 }, { "epoch": 0.1410871302957634, "loss": 1.776560664176941, "loss_ce": 0.33783990144729614, "loss_xval": 1.4375, "num_input_tokens_seen": 60994352, "step": 353 }, { "epoch": 0.14148681055155876, "grad_norm": 71.4528673149511, "learning_rate": 5e-06, "loss": 1.9286, "num_input_tokens_seen": 61167184, "step": 354 }, { "epoch": 0.14148681055155876, "loss": 1.7447428703308105, "loss_ce": 0.3264079689979553, "loss_xval": 1.421875, "num_input_tokens_seen": 61167184, "step": 354 }, { "epoch": 0.1418864908073541, "grad_norm": 144.41906447638016, "learning_rate": 5e-06, "loss": 1.3293, "num_input_tokens_seen": 61340288, "step": 355 }, { "epoch": 0.1418864908073541, "loss": 1.5950350761413574, "loss_ce": 0.30743736028671265, "loss_xval": 1.2890625, "num_input_tokens_seen": 61340288, "step": 355 }, { "epoch": 0.14228617106314947, "grad_norm": 97.59406386460213, "learning_rate": 5e-06, "loss": 1.3858, "num_input_tokens_seen": 61513160, "step": 356 }, { "epoch": 0.14228617106314947, "loss": 1.4868381023406982, "loss_ce": 0.26979708671569824, "loss_xval": 1.21875, "num_input_tokens_seen": 61513160, "step": 356 }, { "epoch": 0.14268585131894485, "grad_norm": 113.54612045662563, "learning_rate": 5e-06, "loss": 1.3206, "num_input_tokens_seen": 61686344, "step": 357 }, { "epoch": 0.14268585131894485, "loss": 1.2290370464324951, "loss_ce": 0.24270889163017273, "loss_xval": 0.984375, "num_input_tokens_seen": 61686344, "step": 357 }, { "epoch": 0.14308553157474022, "grad_norm": 156.8539131457891, "learning_rate": 5e-06, "loss": 1.611, "num_input_tokens_seen": 61859496, "step": 358 }, { "epoch": 0.14308553157474022, "loss": 1.502457618713379, "loss_ce": 0.25953781604766846, "loss_xval": 1.2421875, "num_input_tokens_seen": 61859496, "step": 358 }, { "epoch": 0.14348521183053556, "grad_norm": 316.9389502143624, "learning_rate": 5e-06, "loss": 1.2872, "num_input_tokens_seen": 62032208, "step": 359 }, { "epoch": 0.14348521183053556, "loss": 1.4252452850341797, "loss_ce": 0.2567882835865021, "loss_xval": 1.171875, "num_input_tokens_seen": 62032208, "step": 359 }, { "epoch": 0.14388489208633093, "grad_norm": 271.7727423648551, "learning_rate": 5e-06, "loss": 1.571, "num_input_tokens_seen": 62205280, "step": 360 }, { "epoch": 0.14388489208633093, "loss": 1.4632078409194946, "loss_ce": 0.28254371881484985, "loss_xval": 1.1796875, "num_input_tokens_seen": 62205280, "step": 360 }, { "epoch": 0.1442845723421263, "grad_norm": 55.46079000927162, "learning_rate": 5e-06, "loss": 1.6113, "num_input_tokens_seen": 62378232, "step": 361 }, { "epoch": 0.1442845723421263, "loss": 1.6278996467590332, "loss_ce": 0.2782902717590332, "loss_xval": 1.3515625, "num_input_tokens_seen": 62378232, "step": 361 }, { "epoch": 0.14468425259792167, "grad_norm": 436.3956415187697, "learning_rate": 5e-06, "loss": 1.3398, "num_input_tokens_seen": 62550920, "step": 362 }, { "epoch": 0.14468425259792167, "loss": 1.3358556032180786, "loss_ce": 0.2723791003227234, "loss_xval": 1.0625, "num_input_tokens_seen": 62550920, "step": 362 }, { "epoch": 0.145083932853717, "grad_norm": 755.5556492215647, "learning_rate": 5e-06, "loss": 2.0904, "num_input_tokens_seen": 62724048, "step": 363 }, { "epoch": 0.145083932853717, "loss": 1.9951095581054688, "loss_ce": 0.29784390330314636, "loss_xval": 1.6953125, "num_input_tokens_seen": 62724048, "step": 363 }, { "epoch": 0.14548361310951238, "grad_norm": 458.5396858968191, "learning_rate": 5e-06, "loss": 2.0173, "num_input_tokens_seen": 62896912, "step": 364 }, { "epoch": 0.14548361310951238, "loss": 1.844632863998413, "loss_ce": 0.3143594264984131, "loss_xval": 1.53125, "num_input_tokens_seen": 62896912, "step": 364 }, { "epoch": 0.14588329336530775, "grad_norm": 273.6763198754432, "learning_rate": 5e-06, "loss": 1.6539, "num_input_tokens_seen": 63070104, "step": 365 }, { "epoch": 0.14588329336530775, "loss": 1.8728796243667603, "loss_ce": 0.32600462436676025, "loss_xval": 1.546875, "num_input_tokens_seen": 63070104, "step": 365 }, { "epoch": 0.14628297362110312, "grad_norm": 336.10897238023637, "learning_rate": 5e-06, "loss": 1.7954, "num_input_tokens_seen": 63243208, "step": 366 }, { "epoch": 0.14628297362110312, "loss": 1.7886399030685425, "loss_ce": 0.37140363454818726, "loss_xval": 1.4140625, "num_input_tokens_seen": 63243208, "step": 366 }, { "epoch": 0.1466826538768985, "grad_norm": 235.21217327142335, "learning_rate": 5e-06, "loss": 1.4071, "num_input_tokens_seen": 63415976, "step": 367 }, { "epoch": 0.1466826538768985, "loss": 1.2615071535110474, "loss_ce": 0.37381187081336975, "loss_xval": 0.88671875, "num_input_tokens_seen": 63415976, "step": 367 }, { "epoch": 0.14708233413269384, "grad_norm": 227.78162811961886, "learning_rate": 5e-06, "loss": 1.4289, "num_input_tokens_seen": 63589032, "step": 368 }, { "epoch": 0.14708233413269384, "loss": 1.3692877292633057, "loss_ce": 0.37600159645080566, "loss_xval": 0.9921875, "num_input_tokens_seen": 63589032, "step": 368 }, { "epoch": 0.1474820143884892, "grad_norm": 332.6665441626038, "learning_rate": 5e-06, "loss": 1.5625, "num_input_tokens_seen": 63762416, "step": 369 }, { "epoch": 0.1474820143884892, "loss": 1.526054859161377, "loss_ce": 0.3673633337020874, "loss_xval": 1.15625, "num_input_tokens_seen": 63762416, "step": 369 }, { "epoch": 0.14788169464428458, "grad_norm": 236.40131995240245, "learning_rate": 5e-06, "loss": 1.8429, "num_input_tokens_seen": 63935712, "step": 370 }, { "epoch": 0.14788169464428458, "loss": 1.8038097620010376, "loss_ce": 0.4258800745010376, "loss_xval": 1.375, "num_input_tokens_seen": 63935712, "step": 370 }, { "epoch": 0.14828137490007995, "grad_norm": 347.91614795997964, "learning_rate": 5e-06, "loss": 1.458, "num_input_tokens_seen": 64109008, "step": 371 }, { "epoch": 0.14828137490007995, "loss": 1.3971425294876099, "loss_ce": 0.383470743894577, "loss_xval": 1.015625, "num_input_tokens_seen": 64109008, "step": 371 }, { "epoch": 0.1486810551558753, "grad_norm": 97.92030501492046, "learning_rate": 5e-06, "loss": 1.1957, "num_input_tokens_seen": 64281584, "step": 372 }, { "epoch": 0.1486810551558753, "loss": 1.3989770412445068, "loss_ce": 0.36162352561950684, "loss_xval": 1.0390625, "num_input_tokens_seen": 64281584, "step": 372 }, { "epoch": 0.14908073541167066, "grad_norm": 432.055973955712, "learning_rate": 5e-06, "loss": 1.6878, "num_input_tokens_seen": 64454672, "step": 373 }, { "epoch": 0.14908073541167066, "loss": 1.6717700958251953, "loss_ce": 0.33412355184555054, "loss_xval": 1.3359375, "num_input_tokens_seen": 64454672, "step": 373 }, { "epoch": 0.14948041566746603, "grad_norm": 75.76218741454379, "learning_rate": 5e-06, "loss": 1.5385, "num_input_tokens_seen": 64627880, "step": 374 }, { "epoch": 0.14948041566746603, "loss": 1.3063770532608032, "loss_ce": 0.3713184893131256, "loss_xval": 0.93359375, "num_input_tokens_seen": 64627880, "step": 374 }, { "epoch": 0.1498800959232614, "grad_norm": 434.32708742379594, "learning_rate": 5e-06, "loss": 1.9206, "num_input_tokens_seen": 64800696, "step": 375 }, { "epoch": 0.1498800959232614, "loss": 1.9140050411224365, "loss_ce": 0.3580968379974365, "loss_xval": 1.5546875, "num_input_tokens_seen": 64800696, "step": 375 }, { "epoch": 0.15027977617905675, "grad_norm": 231.0941828632733, "learning_rate": 5e-06, "loss": 1.5119, "num_input_tokens_seen": 64973736, "step": 376 }, { "epoch": 0.15027977617905675, "loss": 1.2229743003845215, "loss_ce": 0.41560131311416626, "loss_xval": 0.80859375, "num_input_tokens_seen": 64973736, "step": 376 }, { "epoch": 0.15067945643485212, "grad_norm": 308.04336921786705, "learning_rate": 5e-06, "loss": 1.717, "num_input_tokens_seen": 65146824, "step": 377 }, { "epoch": 0.15067945643485212, "loss": 1.522589087486267, "loss_ce": 0.3834289610385895, "loss_xval": 1.140625, "num_input_tokens_seen": 65146824, "step": 377 }, { "epoch": 0.1510791366906475, "grad_norm": 379.91211838660945, "learning_rate": 5e-06, "loss": 1.7196, "num_input_tokens_seen": 65319824, "step": 378 }, { "epoch": 0.1510791366906475, "loss": 1.6595503091812134, "loss_ce": 0.37146443128585815, "loss_xval": 1.2890625, "num_input_tokens_seen": 65319824, "step": 378 }, { "epoch": 0.15147881694644286, "grad_norm": 188.19279362317252, "learning_rate": 5e-06, "loss": 1.5226, "num_input_tokens_seen": 65492824, "step": 379 }, { "epoch": 0.15147881694644286, "loss": 1.2681467533111572, "loss_ce": 0.4041330814361572, "loss_xval": 0.86328125, "num_input_tokens_seen": 65492824, "step": 379 }, { "epoch": 0.1518784972022382, "grad_norm": 332.3973897757976, "learning_rate": 5e-06, "loss": 1.5562, "num_input_tokens_seen": 65665920, "step": 380 }, { "epoch": 0.1518784972022382, "loss": 1.71194326877594, "loss_ce": 0.37405264377593994, "loss_xval": 1.3359375, "num_input_tokens_seen": 65665920, "step": 380 }, { "epoch": 0.15227817745803357, "grad_norm": 71.6692089078513, "learning_rate": 5e-06, "loss": 1.5298, "num_input_tokens_seen": 65839296, "step": 381 }, { "epoch": 0.15227817745803357, "loss": 1.185120701789856, "loss_ce": 0.37054553627967834, "loss_xval": 0.81640625, "num_input_tokens_seen": 65839296, "step": 381 }, { "epoch": 0.15267785771382894, "grad_norm": 291.72669644168303, "learning_rate": 5e-06, "loss": 1.6519, "num_input_tokens_seen": 66012896, "step": 382 }, { "epoch": 0.15267785771382894, "loss": 1.4946725368499756, "loss_ce": 0.3623483180999756, "loss_xval": 1.1328125, "num_input_tokens_seen": 66012896, "step": 382 }, { "epoch": 0.1530775379696243, "grad_norm": 124.58060402399016, "learning_rate": 5e-06, "loss": 1.6748, "num_input_tokens_seen": 66185720, "step": 383 }, { "epoch": 0.1530775379696243, "loss": 1.6065537929534912, "loss_ce": 0.3433701694011688, "loss_xval": 1.265625, "num_input_tokens_seen": 66185720, "step": 383 }, { "epoch": 0.15347721822541965, "grad_norm": 303.4023928485621, "learning_rate": 5e-06, "loss": 1.3126, "num_input_tokens_seen": 66358696, "step": 384 }, { "epoch": 0.15347721822541965, "loss": 1.6531447172164917, "loss_ce": 0.3508985936641693, "loss_xval": 1.3046875, "num_input_tokens_seen": 66358696, "step": 384 }, { "epoch": 0.15387689848121502, "grad_norm": 186.7444010598904, "learning_rate": 5e-06, "loss": 1.4072, "num_input_tokens_seen": 66531952, "step": 385 }, { "epoch": 0.15387689848121502, "loss": 1.579591989517212, "loss_ce": 0.35400599241256714, "loss_xval": 1.2265625, "num_input_tokens_seen": 66531952, "step": 385 }, { "epoch": 0.1542765787370104, "grad_norm": 123.72761695126445, "learning_rate": 5e-06, "loss": 1.1076, "num_input_tokens_seen": 66704848, "step": 386 }, { "epoch": 0.1542765787370104, "loss": 1.1779499053955078, "loss_ce": 0.3305378556251526, "loss_xval": 0.84765625, "num_input_tokens_seen": 66704848, "step": 386 }, { "epoch": 0.15467625899280577, "grad_norm": 300.4351423438375, "learning_rate": 5e-06, "loss": 1.5436, "num_input_tokens_seen": 66877552, "step": 387 }, { "epoch": 0.15467625899280577, "loss": 1.4668437242507935, "loss_ce": 0.30204877257347107, "loss_xval": 1.1640625, "num_input_tokens_seen": 66877552, "step": 387 }, { "epoch": 0.1550759392486011, "grad_norm": 97.96015654356518, "learning_rate": 5e-06, "loss": 1.3898, "num_input_tokens_seen": 67050520, "step": 388 }, { "epoch": 0.1550759392486011, "loss": 1.3291375637054443, "loss_ce": 0.28250670433044434, "loss_xval": 1.046875, "num_input_tokens_seen": 67050520, "step": 388 }, { "epoch": 0.15547561950439648, "grad_norm": 256.48054637973087, "learning_rate": 5e-06, "loss": 1.0719, "num_input_tokens_seen": 67223032, "step": 389 }, { "epoch": 0.15547561950439648, "loss": 1.2077308893203735, "loss_ce": 0.2790199816226959, "loss_xval": 0.9296875, "num_input_tokens_seen": 67223032, "step": 389 }, { "epoch": 0.15587529976019185, "grad_norm": 83.61487152773492, "learning_rate": 5e-06, "loss": 1.4863, "num_input_tokens_seen": 67396040, "step": 390 }, { "epoch": 0.15587529976019185, "loss": 1.168054461479187, "loss_ce": 0.236413836479187, "loss_xval": 0.9296875, "num_input_tokens_seen": 67396040, "step": 390 }, { "epoch": 0.15627498001598722, "grad_norm": 216.848192893872, "learning_rate": 5e-06, "loss": 1.5421, "num_input_tokens_seen": 67569016, "step": 391 }, { "epoch": 0.15627498001598722, "loss": 1.3344125747680664, "loss_ce": 0.2352915108203888, "loss_xval": 1.1015625, "num_input_tokens_seen": 67569016, "step": 391 }, { "epoch": 0.15667466027178256, "grad_norm": 100.72676823977093, "learning_rate": 5e-06, "loss": 1.2574, "num_input_tokens_seen": 67742032, "step": 392 }, { "epoch": 0.15667466027178256, "loss": 1.4167189598083496, "loss_ce": 0.22641140222549438, "loss_xval": 1.1875, "num_input_tokens_seen": 67742032, "step": 392 }, { "epoch": 0.15707434052757793, "grad_norm": 217.73273930965593, "learning_rate": 5e-06, "loss": 1.1396, "num_input_tokens_seen": 67914992, "step": 393 }, { "epoch": 0.15707434052757793, "loss": 1.1209442615509033, "loss_ce": 0.2234833538532257, "loss_xval": 0.8984375, "num_input_tokens_seen": 67914992, "step": 393 }, { "epoch": 0.1574740207833733, "grad_norm": 273.2904839153481, "learning_rate": 5e-06, "loss": 1.3022, "num_input_tokens_seen": 68087776, "step": 394 }, { "epoch": 0.1574740207833733, "loss": 1.3209784030914307, "loss_ce": 0.22258979082107544, "loss_xval": 1.1015625, "num_input_tokens_seen": 68087776, "step": 394 }, { "epoch": 0.15787370103916867, "grad_norm": 102.6734241796819, "learning_rate": 5e-06, "loss": 1.3072, "num_input_tokens_seen": 68260448, "step": 395 }, { "epoch": 0.15787370103916867, "loss": 1.1156089305877686, "loss_ce": 0.19959326088428497, "loss_xval": 0.9140625, "num_input_tokens_seen": 68260448, "step": 395 }, { "epoch": 0.15827338129496402, "grad_norm": 373.9212850213255, "learning_rate": 5e-06, "loss": 0.9984, "num_input_tokens_seen": 68433496, "step": 396 }, { "epoch": 0.15827338129496402, "loss": 0.958846926689148, "loss_ce": 0.17906175553798676, "loss_xval": 0.78125, "num_input_tokens_seen": 68433496, "step": 396 }, { "epoch": 0.1586730615507594, "grad_norm": 212.68846306498975, "learning_rate": 5e-06, "loss": 1.2799, "num_input_tokens_seen": 68606624, "step": 397 }, { "epoch": 0.1586730615507594, "loss": 1.6309540271759033, "loss_ce": 0.18429884314537048, "loss_xval": 1.4453125, "num_input_tokens_seen": 68606624, "step": 397 }, { "epoch": 0.15907274180655476, "grad_norm": 201.40167739549648, "learning_rate": 5e-06, "loss": 1.0227, "num_input_tokens_seen": 68779744, "step": 398 }, { "epoch": 0.15907274180655476, "loss": 1.1671316623687744, "loss_ce": 0.1776297688484192, "loss_xval": 0.98828125, "num_input_tokens_seen": 68779744, "step": 398 }, { "epoch": 0.15947242206235013, "grad_norm": 329.1756886542891, "learning_rate": 5e-06, "loss": 1.1226, "num_input_tokens_seen": 68949416, "step": 399 }, { "epoch": 0.15947242206235013, "loss": 0.8560934662818909, "loss_ce": 0.17335423827171326, "loss_xval": 0.68359375, "num_input_tokens_seen": 68949416, "step": 399 }, { "epoch": 0.15987210231814547, "grad_norm": 243.90129646768855, "learning_rate": 5e-06, "loss": 1.6687, "num_input_tokens_seen": 69122336, "step": 400 }, { "epoch": 0.15987210231814547, "loss": 1.3468685150146484, "loss_ce": 0.16937831044197083, "loss_xval": 1.1796875, "num_input_tokens_seen": 69122336, "step": 400 }, { "epoch": 0.16027178257394084, "grad_norm": 182.98535991940025, "learning_rate": 5e-06, "loss": 1.9013, "num_input_tokens_seen": 69295288, "step": 401 }, { "epoch": 0.16027178257394084, "loss": 2.1146087646484375, "loss_ce": 0.1907806098461151, "loss_xval": 1.921875, "num_input_tokens_seen": 69295288, "step": 401 }, { "epoch": 0.1606714628297362, "grad_norm": 155.15210915585405, "learning_rate": 5e-06, "loss": 1.5412, "num_input_tokens_seen": 69468232, "step": 402 }, { "epoch": 0.1606714628297362, "loss": 1.1275564432144165, "loss_ce": 0.18346473574638367, "loss_xval": 0.9453125, "num_input_tokens_seen": 69468232, "step": 402 }, { "epoch": 0.16107114308553158, "grad_norm": 89.57021927047049, "learning_rate": 5e-06, "loss": 1.2666, "num_input_tokens_seen": 69641208, "step": 403 }, { "epoch": 0.16107114308553158, "loss": 1.3142218589782715, "loss_ce": 0.18971017003059387, "loss_xval": 1.125, "num_input_tokens_seen": 69641208, "step": 403 }, { "epoch": 0.16147082334132695, "grad_norm": 84.82452132970737, "learning_rate": 5e-06, "loss": 1.0387, "num_input_tokens_seen": 69814296, "step": 404 }, { "epoch": 0.16147082334132695, "loss": 0.96608567237854, "loss_ce": 0.14345382153987885, "loss_xval": 0.82421875, "num_input_tokens_seen": 69814296, "step": 404 }, { "epoch": 0.1618705035971223, "grad_norm": 156.2159444055507, "learning_rate": 5e-06, "loss": 1.807, "num_input_tokens_seen": 69987120, "step": 405 }, { "epoch": 0.1618705035971223, "loss": 1.702022910118103, "loss_ce": 0.16247209906578064, "loss_xval": 1.5390625, "num_input_tokens_seen": 69987120, "step": 405 }, { "epoch": 0.16227018385291767, "grad_norm": 58.20249101157415, "learning_rate": 5e-06, "loss": 1.1604, "num_input_tokens_seen": 70160184, "step": 406 }, { "epoch": 0.16227018385291767, "loss": 1.17859947681427, "loss_ce": 0.1471053659915924, "loss_xval": 1.03125, "num_input_tokens_seen": 70160184, "step": 406 }, { "epoch": 0.16266986410871304, "grad_norm": 266.83889920116616, "learning_rate": 5e-06, "loss": 1.0537, "num_input_tokens_seen": 70333352, "step": 407 }, { "epoch": 0.16266986410871304, "loss": 1.2187542915344238, "loss_ce": 0.13623477518558502, "loss_xval": 1.0859375, "num_input_tokens_seen": 70333352, "step": 407 }, { "epoch": 0.1630695443645084, "grad_norm": 422.83042391271886, "learning_rate": 5e-06, "loss": 1.3912, "num_input_tokens_seen": 70506448, "step": 408 }, { "epoch": 0.1630695443645084, "loss": 1.4533321857452393, "loss_ce": 0.1320432424545288, "loss_xval": 1.3203125, "num_input_tokens_seen": 70506448, "step": 408 }, { "epoch": 0.16346922462030375, "grad_norm": 390.3090358433531, "learning_rate": 5e-06, "loss": 1.2469, "num_input_tokens_seen": 70679568, "step": 409 }, { "epoch": 0.16346922462030375, "loss": 1.5814406871795654, "loss_ce": 0.13808134198188782, "loss_xval": 1.4453125, "num_input_tokens_seen": 70679568, "step": 409 }, { "epoch": 0.16386890487609912, "grad_norm": 154.26033869033284, "learning_rate": 5e-06, "loss": 1.1171, "num_input_tokens_seen": 70852160, "step": 410 }, { "epoch": 0.16386890487609912, "loss": 1.4136357307434082, "loss_ce": 0.14141888916492462, "loss_xval": 1.2734375, "num_input_tokens_seen": 70852160, "step": 410 }, { "epoch": 0.1642685851318945, "grad_norm": 174.6829188801557, "learning_rate": 5e-06, "loss": 1.2339, "num_input_tokens_seen": 71024808, "step": 411 }, { "epoch": 0.1642685851318945, "loss": 0.867920994758606, "loss_ce": 0.12304795533418655, "loss_xval": 0.74609375, "num_input_tokens_seen": 71024808, "step": 411 }, { "epoch": 0.16466826538768986, "grad_norm": 319.3999891226021, "learning_rate": 5e-06, "loss": 1.2699, "num_input_tokens_seen": 71197624, "step": 412 }, { "epoch": 0.16466826538768986, "loss": 1.155133843421936, "loss_ce": 0.12876664102077484, "loss_xval": 1.0234375, "num_input_tokens_seen": 71197624, "step": 412 }, { "epoch": 0.1650679456434852, "grad_norm": 320.57417407860714, "learning_rate": 5e-06, "loss": 1.322, "num_input_tokens_seen": 71370280, "step": 413 }, { "epoch": 0.1650679456434852, "loss": 1.4060778617858887, "loss_ce": 0.13434943556785583, "loss_xval": 1.2734375, "num_input_tokens_seen": 71370280, "step": 413 }, { "epoch": 0.16546762589928057, "grad_norm": 178.26232739789174, "learning_rate": 5e-06, "loss": 1.2397, "num_input_tokens_seen": 71543160, "step": 414 }, { "epoch": 0.16546762589928057, "loss": 0.9019365310668945, "loss_ce": 0.12923146784305573, "loss_xval": 0.7734375, "num_input_tokens_seen": 71543160, "step": 414 }, { "epoch": 0.16586730615507594, "grad_norm": 114.95281581821858, "learning_rate": 5e-06, "loss": 1.1288, "num_input_tokens_seen": 71716120, "step": 415 }, { "epoch": 0.16586730615507594, "loss": 1.384334921836853, "loss_ce": 0.12664452195167542, "loss_xval": 1.2578125, "num_input_tokens_seen": 71716120, "step": 415 }, { "epoch": 0.16626698641087131, "grad_norm": 242.53801499071906, "learning_rate": 5e-06, "loss": 1.287, "num_input_tokens_seen": 71888592, "step": 416 }, { "epoch": 0.16626698641087131, "loss": 1.0861682891845703, "loss_ce": 0.14195440709590912, "loss_xval": 0.9453125, "num_input_tokens_seen": 71888592, "step": 416 }, { "epoch": 0.16666666666666666, "grad_norm": 305.69045590617145, "learning_rate": 5e-06, "loss": 1.0574, "num_input_tokens_seen": 72061352, "step": 417 }, { "epoch": 0.16666666666666666, "loss": 0.9189929962158203, "loss_ce": 0.1511707752943039, "loss_xval": 0.76953125, "num_input_tokens_seen": 72061352, "step": 417 }, { "epoch": 0.16706634692246203, "grad_norm": 232.45571623460697, "learning_rate": 5e-06, "loss": 1.4015, "num_input_tokens_seen": 72234232, "step": 418 }, { "epoch": 0.16706634692246203, "loss": 0.8937399387359619, "loss_ce": 0.15826627612113953, "loss_xval": 0.734375, "num_input_tokens_seen": 72234232, "step": 418 }, { "epoch": 0.1674660271782574, "grad_norm": 104.88982706534415, "learning_rate": 5e-06, "loss": 1.3726, "num_input_tokens_seen": 72406776, "step": 419 }, { "epoch": 0.1674660271782574, "loss": 1.644016981124878, "loss_ce": 0.1425521820783615, "loss_xval": 1.5, "num_input_tokens_seen": 72406776, "step": 419 }, { "epoch": 0.16786570743405277, "grad_norm": 132.05241726760326, "learning_rate": 5e-06, "loss": 1.5023, "num_input_tokens_seen": 72579768, "step": 420 }, { "epoch": 0.16786570743405277, "loss": 1.6695343255996704, "loss_ce": 0.14804987609386444, "loss_xval": 1.5234375, "num_input_tokens_seen": 72579768, "step": 420 }, { "epoch": 0.1682653876898481, "grad_norm": 360.7322762424225, "learning_rate": 5e-06, "loss": 1.3444, "num_input_tokens_seen": 72752160, "step": 421 }, { "epoch": 0.1682653876898481, "loss": 1.3023874759674072, "loss_ce": 0.138325035572052, "loss_xval": 1.1640625, "num_input_tokens_seen": 72752160, "step": 421 }, { "epoch": 0.16866506794564348, "grad_norm": 534.9608711756771, "learning_rate": 5e-06, "loss": 1.3106, "num_input_tokens_seen": 72925176, "step": 422 }, { "epoch": 0.16866506794564348, "loss": 1.265779733657837, "loss_ce": 0.10196132957935333, "loss_xval": 1.1640625, "num_input_tokens_seen": 72925176, "step": 422 }, { "epoch": 0.16906474820143885, "grad_norm": 462.6432650379374, "learning_rate": 5e-06, "loss": 1.3916, "num_input_tokens_seen": 73098032, "step": 423 }, { "epoch": 0.16906474820143885, "loss": 1.118710994720459, "loss_ce": 0.11578124761581421, "loss_xval": 1.0, "num_input_tokens_seen": 73098032, "step": 423 }, { "epoch": 0.16946442845723422, "grad_norm": 101.64068942565726, "learning_rate": 5e-06, "loss": 1.0825, "num_input_tokens_seen": 73270832, "step": 424 }, { "epoch": 0.16946442845723422, "loss": 0.9654685854911804, "loss_ce": 0.13453596830368042, "loss_xval": 0.83203125, "num_input_tokens_seen": 73270832, "step": 424 }, { "epoch": 0.16986410871302957, "grad_norm": 201.11879256749876, "learning_rate": 5e-06, "loss": 1.1616, "num_input_tokens_seen": 73443768, "step": 425 }, { "epoch": 0.16986410871302957, "loss": 1.3148138523101807, "loss_ce": 0.17748481035232544, "loss_xval": 1.140625, "num_input_tokens_seen": 73443768, "step": 425 }, { "epoch": 0.17026378896882494, "grad_norm": 69.29811911831622, "learning_rate": 5e-06, "loss": 1.0085, "num_input_tokens_seen": 73616680, "step": 426 }, { "epoch": 0.17026378896882494, "loss": 1.0997118949890137, "loss_ce": 0.2227586954832077, "loss_xval": 0.875, "num_input_tokens_seen": 73616680, "step": 426 }, { "epoch": 0.1706634692246203, "grad_norm": 160.2507375587037, "learning_rate": 5e-06, "loss": 1.5514, "num_input_tokens_seen": 73789392, "step": 427 }, { "epoch": 0.1706634692246203, "loss": 2.2413222789764404, "loss_ce": 0.21300186216831207, "loss_xval": 2.03125, "num_input_tokens_seen": 73789392, "step": 427 }, { "epoch": 0.17106314948041568, "grad_norm": 58.93629582053553, "learning_rate": 5e-06, "loss": 1.3997, "num_input_tokens_seen": 73962416, "step": 428 }, { "epoch": 0.17106314948041568, "loss": 1.1142562627792358, "loss_ce": 0.16332849860191345, "loss_xval": 0.94921875, "num_input_tokens_seen": 73962416, "step": 428 }, { "epoch": 0.17146282973621102, "grad_norm": 136.2720034068254, "learning_rate": 5e-06, "loss": 1.2624, "num_input_tokens_seen": 74135600, "step": 429 }, { "epoch": 0.17146282973621102, "loss": 1.42500901222229, "loss_ce": 0.1691497564315796, "loss_xval": 1.2578125, "num_input_tokens_seen": 74135600, "step": 429 }, { "epoch": 0.1718625099920064, "grad_norm": 148.31464135041392, "learning_rate": 5e-06, "loss": 1.6033, "num_input_tokens_seen": 74308904, "step": 430 }, { "epoch": 0.1718625099920064, "loss": 1.8530278205871582, "loss_ce": 0.15136760473251343, "loss_xval": 1.703125, "num_input_tokens_seen": 74308904, "step": 430 }, { "epoch": 0.17226219024780176, "grad_norm": 90.69042515021651, "learning_rate": 5e-06, "loss": 1.0973, "num_input_tokens_seen": 74482056, "step": 431 }, { "epoch": 0.17226219024780176, "loss": 1.100707769393921, "loss_ce": 0.16174298524856567, "loss_xval": 0.9375, "num_input_tokens_seen": 74482056, "step": 431 }, { "epoch": 0.17266187050359713, "grad_norm": 76.132967101736, "learning_rate": 5e-06, "loss": 1.0927, "num_input_tokens_seen": 74655264, "step": 432 }, { "epoch": 0.17266187050359713, "loss": 0.9883812069892883, "loss_ce": 0.13974839448928833, "loss_xval": 0.84765625, "num_input_tokens_seen": 74655264, "step": 432 }, { "epoch": 0.17306155075939247, "grad_norm": 64.87876348409729, "learning_rate": 5e-06, "loss": 1.1213, "num_input_tokens_seen": 74828240, "step": 433 }, { "epoch": 0.17306155075939247, "loss": 1.2422434091567993, "loss_ce": 0.1206614226102829, "loss_xval": 1.125, "num_input_tokens_seen": 74828240, "step": 433 }, { "epoch": 0.17346123101518784, "grad_norm": 59.25309660695592, "learning_rate": 5e-06, "loss": 1.0091, "num_input_tokens_seen": 75001712, "step": 434 }, { "epoch": 0.17346123101518784, "loss": 0.9062660336494446, "loss_ce": 0.11586074531078339, "loss_xval": 0.7890625, "num_input_tokens_seen": 75001712, "step": 434 }, { "epoch": 0.17386091127098321, "grad_norm": 80.00118800339006, "learning_rate": 5e-06, "loss": 1.2468, "num_input_tokens_seen": 75174536, "step": 435 }, { "epoch": 0.17386091127098321, "loss": 1.3547441959381104, "loss_ce": 0.09778615832328796, "loss_xval": 1.2578125, "num_input_tokens_seen": 75174536, "step": 435 }, { "epoch": 0.17426059152677859, "grad_norm": 95.79254596327765, "learning_rate": 5e-06, "loss": 0.7757, "num_input_tokens_seen": 75347544, "step": 436 }, { "epoch": 0.17426059152677859, "loss": 0.7775790095329285, "loss_ce": 0.08666104078292847, "loss_xval": 0.69140625, "num_input_tokens_seen": 75347544, "step": 436 }, { "epoch": 0.17466027178257393, "grad_norm": 232.35516007250703, "learning_rate": 5e-06, "loss": 1.3154, "num_input_tokens_seen": 75520632, "step": 437 }, { "epoch": 0.17466027178257393, "loss": 1.4089746475219727, "loss_ce": 0.07499027997255325, "loss_xval": 1.3359375, "num_input_tokens_seen": 75520632, "step": 437 }, { "epoch": 0.1750599520383693, "grad_norm": 519.4242646802579, "learning_rate": 5e-06, "loss": 1.8545, "num_input_tokens_seen": 75693864, "step": 438 }, { "epoch": 0.1750599520383693, "loss": 1.427260160446167, "loss_ce": 0.06446726620197296, "loss_xval": 1.359375, "num_input_tokens_seen": 75693864, "step": 438 }, { "epoch": 0.17545963229416467, "grad_norm": 988.9550741607192, "learning_rate": 5e-06, "loss": 2.3758, "num_input_tokens_seen": 75866888, "step": 439 }, { "epoch": 0.17545963229416467, "loss": 2.490389347076416, "loss_ce": 0.08120955526828766, "loss_xval": 2.40625, "num_input_tokens_seen": 75866888, "step": 439 }, { "epoch": 0.17585931254996004, "grad_norm": 1106.2099257737912, "learning_rate": 5e-06, "loss": 2.9193, "num_input_tokens_seen": 76039728, "step": 440 }, { "epoch": 0.17585931254996004, "loss": 2.894068717956543, "loss_ce": 0.11086547374725342, "loss_xval": 2.78125, "num_input_tokens_seen": 76039728, "step": 440 }, { "epoch": 0.17625899280575538, "grad_norm": 105.21406614004921, "learning_rate": 5e-06, "loss": 1.5263, "num_input_tokens_seen": 76212632, "step": 441 }, { "epoch": 0.17625899280575538, "loss": 1.382925271987915, "loss_ce": 0.13243699073791504, "loss_xval": 1.25, "num_input_tokens_seen": 76212632, "step": 441 }, { "epoch": 0.17665867306155075, "grad_norm": 538.4292674538996, "learning_rate": 5e-06, "loss": 1.7185, "num_input_tokens_seen": 76385848, "step": 442 }, { "epoch": 0.17665867306155075, "loss": 1.966761589050293, "loss_ce": 0.19283580780029297, "loss_xval": 1.7734375, "num_input_tokens_seen": 76385848, "step": 442 }, { "epoch": 0.17705835331734612, "grad_norm": 254.3195195821683, "learning_rate": 5e-06, "loss": 1.2737, "num_input_tokens_seen": 76558752, "step": 443 }, { "epoch": 0.17705835331734612, "loss": 1.3162972927093506, "loss_ce": 0.23280119895935059, "loss_xval": 1.0859375, "num_input_tokens_seen": 76558752, "step": 443 }, { "epoch": 0.1774580335731415, "grad_norm": 522.1919334034071, "learning_rate": 5e-06, "loss": 1.7626, "num_input_tokens_seen": 76731944, "step": 444 }, { "epoch": 0.1774580335731415, "loss": 1.9908093214035034, "loss_ce": 0.27547723054885864, "loss_xval": 1.71875, "num_input_tokens_seen": 76731944, "step": 444 }, { "epoch": 0.17785771382893686, "grad_norm": 216.82852013498507, "learning_rate": 5e-06, "loss": 1.7808, "num_input_tokens_seen": 76904696, "step": 445 }, { "epoch": 0.17785771382893686, "loss": 1.9551982879638672, "loss_ce": 0.29369932413101196, "loss_xval": 1.6640625, "num_input_tokens_seen": 76904696, "step": 445 }, { "epoch": 0.1782573940847322, "grad_norm": 281.8149994961891, "learning_rate": 5e-06, "loss": 1.3799, "num_input_tokens_seen": 77077232, "step": 446 }, { "epoch": 0.1782573940847322, "loss": 1.4842294454574585, "loss_ce": 0.2862313687801361, "loss_xval": 1.1953125, "num_input_tokens_seen": 77077232, "step": 446 }, { "epoch": 0.17865707434052758, "grad_norm": 315.72186191273005, "learning_rate": 5e-06, "loss": 1.7867, "num_input_tokens_seen": 77249936, "step": 447 }, { "epoch": 0.17865707434052758, "loss": 1.7437907457351685, "loss_ce": 0.30873218178749084, "loss_xval": 1.4375, "num_input_tokens_seen": 77249936, "step": 447 }, { "epoch": 0.17905675459632295, "grad_norm": 59.192440355888245, "learning_rate": 5e-06, "loss": 1.3851, "num_input_tokens_seen": 77423072, "step": 448 }, { "epoch": 0.17905675459632295, "loss": 1.6609851121902466, "loss_ce": 0.2824450731277466, "loss_xval": 1.375, "num_input_tokens_seen": 77423072, "step": 448 }, { "epoch": 0.17945643485211832, "grad_norm": 299.73718753712205, "learning_rate": 5e-06, "loss": 1.5403, "num_input_tokens_seen": 77595776, "step": 449 }, { "epoch": 0.17945643485211832, "loss": 1.7367419004440308, "loss_ce": 0.26506221294403076, "loss_xval": 1.46875, "num_input_tokens_seen": 77595776, "step": 449 }, { "epoch": 0.17985611510791366, "grad_norm": 276.48273158856335, "learning_rate": 5e-06, "loss": 1.5309, "num_input_tokens_seen": 77769064, "step": 450 }, { "epoch": 0.17985611510791366, "loss": 1.32578706741333, "loss_ce": 0.2933163046836853, "loss_xval": 1.03125, "num_input_tokens_seen": 77769064, "step": 450 }, { "epoch": 0.18025579536370903, "grad_norm": 160.94325945974563, "learning_rate": 5e-06, "loss": 1.4696, "num_input_tokens_seen": 77941696, "step": 451 }, { "epoch": 0.18025579536370903, "loss": 1.3891505002975464, "loss_ce": 0.3073633909225464, "loss_xval": 1.078125, "num_input_tokens_seen": 77941696, "step": 451 }, { "epoch": 0.1806554756195044, "grad_norm": 312.55926221947135, "learning_rate": 5e-06, "loss": 1.3929, "num_input_tokens_seen": 78114856, "step": 452 }, { "epoch": 0.1806554756195044, "loss": 1.0724852085113525, "loss_ce": 0.2741453945636749, "loss_xval": 0.796875, "num_input_tokens_seen": 78114856, "step": 452 }, { "epoch": 0.18105515587529977, "grad_norm": 99.87397360747839, "learning_rate": 5e-06, "loss": 1.5163, "num_input_tokens_seen": 78287864, "step": 453 }, { "epoch": 0.18105515587529977, "loss": 1.1197184324264526, "loss_ce": 0.28866374492645264, "loss_xval": 0.83203125, "num_input_tokens_seen": 78287864, "step": 453 }, { "epoch": 0.18145483613109512, "grad_norm": 274.6263447275612, "learning_rate": 5e-06, "loss": 1.2871, "num_input_tokens_seen": 78460704, "step": 454 }, { "epoch": 0.18145483613109512, "loss": 1.456176519393921, "loss_ce": 0.2547605335712433, "loss_xval": 1.203125, "num_input_tokens_seen": 78460704, "step": 454 }, { "epoch": 0.18185451638689049, "grad_norm": 199.6778687760442, "learning_rate": 5e-06, "loss": 1.2305, "num_input_tokens_seen": 78633736, "step": 455 }, { "epoch": 0.18185451638689049, "loss": 1.268936276435852, "loss_ce": 0.2605133652687073, "loss_xval": 1.0078125, "num_input_tokens_seen": 78633736, "step": 455 }, { "epoch": 0.18225419664268586, "grad_norm": 84.40253001428162, "learning_rate": 5e-06, "loss": 1.0979, "num_input_tokens_seen": 78806336, "step": 456 }, { "epoch": 0.18225419664268586, "loss": 0.8699742555618286, "loss_ce": 0.237527996301651, "loss_xval": 0.6328125, "num_input_tokens_seen": 78806336, "step": 456 }, { "epoch": 0.18265387689848123, "grad_norm": 285.993440607779, "learning_rate": 5e-06, "loss": 1.8171, "num_input_tokens_seen": 78979056, "step": 457 }, { "epoch": 0.18265387689848123, "loss": 1.7808828353881836, "loss_ce": 0.22766010463237762, "loss_xval": 1.5546875, "num_input_tokens_seen": 78979056, "step": 457 }, { "epoch": 0.18305355715427657, "grad_norm": 90.58562385013235, "learning_rate": 5e-06, "loss": 0.5619, "num_input_tokens_seen": 79148520, "step": 458 }, { "epoch": 0.18305355715427657, "loss": 0.5432583093643188, "loss_ce": 0.22099265456199646, "loss_xval": 0.322265625, "num_input_tokens_seen": 79148520, "step": 458 }, { "epoch": 0.18345323741007194, "grad_norm": 215.2025403833186, "learning_rate": 5e-06, "loss": 1.2952, "num_input_tokens_seen": 79321256, "step": 459 }, { "epoch": 0.18345323741007194, "loss": 1.2274497747421265, "loss_ce": 0.23355332016944885, "loss_xval": 0.9921875, "num_input_tokens_seen": 79321256, "step": 459 }, { "epoch": 0.1838529176658673, "grad_norm": 143.1375353333749, "learning_rate": 5e-06, "loss": 1.2422, "num_input_tokens_seen": 79494168, "step": 460 }, { "epoch": 0.1838529176658673, "loss": 1.2271267175674438, "loss_ce": 0.20149190723896027, "loss_xval": 1.0234375, "num_input_tokens_seen": 79494168, "step": 460 }, { "epoch": 0.18425259792166268, "grad_norm": 108.0060854037686, "learning_rate": 5e-06, "loss": 1.5776, "num_input_tokens_seen": 79663920, "step": 461 }, { "epoch": 0.18425259792166268, "loss": 1.7579941749572754, "loss_ce": 0.1915878802537918, "loss_xval": 1.5625, "num_input_tokens_seen": 79663920, "step": 461 }, { "epoch": 0.18465227817745802, "grad_norm": 260.0743652424003, "learning_rate": 5e-06, "loss": 1.6427, "num_input_tokens_seen": 79836992, "step": 462 }, { "epoch": 0.18465227817745802, "loss": 1.4615955352783203, "loss_ce": 0.1910877823829651, "loss_xval": 1.2734375, "num_input_tokens_seen": 79836992, "step": 462 }, { "epoch": 0.1850519584332534, "grad_norm": 112.36519545386706, "learning_rate": 5e-06, "loss": 1.4147, "num_input_tokens_seen": 80009824, "step": 463 }, { "epoch": 0.1850519584332534, "loss": 1.2974412441253662, "loss_ce": 0.1806199550628662, "loss_xval": 1.1171875, "num_input_tokens_seen": 80009824, "step": 463 }, { "epoch": 0.18545163868904876, "grad_norm": 298.49579765479467, "learning_rate": 5e-06, "loss": 1.1115, "num_input_tokens_seen": 80182856, "step": 464 }, { "epoch": 0.18545163868904876, "loss": 1.4449951648712158, "loss_ce": 0.1569093018770218, "loss_xval": 1.2890625, "num_input_tokens_seen": 80182856, "step": 464 }, { "epoch": 0.18585131894484413, "grad_norm": 95.35891374971652, "learning_rate": 5e-06, "loss": 1.072, "num_input_tokens_seen": 80355360, "step": 465 }, { "epoch": 0.18585131894484413, "loss": 1.2684335708618164, "loss_ce": 0.13220316171646118, "loss_xval": 1.1328125, "num_input_tokens_seen": 80355360, "step": 465 }, { "epoch": 0.18625099920063948, "grad_norm": 172.78392931286916, "learning_rate": 5e-06, "loss": 1.2831, "num_input_tokens_seen": 80528248, "step": 466 }, { "epoch": 0.18625099920063948, "loss": 1.3320605754852295, "loss_ce": 0.1433398425579071, "loss_xval": 1.1875, "num_input_tokens_seen": 80528248, "step": 466 }, { "epoch": 0.18665067945643485, "grad_norm": 51.21602711363438, "learning_rate": 5e-06, "loss": 1.2468, "num_input_tokens_seen": 80700880, "step": 467 }, { "epoch": 0.18665067945643485, "loss": 1.1996004581451416, "loss_ce": 0.13368244469165802, "loss_xval": 1.0625, "num_input_tokens_seen": 80700880, "step": 467 }, { "epoch": 0.18705035971223022, "grad_norm": 162.93505295456467, "learning_rate": 5e-06, "loss": 0.9464, "num_input_tokens_seen": 80873696, "step": 468 }, { "epoch": 0.18705035971223022, "loss": 0.9175683259963989, "loss_ce": 0.10885241627693176, "loss_xval": 0.80859375, "num_input_tokens_seen": 80873696, "step": 468 }, { "epoch": 0.1874500399680256, "grad_norm": 87.58305241565736, "learning_rate": 5e-06, "loss": 1.1474, "num_input_tokens_seen": 81046288, "step": 469 }, { "epoch": 0.1874500399680256, "loss": 1.0319910049438477, "loss_ce": 0.09424698352813721, "loss_xval": 0.9375, "num_input_tokens_seen": 81046288, "step": 469 }, { "epoch": 0.18784972022382093, "grad_norm": 158.44691760619935, "learning_rate": 5e-06, "loss": 1.5434, "num_input_tokens_seen": 81219400, "step": 470 }, { "epoch": 0.18784972022382093, "loss": 1.065626621246338, "loss_ce": 0.09528970718383789, "loss_xval": 0.96875, "num_input_tokens_seen": 81219400, "step": 470 }, { "epoch": 0.1882494004796163, "grad_norm": 87.4090982666857, "learning_rate": 5e-06, "loss": 0.9658, "num_input_tokens_seen": 81392432, "step": 471 }, { "epoch": 0.1882494004796163, "loss": 0.8852615356445312, "loss_ce": 0.09217070043087006, "loss_xval": 0.79296875, "num_input_tokens_seen": 81392432, "step": 471 }, { "epoch": 0.18864908073541167, "grad_norm": 122.54682886858762, "learning_rate": 5e-06, "loss": 1.2232, "num_input_tokens_seen": 81565384, "step": 472 }, { "epoch": 0.18864908073541167, "loss": 1.1961512565612793, "loss_ce": 0.08006230741739273, "loss_xval": 1.1171875, "num_input_tokens_seen": 81565384, "step": 472 }, { "epoch": 0.18904876099120704, "grad_norm": 84.30592183743661, "learning_rate": 5e-06, "loss": 0.8992, "num_input_tokens_seen": 81738112, "step": 473 }, { "epoch": 0.18904876099120704, "loss": 1.047995924949646, "loss_ce": 0.06899204850196838, "loss_xval": 0.98046875, "num_input_tokens_seen": 81738112, "step": 473 }, { "epoch": 0.18944844124700239, "grad_norm": 53.15779481734767, "learning_rate": 5e-06, "loss": 0.8962, "num_input_tokens_seen": 81911520, "step": 474 }, { "epoch": 0.18944844124700239, "loss": 0.8450521230697632, "loss_ce": 0.06404630839824677, "loss_xval": 0.78125, "num_input_tokens_seen": 81911520, "step": 474 }, { "epoch": 0.18984812150279776, "grad_norm": 53.57707087954573, "learning_rate": 5e-06, "loss": 0.9375, "num_input_tokens_seen": 82084208, "step": 475 }, { "epoch": 0.18984812150279776, "loss": 0.9071247577667236, "loss_ce": 0.06813547015190125, "loss_xval": 0.83984375, "num_input_tokens_seen": 82084208, "step": 475 }, { "epoch": 0.19024780175859313, "grad_norm": 99.14610783345852, "learning_rate": 5e-06, "loss": 1.3425, "num_input_tokens_seen": 82257544, "step": 476 }, { "epoch": 0.19024780175859313, "loss": 1.4562163352966309, "loss_ce": 0.05588666349649429, "loss_xval": 1.3984375, "num_input_tokens_seen": 82257544, "step": 476 }, { "epoch": 0.1906474820143885, "grad_norm": 51.46105686314192, "learning_rate": 5e-06, "loss": 0.8211, "num_input_tokens_seen": 82430560, "step": 477 }, { "epoch": 0.1906474820143885, "loss": 0.915199339389801, "loss_ce": 0.049842871725559235, "loss_xval": 0.8671875, "num_input_tokens_seen": 82430560, "step": 477 }, { "epoch": 0.19104716227018384, "grad_norm": 91.19133630030953, "learning_rate": 5e-06, "loss": 0.6196, "num_input_tokens_seen": 82603408, "step": 478 }, { "epoch": 0.19104716227018384, "loss": 0.725679337978363, "loss_ce": 0.04806704819202423, "loss_xval": 0.67578125, "num_input_tokens_seen": 82603408, "step": 478 }, { "epoch": 0.1914468425259792, "grad_norm": 112.31624471773934, "learning_rate": 5e-06, "loss": 1.3017, "num_input_tokens_seen": 82776032, "step": 479 }, { "epoch": 0.1914468425259792, "loss": 1.3830339908599854, "loss_ce": 0.04367845505475998, "loss_xval": 1.3359375, "num_input_tokens_seen": 82776032, "step": 479 }, { "epoch": 0.19184652278177458, "grad_norm": 53.33397350743472, "learning_rate": 5e-06, "loss": 1.2791, "num_input_tokens_seen": 82948864, "step": 480 }, { "epoch": 0.19184652278177458, "loss": 1.2594175338745117, "loss_ce": 0.044329725205898285, "loss_xval": 1.21875, "num_input_tokens_seen": 82948864, "step": 480 }, { "epoch": 0.19224620303756995, "grad_norm": 58.189356919828406, "learning_rate": 5e-06, "loss": 0.7775, "num_input_tokens_seen": 83122064, "step": 481 }, { "epoch": 0.19224620303756995, "loss": 0.7105453014373779, "loss_ce": 0.0419052317738533, "loss_xval": 0.66796875, "num_input_tokens_seen": 83122064, "step": 481 }, { "epoch": 0.1926458832933653, "grad_norm": 157.82228179423655, "learning_rate": 5e-06, "loss": 1.1241, "num_input_tokens_seen": 83295248, "step": 482 }, { "epoch": 0.1926458832933653, "loss": 1.1308469772338867, "loss_ce": 0.034655675292015076, "loss_xval": 1.09375, "num_input_tokens_seen": 83295248, "step": 482 }, { "epoch": 0.19304556354916066, "grad_norm": 261.49086967978093, "learning_rate": 5e-06, "loss": 1.4113, "num_input_tokens_seen": 83468488, "step": 483 }, { "epoch": 0.19304556354916066, "loss": 1.4266133308410645, "loss_ce": 0.036842815577983856, "loss_xval": 1.390625, "num_input_tokens_seen": 83468488, "step": 483 }, { "epoch": 0.19344524380495604, "grad_norm": 259.4909482177068, "learning_rate": 5e-06, "loss": 1.1918, "num_input_tokens_seen": 83641736, "step": 484 }, { "epoch": 0.19344524380495604, "loss": 1.2513983249664307, "loss_ce": 0.03948421776294708, "loss_xval": 1.2109375, "num_input_tokens_seen": 83641736, "step": 484 }, { "epoch": 0.1938449240607514, "grad_norm": 209.39951992648977, "learning_rate": 5e-06, "loss": 0.9566, "num_input_tokens_seen": 83815056, "step": 485 }, { "epoch": 0.1938449240607514, "loss": 1.1501352787017822, "loss_ce": 0.03856303542852402, "loss_xval": 1.109375, "num_input_tokens_seen": 83815056, "step": 485 }, { "epoch": 0.19424460431654678, "grad_norm": 54.05007201034879, "learning_rate": 5e-06, "loss": 0.9943, "num_input_tokens_seen": 83987984, "step": 486 }, { "epoch": 0.19424460431654678, "loss": 1.3479745388031006, "loss_ce": 0.037183478474617004, "loss_xval": 1.3125, "num_input_tokens_seen": 83987984, "step": 486 }, { "epoch": 0.19464428457234212, "grad_norm": 145.86163072629424, "learning_rate": 5e-06, "loss": 1.1361, "num_input_tokens_seen": 84160704, "step": 487 }, { "epoch": 0.19464428457234212, "loss": 1.1853137016296387, "loss_ce": 0.040294162929058075, "loss_xval": 1.1484375, "num_input_tokens_seen": 84160704, "step": 487 }, { "epoch": 0.1950439648281375, "grad_norm": 305.17188970413605, "learning_rate": 5e-06, "loss": 0.9527, "num_input_tokens_seen": 84333744, "step": 488 }, { "epoch": 0.1950439648281375, "loss": 1.0381686687469482, "loss_ce": 0.03060019761323929, "loss_xval": 1.0078125, "num_input_tokens_seen": 84333744, "step": 488 }, { "epoch": 0.19544364508393286, "grad_norm": 367.7867921107738, "learning_rate": 5e-06, "loss": 0.9876, "num_input_tokens_seen": 84506240, "step": 489 }, { "epoch": 0.19544364508393286, "loss": 1.0127192735671997, "loss_ce": 0.033227067440748215, "loss_xval": 0.98046875, "num_input_tokens_seen": 84506240, "step": 489 }, { "epoch": 0.19584332533972823, "grad_norm": 109.77618272454248, "learning_rate": 5e-06, "loss": 0.9396, "num_input_tokens_seen": 84679600, "step": 490 }, { "epoch": 0.19584332533972823, "loss": 0.48934584856033325, "loss_ce": 0.034755997359752655, "loss_xval": 0.455078125, "num_input_tokens_seen": 84679600, "step": 490 }, { "epoch": 0.19624300559552357, "grad_norm": 195.19088075428, "learning_rate": 5e-06, "loss": 1.0253, "num_input_tokens_seen": 84852344, "step": 491 }, { "epoch": 0.19624300559552357, "loss": 1.0787075757980347, "loss_ce": 0.03622712194919586, "loss_xval": 1.0390625, "num_input_tokens_seen": 84852344, "step": 491 }, { "epoch": 0.19664268585131894, "grad_norm": 168.19680026401545, "learning_rate": 5e-06, "loss": 1.1053, "num_input_tokens_seen": 85025032, "step": 492 }, { "epoch": 0.19664268585131894, "loss": 1.0922410488128662, "loss_ce": 0.037919752299785614, "loss_xval": 1.0546875, "num_input_tokens_seen": 85025032, "step": 492 }, { "epoch": 0.19704236610711431, "grad_norm": 133.09158680380528, "learning_rate": 5e-06, "loss": 0.891, "num_input_tokens_seen": 85197936, "step": 493 }, { "epoch": 0.19704236610711431, "loss": 0.952610969543457, "loss_ce": 0.050999678671360016, "loss_xval": 0.90234375, "num_input_tokens_seen": 85197936, "step": 493 }, { "epoch": 0.19744204636290968, "grad_norm": 324.1718865120786, "learning_rate": 5e-06, "loss": 1.3506, "num_input_tokens_seen": 85371064, "step": 494 }, { "epoch": 0.19744204636290968, "loss": 1.3527730703353882, "loss_ce": 0.04369105398654938, "loss_xval": 1.3125, "num_input_tokens_seen": 85371064, "step": 494 }, { "epoch": 0.19784172661870503, "grad_norm": 223.13546731515177, "learning_rate": 5e-06, "loss": 0.9691, "num_input_tokens_seen": 85544192, "step": 495 }, { "epoch": 0.19784172661870503, "loss": 0.9390429258346558, "loss_ce": 0.03682119399309158, "loss_xval": 0.90234375, "num_input_tokens_seen": 85544192, "step": 495 }, { "epoch": 0.1982414068745004, "grad_norm": 101.21144573870228, "learning_rate": 5e-06, "loss": 0.9112, "num_input_tokens_seen": 85717424, "step": 496 }, { "epoch": 0.1982414068745004, "loss": 1.0302919149398804, "loss_ce": 0.046039044857025146, "loss_xval": 0.984375, "num_input_tokens_seen": 85717424, "step": 496 }, { "epoch": 0.19864108713029577, "grad_norm": 206.07249003860602, "learning_rate": 5e-06, "loss": 0.8683, "num_input_tokens_seen": 85890344, "step": 497 }, { "epoch": 0.19864108713029577, "loss": 0.8588770627975464, "loss_ce": 0.03905284404754639, "loss_xval": 0.8203125, "num_input_tokens_seen": 85890344, "step": 497 }, { "epoch": 0.19904076738609114, "grad_norm": 94.44663670538154, "learning_rate": 5e-06, "loss": 0.8524, "num_input_tokens_seen": 86063192, "step": 498 }, { "epoch": 0.19904076738609114, "loss": 0.6648346185684204, "loss_ce": 0.03678285330533981, "loss_xval": 0.62890625, "num_input_tokens_seen": 86063192, "step": 498 }, { "epoch": 0.19944044764188648, "grad_norm": 335.24198957414745, "learning_rate": 5e-06, "loss": 1.389, "num_input_tokens_seen": 86236056, "step": 499 }, { "epoch": 0.19944044764188648, "loss": 1.3793278932571411, "loss_ce": 0.04241389036178589, "loss_xval": 1.3359375, "num_input_tokens_seen": 86236056, "step": 499 }, { "epoch": 0.19984012789768185, "grad_norm": 65.37214172428197, "learning_rate": 5e-06, "loss": 1.2575, "num_input_tokens_seen": 86408824, "step": 500 }, { "epoch": 0.19984012789768185, "eval_websight_new_IoU": 0.08079056814312935, "eval_websight_new_MAE_all": 0.06199362501502037, "eval_websight_new_MAE_h": 0.05449218116700649, "eval_websight_new_MAE_w": 0.09156358614563942, "eval_websight_new_MAE_x": 0.025543496012687683, "eval_websight_new_MAE_y": 0.07637524232268333, "eval_websight_new_NUM_probability": 0.6408629715442657, "eval_websight_new_inside_bbox": 0.046875, "eval_websight_new_loss": 0.6728891134262085, "eval_websight_new_loss_ce": 0.04726765863597393, "eval_websight_new_loss_xval": 0.6082763671875, "eval_websight_new_runtime": 57.3968, "eval_websight_new_samples_per_second": 0.871, "eval_websight_new_steps_per_second": 0.035, "num_input_tokens_seen": 86408824, "step": 500 }, { "epoch": 0.19984012789768185, "eval_seeclick_IoU": 0.11065776646137238, "eval_seeclick_MAE_all": 0.10889718681573868, "eval_seeclick_MAE_h": 0.06004502810537815, "eval_seeclick_MAE_w": 0.16941364109516144, "eval_seeclick_MAE_x": 0.11266724020242691, "eval_seeclick_MAE_y": 0.09346283972263336, "eval_seeclick_NUM_probability": 0.6323218941688538, "eval_seeclick_inside_bbox": 0.0868055559694767, "eval_seeclick_loss": 2.2744133472442627, "eval_seeclick_loss_ce": 0.06857346370816231, "eval_seeclick_loss_xval": 2.0810546875, "eval_seeclick_runtime": 82.4728, "eval_seeclick_samples_per_second": 0.606, "eval_seeclick_steps_per_second": 0.024, "num_input_tokens_seen": 86408824, "step": 500 }, { "epoch": 0.19984012789768185, "eval_icons_IoU": 0.009586355474311858, "eval_icons_MAE_all": 0.06707138940691948, "eval_icons_MAE_h": 0.06313476897776127, "eval_icons_MAE_w": 0.06441785581409931, "eval_icons_MAE_x": 0.05763854831457138, "eval_icons_MAE_y": 0.08309439569711685, "eval_icons_NUM_probability": 0.6739359498023987, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 0.5873188972473145, "eval_icons_loss_ce": 0.0424294825643301, "eval_icons_loss_xval": 0.53759765625, "eval_icons_runtime": 81.7973, "eval_icons_samples_per_second": 0.611, "eval_icons_steps_per_second": 0.024, "num_input_tokens_seen": 86408824, "step": 500 }, { "epoch": 0.19984012789768185, "loss": 0.6232744455337524, "loss_ce": 0.044661134481430054, "loss_xval": 0.578125, "num_input_tokens_seen": 86408824, "step": 500 }, { "epoch": 0.20023980815347722, "grad_norm": 415.0441899230684, "learning_rate": 5e-06, "loss": 1.1496, "num_input_tokens_seen": 86581832, "step": 501 }, { "epoch": 0.20023980815347722, "loss": 1.2457921504974365, "loss_ce": 0.04388776421546936, "loss_xval": 1.203125, "num_input_tokens_seen": 86581832, "step": 501 }, { "epoch": 0.2006394884092726, "grad_norm": 202.89775372810757, "learning_rate": 5e-06, "loss": 0.9393, "num_input_tokens_seen": 86754704, "step": 502 }, { "epoch": 0.2006394884092726, "loss": 0.5830790400505066, "loss_ce": 0.045115165412425995, "loss_xval": 0.5390625, "num_input_tokens_seen": 86754704, "step": 502 }, { "epoch": 0.20103916866506794, "grad_norm": 435.08701069154466, "learning_rate": 5e-06, "loss": 1.521, "num_input_tokens_seen": 86927520, "step": 503 }, { "epoch": 0.20103916866506794, "loss": 1.6471202373504639, "loss_ce": 0.061182815581560135, "loss_xval": 1.5859375, "num_input_tokens_seen": 86927520, "step": 503 }, { "epoch": 0.2014388489208633, "grad_norm": 354.9315963502709, "learning_rate": 5e-06, "loss": 1.4481, "num_input_tokens_seen": 87100360, "step": 504 }, { "epoch": 0.2014388489208633, "loss": 1.7939571142196655, "loss_ce": 0.05225791037082672, "loss_xval": 1.7421875, "num_input_tokens_seen": 87100360, "step": 504 }, { "epoch": 0.20183852917665868, "grad_norm": 277.1538917985221, "learning_rate": 5e-06, "loss": 0.8188, "num_input_tokens_seen": 87273200, "step": 505 }, { "epoch": 0.20183852917665868, "loss": 0.6681489944458008, "loss_ce": 0.05096151679754257, "loss_xval": 0.6171875, "num_input_tokens_seen": 87273200, "step": 505 }, { "epoch": 0.20223820943245405, "grad_norm": 215.04429897623527, "learning_rate": 5e-06, "loss": 0.896, "num_input_tokens_seen": 87446384, "step": 506 }, { "epoch": 0.20223820943245405, "loss": 0.977022111415863, "loss_ce": 0.06540100276470184, "loss_xval": 0.91015625, "num_input_tokens_seen": 87446384, "step": 506 }, { "epoch": 0.2026378896882494, "grad_norm": 276.63932282618924, "learning_rate": 5e-06, "loss": 1.2516, "num_input_tokens_seen": 87619496, "step": 507 }, { "epoch": 0.2026378896882494, "loss": 0.9704437255859375, "loss_ce": 0.0629730224609375, "loss_xval": 0.90625, "num_input_tokens_seen": 87619496, "step": 507 }, { "epoch": 0.20303756994404476, "grad_norm": 96.67338812468043, "learning_rate": 5e-06, "loss": 0.9473, "num_input_tokens_seen": 87792584, "step": 508 }, { "epoch": 0.20303756994404476, "loss": 0.8171831965446472, "loss_ce": 0.08207576721906662, "loss_xval": 0.734375, "num_input_tokens_seen": 87792584, "step": 508 }, { "epoch": 0.20343725019984013, "grad_norm": 363.8508621351222, "learning_rate": 5e-06, "loss": 1.5687, "num_input_tokens_seen": 87965712, "step": 509 }, { "epoch": 0.20343725019984013, "loss": 1.5893386602401733, "loss_ce": 0.06773223727941513, "loss_xval": 1.5234375, "num_input_tokens_seen": 87965712, "step": 509 }, { "epoch": 0.2038369304556355, "grad_norm": 265.10336368041925, "learning_rate": 5e-06, "loss": 0.9704, "num_input_tokens_seen": 88138536, "step": 510 }, { "epoch": 0.2038369304556355, "loss": 0.9591758847236633, "loss_ce": 0.06952746957540512, "loss_xval": 0.890625, "num_input_tokens_seen": 88138536, "step": 510 }, { "epoch": 0.20423661071143084, "grad_norm": 150.68109411527718, "learning_rate": 5e-06, "loss": 0.8909, "num_input_tokens_seen": 88311912, "step": 511 }, { "epoch": 0.20423661071143084, "loss": 1.0047615766525269, "loss_ce": 0.07226639986038208, "loss_xval": 0.93359375, "num_input_tokens_seen": 88311912, "step": 511 }, { "epoch": 0.20463629096722621, "grad_norm": 189.68353880999203, "learning_rate": 5e-06, "loss": 0.709, "num_input_tokens_seen": 88484904, "step": 512 }, { "epoch": 0.20463629096722621, "loss": 0.7475023865699768, "loss_ce": 0.0678148865699768, "loss_xval": 0.6796875, "num_input_tokens_seen": 88484904, "step": 512 }, { "epoch": 0.20503597122302158, "grad_norm": 49.74252850484498, "learning_rate": 5e-06, "loss": 1.1139, "num_input_tokens_seen": 88658368, "step": 513 }, { "epoch": 0.20503597122302158, "loss": 1.322534203529358, "loss_ce": 0.06484372913837433, "loss_xval": 1.2578125, "num_input_tokens_seen": 88658368, "step": 513 }, { "epoch": 0.20543565147881696, "grad_norm": 62.01670257125715, "learning_rate": 5e-06, "loss": 1.1007, "num_input_tokens_seen": 88831128, "step": 514 }, { "epoch": 0.20543565147881696, "loss": 0.8021764159202576, "loss_ce": 0.05950063467025757, "loss_xval": 0.7421875, "num_input_tokens_seen": 88831128, "step": 514 }, { "epoch": 0.2058353317346123, "grad_norm": 61.10824221702106, "learning_rate": 5e-06, "loss": 0.7747, "num_input_tokens_seen": 89004360, "step": 515 }, { "epoch": 0.2058353317346123, "loss": 0.8074045181274414, "loss_ce": 0.055451322346925735, "loss_xval": 0.75, "num_input_tokens_seen": 89004360, "step": 515 }, { "epoch": 0.20623501199040767, "grad_norm": 213.54170746936944, "learning_rate": 5e-06, "loss": 1.4733, "num_input_tokens_seen": 89177432, "step": 516 }, { "epoch": 0.20623501199040767, "loss": 1.8919177055358887, "loss_ce": 0.07111698389053345, "loss_xval": 1.8203125, "num_input_tokens_seen": 89177432, "step": 516 }, { "epoch": 0.20663469224620304, "grad_norm": 188.66997417165135, "learning_rate": 5e-06, "loss": 1.0141, "num_input_tokens_seen": 89350448, "step": 517 }, { "epoch": 0.20663469224620304, "loss": 0.9915530681610107, "loss_ce": 0.06418493390083313, "loss_xval": 0.92578125, "num_input_tokens_seen": 89350448, "step": 517 }, { "epoch": 0.2070343725019984, "grad_norm": 54.79378455267396, "learning_rate": 5e-06, "loss": 1.0231, "num_input_tokens_seen": 89523664, "step": 518 }, { "epoch": 0.2070343725019984, "loss": 1.0046730041503906, "loss_ce": 0.051059648394584656, "loss_xval": 0.953125, "num_input_tokens_seen": 89523664, "step": 518 }, { "epoch": 0.20743405275779375, "grad_norm": 97.26893599654822, "learning_rate": 5e-06, "loss": 1.3021, "num_input_tokens_seen": 89696800, "step": 519 }, { "epoch": 0.20743405275779375, "loss": 1.5230860710144043, "loss_ce": 0.044448427855968475, "loss_xval": 1.4765625, "num_input_tokens_seen": 89696800, "step": 519 }, { "epoch": 0.20783373301358912, "grad_norm": 122.63155997518379, "learning_rate": 5e-06, "loss": 1.1363, "num_input_tokens_seen": 89869128, "step": 520 }, { "epoch": 0.20783373301358912, "loss": 0.7199119329452515, "loss_ce": 0.05108872056007385, "loss_xval": 0.66796875, "num_input_tokens_seen": 89869128, "step": 520 }, { "epoch": 0.2082334132693845, "grad_norm": 43.22504973101311, "learning_rate": 5e-06, "loss": 1.0852, "num_input_tokens_seen": 90042088, "step": 521 }, { "epoch": 0.2082334132693845, "loss": 0.9542793035507202, "loss_ce": 0.04509960114955902, "loss_xval": 0.91015625, "num_input_tokens_seen": 90042088, "step": 521 }, { "epoch": 0.20863309352517986, "grad_norm": 237.56955165262985, "learning_rate": 5e-06, "loss": 1.1241, "num_input_tokens_seen": 90215056, "step": 522 }, { "epoch": 0.20863309352517986, "loss": 0.8590470552444458, "loss_ce": 0.03726974129676819, "loss_xval": 0.8203125, "num_input_tokens_seen": 90215056, "step": 522 }, { "epoch": 0.2090327737809752, "grad_norm": 263.0137742752679, "learning_rate": 5e-06, "loss": 1.3867, "num_input_tokens_seen": 90387888, "step": 523 }, { "epoch": 0.2090327737809752, "loss": 1.0127967596054077, "loss_ce": 0.040872905403375626, "loss_xval": 0.97265625, "num_input_tokens_seen": 90387888, "step": 523 }, { "epoch": 0.20943245403677058, "grad_norm": 44.95766707948507, "learning_rate": 5e-06, "loss": 1.0766, "num_input_tokens_seen": 90561136, "step": 524 }, { "epoch": 0.20943245403677058, "loss": 0.6546859741210938, "loss_ce": 0.03603363782167435, "loss_xval": 0.6171875, "num_input_tokens_seen": 90561136, "step": 524 }, { "epoch": 0.20983213429256595, "grad_norm": 112.87978212103138, "learning_rate": 5e-06, "loss": 0.6245, "num_input_tokens_seen": 90734184, "step": 525 }, { "epoch": 0.20983213429256595, "loss": 0.4749688506126404, "loss_ce": 0.04808899015188217, "loss_xval": 0.427734375, "num_input_tokens_seen": 90734184, "step": 525 }, { "epoch": 0.21023181454836132, "grad_norm": 92.99996671691841, "learning_rate": 5e-06, "loss": 1.0497, "num_input_tokens_seen": 90907176, "step": 526 }, { "epoch": 0.21023181454836132, "loss": 0.8723914623260498, "loss_ce": 0.03266974911093712, "loss_xval": 0.83984375, "num_input_tokens_seen": 90907176, "step": 526 }, { "epoch": 0.2106314948041567, "grad_norm": 132.48797337096062, "learning_rate": 5e-06, "loss": 1.291, "num_input_tokens_seen": 91080520, "step": 527 }, { "epoch": 0.2106314948041567, "loss": 1.0182774066925049, "loss_ce": 0.03133901208639145, "loss_xval": 0.98828125, "num_input_tokens_seen": 91080520, "step": 527 }, { "epoch": 0.21103117505995203, "grad_norm": 67.2411409745012, "learning_rate": 5e-06, "loss": 0.6511, "num_input_tokens_seen": 91253584, "step": 528 }, { "epoch": 0.21103117505995203, "loss": 0.7829042673110962, "loss_ce": 0.0313173308968544, "loss_xval": 0.75, "num_input_tokens_seen": 91253584, "step": 528 }, { "epoch": 0.2114308553157474, "grad_norm": 119.38443221614165, "learning_rate": 5e-06, "loss": 1.0069, "num_input_tokens_seen": 91426584, "step": 529 }, { "epoch": 0.2114308553157474, "loss": 1.0257318019866943, "loss_ce": 0.02658626064658165, "loss_xval": 1.0, "num_input_tokens_seen": 91426584, "step": 529 }, { "epoch": 0.21183053557154277, "grad_norm": 161.8946233726489, "learning_rate": 5e-06, "loss": 1.1522, "num_input_tokens_seen": 91599784, "step": 530 }, { "epoch": 0.21183053557154277, "loss": 1.1873421669006348, "loss_ce": 0.024622494354844093, "loss_xval": 1.1640625, "num_input_tokens_seen": 91599784, "step": 530 }, { "epoch": 0.21223021582733814, "grad_norm": 322.1567587058893, "learning_rate": 5e-06, "loss": 1.5258, "num_input_tokens_seen": 91772792, "step": 531 }, { "epoch": 0.21223021582733814, "loss": 1.6199225187301636, "loss_ce": 0.03862369433045387, "loss_xval": 1.578125, "num_input_tokens_seen": 91772792, "step": 531 }, { "epoch": 0.21262989608313348, "grad_norm": 162.02156030703074, "learning_rate": 5e-06, "loss": 0.8099, "num_input_tokens_seen": 91945720, "step": 532 }, { "epoch": 0.21262989608313348, "loss": 0.7622925639152527, "loss_ce": 0.02779550477862358, "loss_xval": 0.734375, "num_input_tokens_seen": 91945720, "step": 532 }, { "epoch": 0.21302957633892886, "grad_norm": 136.96122336871258, "learning_rate": 5e-06, "loss": 1.1803, "num_input_tokens_seen": 92118664, "step": 533 }, { "epoch": 0.21302957633892886, "loss": 1.2401918172836304, "loss_ce": 0.021441802382469177, "loss_xval": 1.21875, "num_input_tokens_seen": 92118664, "step": 533 }, { "epoch": 0.21342925659472423, "grad_norm": 234.78901205861808, "learning_rate": 5e-06, "loss": 0.9575, "num_input_tokens_seen": 92288392, "step": 534 }, { "epoch": 0.21342925659472423, "loss": 0.6059136390686035, "loss_ce": 0.023638233542442322, "loss_xval": 0.58203125, "num_input_tokens_seen": 92288392, "step": 534 }, { "epoch": 0.2138289368505196, "grad_norm": 144.7989394237274, "learning_rate": 5e-06, "loss": 0.8869, "num_input_tokens_seen": 92461600, "step": 535 }, { "epoch": 0.2138289368505196, "loss": 1.0061213970184326, "loss_ce": 0.024554094299674034, "loss_xval": 0.98046875, "num_input_tokens_seen": 92461600, "step": 535 }, { "epoch": 0.21422861710631494, "grad_norm": 55.48959721021269, "learning_rate": 5e-06, "loss": 0.4912, "num_input_tokens_seen": 92634216, "step": 536 }, { "epoch": 0.21422861710631494, "loss": 0.47603410482406616, "loss_ce": 0.027242586016654968, "loss_xval": 0.44921875, "num_input_tokens_seen": 92634216, "step": 536 }, { "epoch": 0.2146282973621103, "grad_norm": 125.22804174984209, "learning_rate": 5e-06, "loss": 0.7744, "num_input_tokens_seen": 92807536, "step": 537 }, { "epoch": 0.2146282973621103, "loss": 0.5036810636520386, "loss_ce": 0.02418886497616768, "loss_xval": 0.48046875, "num_input_tokens_seen": 92807536, "step": 537 }, { "epoch": 0.21502797761790568, "grad_norm": 211.76831446430648, "learning_rate": 5e-06, "loss": 1.0454, "num_input_tokens_seen": 92980600, "step": 538 }, { "epoch": 0.21502797761790568, "loss": 0.9494002461433411, "loss_ce": 0.03143148496747017, "loss_xval": 0.91796875, "num_input_tokens_seen": 92980600, "step": 538 }, { "epoch": 0.21542765787370105, "grad_norm": 83.53737940813463, "learning_rate": 5e-06, "loss": 0.8112, "num_input_tokens_seen": 93153536, "step": 539 }, { "epoch": 0.21542765787370105, "loss": 0.7787027955055237, "loss_ce": 0.028214523568749428, "loss_xval": 0.75, "num_input_tokens_seen": 93153536, "step": 539 }, { "epoch": 0.2158273381294964, "grad_norm": 221.1772186610306, "learning_rate": 5e-06, "loss": 1.2162, "num_input_tokens_seen": 93326336, "step": 540 }, { "epoch": 0.2158273381294964, "loss": 0.7756029367446899, "loss_ce": 0.027067817747592926, "loss_xval": 0.75, "num_input_tokens_seen": 93326336, "step": 540 }, { "epoch": 0.21622701838529176, "grad_norm": 250.30628691634345, "learning_rate": 5e-06, "loss": 0.8389, "num_input_tokens_seen": 93499160, "step": 541 }, { "epoch": 0.21622701838529176, "loss": 0.6384867429733276, "loss_ce": 0.030210375785827637, "loss_xval": 0.609375, "num_input_tokens_seen": 93499160, "step": 541 }, { "epoch": 0.21662669864108713, "grad_norm": 60.81920570311037, "learning_rate": 5e-06, "loss": 1.1726, "num_input_tokens_seen": 93672248, "step": 542 }, { "epoch": 0.21662669864108713, "loss": 1.3165156841278076, "loss_ce": 0.03245798870921135, "loss_xval": 1.28125, "num_input_tokens_seen": 93672248, "step": 542 }, { "epoch": 0.2170263788968825, "grad_norm": 245.3824525578725, "learning_rate": 5e-06, "loss": 1.0387, "num_input_tokens_seen": 93845280, "step": 543 }, { "epoch": 0.2170263788968825, "loss": 1.4534016847610474, "loss_ce": 0.027132168412208557, "loss_xval": 1.4296875, "num_input_tokens_seen": 93845280, "step": 543 }, { "epoch": 0.21742605915267785, "grad_norm": 96.9503832488851, "learning_rate": 5e-06, "loss": 1.6084, "num_input_tokens_seen": 94018296, "step": 544 }, { "epoch": 0.21742605915267785, "loss": 1.4025704860687256, "loss_ce": 0.04173062741756439, "loss_xval": 1.359375, "num_input_tokens_seen": 94018296, "step": 544 }, { "epoch": 0.21782573940847322, "grad_norm": 248.6582812449355, "learning_rate": 5e-06, "loss": 0.829, "num_input_tokens_seen": 94191352, "step": 545 }, { "epoch": 0.21782573940847322, "loss": 0.7271380424499512, "loss_ce": 0.026942692697048187, "loss_xval": 0.69921875, "num_input_tokens_seen": 94191352, "step": 545 }, { "epoch": 0.2182254196642686, "grad_norm": 139.37534365707833, "learning_rate": 5e-06, "loss": 0.8576, "num_input_tokens_seen": 94364712, "step": 546 }, { "epoch": 0.2182254196642686, "loss": 1.2029194831848145, "loss_ce": 0.026039643213152885, "loss_xval": 1.1796875, "num_input_tokens_seen": 94364712, "step": 546 }, { "epoch": 0.21862509992006396, "grad_norm": 161.00715928409278, "learning_rate": 5e-06, "loss": 1.2751, "num_input_tokens_seen": 94534248, "step": 547 }, { "epoch": 0.21862509992006396, "loss": 0.7849478125572205, "loss_ce": 0.024938026443123817, "loss_xval": 0.76171875, "num_input_tokens_seen": 94534248, "step": 547 }, { "epoch": 0.2190247801758593, "grad_norm": 120.74445345712077, "learning_rate": 5e-06, "loss": 0.972, "num_input_tokens_seen": 94707432, "step": 548 }, { "epoch": 0.2190247801758593, "loss": 1.0243523120880127, "loss_ce": 0.04681321233510971, "loss_xval": 0.9765625, "num_input_tokens_seen": 94707432, "step": 548 }, { "epoch": 0.21942446043165467, "grad_norm": 149.35126901028477, "learning_rate": 5e-06, "loss": 1.621, "num_input_tokens_seen": 94880480, "step": 549 }, { "epoch": 0.21942446043165467, "loss": 0.938992977142334, "loss_ce": 0.02834843471646309, "loss_xval": 0.91015625, "num_input_tokens_seen": 94880480, "step": 549 }, { "epoch": 0.21982414068745004, "grad_norm": 96.31882416543029, "learning_rate": 5e-06, "loss": 0.9228, "num_input_tokens_seen": 95053720, "step": 550 }, { "epoch": 0.21982414068745004, "loss": 0.6176514029502869, "loss_ce": 0.026220720261335373, "loss_xval": 0.58984375, "num_input_tokens_seen": 95053720, "step": 550 }, { "epoch": 0.2202238209432454, "grad_norm": 89.42353943704879, "learning_rate": 5e-06, "loss": 0.8391, "num_input_tokens_seen": 95226696, "step": 551 }, { "epoch": 0.2202238209432454, "loss": 1.1121536493301392, "loss_ce": 0.025239594280719757, "loss_xval": 1.0859375, "num_input_tokens_seen": 95226696, "step": 551 }, { "epoch": 0.22062350119904076, "grad_norm": 60.245937188446725, "learning_rate": 5e-06, "loss": 1.1905, "num_input_tokens_seen": 95399936, "step": 552 }, { "epoch": 0.22062350119904076, "loss": 1.1478040218353271, "loss_ce": 0.03208138048648834, "loss_xval": 1.1171875, "num_input_tokens_seen": 95399936, "step": 552 }, { "epoch": 0.22102318145483613, "grad_norm": 250.376146897708, "learning_rate": 5e-06, "loss": 0.8334, "num_input_tokens_seen": 95573048, "step": 553 }, { "epoch": 0.22102318145483613, "loss": 0.9694182872772217, "loss_ce": 0.029293827712535858, "loss_xval": 0.94140625, "num_input_tokens_seen": 95573048, "step": 553 }, { "epoch": 0.2214228617106315, "grad_norm": 472.12030978419017, "learning_rate": 5e-06, "loss": 1.3447, "num_input_tokens_seen": 95746232, "step": 554 }, { "epoch": 0.2214228617106315, "loss": 1.390291690826416, "loss_ce": 0.031160805374383926, "loss_xval": 1.359375, "num_input_tokens_seen": 95746232, "step": 554 }, { "epoch": 0.22182254196642687, "grad_norm": 200.19161843381832, "learning_rate": 5e-06, "loss": 1.1299, "num_input_tokens_seen": 95919064, "step": 555 }, { "epoch": 0.22182254196642687, "loss": 1.2493796348571777, "loss_ce": 0.029714081436395645, "loss_xval": 1.21875, "num_input_tokens_seen": 95919064, "step": 555 }, { "epoch": 0.2222222222222222, "grad_norm": 264.8729753389226, "learning_rate": 5e-06, "loss": 1.3297, "num_input_tokens_seen": 96092128, "step": 556 }, { "epoch": 0.2222222222222222, "loss": 1.3867847919464111, "loss_ce": 0.03546644374728203, "loss_xval": 1.3515625, "num_input_tokens_seen": 96092128, "step": 556 }, { "epoch": 0.22262190247801758, "grad_norm": 292.84845504228457, "learning_rate": 5e-06, "loss": 1.2343, "num_input_tokens_seen": 96261344, "step": 557 }, { "epoch": 0.22262190247801758, "loss": 1.3330931663513184, "loss_ce": 0.02938220463693142, "loss_xval": 1.3046875, "num_input_tokens_seen": 96261344, "step": 557 }, { "epoch": 0.22302158273381295, "grad_norm": 60.53087991395975, "learning_rate": 5e-06, "loss": 0.841, "num_input_tokens_seen": 96434288, "step": 558 }, { "epoch": 0.22302158273381295, "loss": 0.9854141473770142, "loss_ce": 0.032960571348667145, "loss_xval": 0.953125, "num_input_tokens_seen": 96434288, "step": 558 }, { "epoch": 0.22342126298960832, "grad_norm": 197.24569669696305, "learning_rate": 5e-06, "loss": 1.209, "num_input_tokens_seen": 96607664, "step": 559 }, { "epoch": 0.22342126298960832, "loss": 0.886874794960022, "loss_ce": 0.055820122361183167, "loss_xval": 0.83203125, "num_input_tokens_seen": 96607664, "step": 559 }, { "epoch": 0.22382094324540366, "grad_norm": 90.62307871554165, "learning_rate": 5e-06, "loss": 0.8997, "num_input_tokens_seen": 96780576, "step": 560 }, { "epoch": 0.22382094324540366, "loss": 0.5703801512718201, "loss_ce": 0.035590097308158875, "loss_xval": 0.53515625, "num_input_tokens_seen": 96780576, "step": 560 }, { "epoch": 0.22422062350119903, "grad_norm": 210.46096711060173, "learning_rate": 5e-06, "loss": 0.8813, "num_input_tokens_seen": 96953128, "step": 561 }, { "epoch": 0.22422062350119903, "loss": 1.1930537223815918, "loss_ce": 0.050475526601076126, "loss_xval": 1.140625, "num_input_tokens_seen": 96953128, "step": 561 }, { "epoch": 0.2246203037569944, "grad_norm": 156.37219513558009, "learning_rate": 5e-06, "loss": 1.1836, "num_input_tokens_seen": 97126480, "step": 562 }, { "epoch": 0.2246203037569944, "loss": 0.6334319710731506, "loss_ce": 0.04236753284931183, "loss_xval": 0.58984375, "num_input_tokens_seen": 97126480, "step": 562 }, { "epoch": 0.22501998401278978, "grad_norm": 387.8888057437442, "learning_rate": 5e-06, "loss": 0.975, "num_input_tokens_seen": 97299688, "step": 563 }, { "epoch": 0.22501998401278978, "loss": 0.8660625219345093, "loss_ce": 0.04672662168741226, "loss_xval": 0.8203125, "num_input_tokens_seen": 97299688, "step": 563 }, { "epoch": 0.22541966426858512, "grad_norm": 64.648417204487, "learning_rate": 5e-06, "loss": 0.6787, "num_input_tokens_seen": 97472888, "step": 564 }, { "epoch": 0.22541966426858512, "loss": 0.90760338306427, "loss_ce": 0.04456621780991554, "loss_xval": 0.86328125, "num_input_tokens_seen": 97472888, "step": 564 }, { "epoch": 0.2258193445243805, "grad_norm": 411.13766405693985, "learning_rate": 5e-06, "loss": 1.0557, "num_input_tokens_seen": 97646160, "step": 565 }, { "epoch": 0.2258193445243805, "loss": 1.2993905544281006, "loss_ce": 0.042066287249326706, "loss_xval": 1.2578125, "num_input_tokens_seen": 97646160, "step": 565 }, { "epoch": 0.22621902478017586, "grad_norm": 217.77303028298573, "learning_rate": 5e-06, "loss": 0.8985, "num_input_tokens_seen": 97818944, "step": 566 }, { "epoch": 0.22621902478017586, "loss": 0.7498751282691956, "loss_ce": 0.041379086673259735, "loss_xval": 0.70703125, "num_input_tokens_seen": 97818944, "step": 566 }, { "epoch": 0.22661870503597123, "grad_norm": 245.13372340348388, "learning_rate": 5e-06, "loss": 0.8019, "num_input_tokens_seen": 97991832, "step": 567 }, { "epoch": 0.22661870503597123, "loss": 0.7386154532432556, "loss_ce": 0.03329318016767502, "loss_xval": 0.70703125, "num_input_tokens_seen": 97991832, "step": 567 }, { "epoch": 0.2270183852917666, "grad_norm": 257.82480324488694, "learning_rate": 5e-06, "loss": 1.2229, "num_input_tokens_seen": 98164624, "step": 568 }, { "epoch": 0.2270183852917666, "loss": 0.9997340440750122, "loss_ce": 0.04294687137007713, "loss_xval": 0.95703125, "num_input_tokens_seen": 98164624, "step": 568 }, { "epoch": 0.22741806554756194, "grad_norm": 34.59729056287196, "learning_rate": 5e-06, "loss": 0.9462, "num_input_tokens_seen": 98337936, "step": 569 }, { "epoch": 0.22741806554756194, "loss": 0.8521548509597778, "loss_ce": 0.03757966682314873, "loss_xval": 0.81640625, "num_input_tokens_seen": 98337936, "step": 569 }, { "epoch": 0.2278177458033573, "grad_norm": 111.7199151079931, "learning_rate": 5e-06, "loss": 0.853, "num_input_tokens_seen": 98511288, "step": 570 }, { "epoch": 0.2278177458033573, "loss": 0.8863071799278259, "loss_ce": 0.03657577931880951, "loss_xval": 0.8515625, "num_input_tokens_seen": 98511288, "step": 570 }, { "epoch": 0.22821742605915268, "grad_norm": 42.73190171410932, "learning_rate": 5e-06, "loss": 0.8986, "num_input_tokens_seen": 98684336, "step": 571 }, { "epoch": 0.22821742605915268, "loss": 1.1216399669647217, "loss_ce": 0.041561778634786606, "loss_xval": 1.078125, "num_input_tokens_seen": 98684336, "step": 571 }, { "epoch": 0.22861710631494805, "grad_norm": 176.58612953080763, "learning_rate": 5e-06, "loss": 1.2574, "num_input_tokens_seen": 98853952, "step": 572 }, { "epoch": 0.22861710631494805, "loss": 1.1767420768737793, "loss_ce": 0.034652289003133774, "loss_xval": 1.140625, "num_input_tokens_seen": 98853952, "step": 572 }, { "epoch": 0.2290167865707434, "grad_norm": 81.67430863927449, "learning_rate": 5e-06, "loss": 1.0762, "num_input_tokens_seen": 99026888, "step": 573 }, { "epoch": 0.2290167865707434, "loss": 1.4329999685287476, "loss_ce": 0.07716501504182816, "loss_xval": 1.359375, "num_input_tokens_seen": 99026888, "step": 573 }, { "epoch": 0.22941646682653877, "grad_norm": 274.37468677613424, "learning_rate": 5e-06, "loss": 1.2617, "num_input_tokens_seen": 99199640, "step": 574 }, { "epoch": 0.22941646682653877, "loss": 0.6166301965713501, "loss_ce": 0.029716167598962784, "loss_xval": 0.5859375, "num_input_tokens_seen": 99199640, "step": 574 }, { "epoch": 0.22981614708233414, "grad_norm": 26.081873724774624, "learning_rate": 5e-06, "loss": 0.6142, "num_input_tokens_seen": 99372312, "step": 575 }, { "epoch": 0.22981614708233414, "loss": 0.5156276822090149, "loss_ce": 0.03149682283401489, "loss_xval": 0.484375, "num_input_tokens_seen": 99372312, "step": 575 }, { "epoch": 0.2302158273381295, "grad_norm": 284.96481935200273, "learning_rate": 5e-06, "loss": 1.1397, "num_input_tokens_seen": 99545160, "step": 576 }, { "epoch": 0.2302158273381295, "loss": 1.3616349697113037, "loss_ce": 0.034730590879917145, "loss_xval": 1.328125, "num_input_tokens_seen": 99545160, "step": 576 }, { "epoch": 0.23061550759392485, "grad_norm": 44.124449238858176, "learning_rate": 5e-06, "loss": 0.7619, "num_input_tokens_seen": 99718512, "step": 577 }, { "epoch": 0.23061550759392485, "loss": 0.824275016784668, "loss_ce": 0.029658352956175804, "loss_xval": 0.79296875, "num_input_tokens_seen": 99718512, "step": 577 }, { "epoch": 0.23101518784972022, "grad_norm": 207.04912847362317, "learning_rate": 5e-06, "loss": 1.2726, "num_input_tokens_seen": 99891440, "step": 578 }, { "epoch": 0.23101518784972022, "loss": 1.4096300601959229, "loss_ce": 0.02791622281074524, "loss_xval": 1.3828125, "num_input_tokens_seen": 99891440, "step": 578 }, { "epoch": 0.2314148681055156, "grad_norm": 136.60800256923707, "learning_rate": 5e-06, "loss": 0.9458, "num_input_tokens_seen": 100064696, "step": 579 }, { "epoch": 0.2314148681055156, "loss": 0.8815451860427856, "loss_ce": 0.027907539159059525, "loss_xval": 0.85546875, "num_input_tokens_seen": 100064696, "step": 579 }, { "epoch": 0.23181454836131096, "grad_norm": 281.71610387045706, "learning_rate": 5e-06, "loss": 0.8355, "num_input_tokens_seen": 100237384, "step": 580 }, { "epoch": 0.23181454836131096, "loss": 0.7737770080566406, "loss_ce": 0.025363922119140625, "loss_xval": 0.75, "num_input_tokens_seen": 100237384, "step": 580 }, { "epoch": 0.2322142286171063, "grad_norm": 69.97613875254609, "learning_rate": 5e-06, "loss": 1.1513, "num_input_tokens_seen": 100410504, "step": 581 }, { "epoch": 0.2322142286171063, "loss": 1.1582691669464111, "loss_ce": 0.026433231309056282, "loss_xval": 1.1328125, "num_input_tokens_seen": 100410504, "step": 581 }, { "epoch": 0.23261390887290168, "grad_norm": 373.00175697859225, "learning_rate": 5e-06, "loss": 1.4461, "num_input_tokens_seen": 100583328, "step": 582 }, { "epoch": 0.23261390887290168, "loss": 1.4269239902496338, "loss_ce": 0.023115256801247597, "loss_xval": 1.40625, "num_input_tokens_seen": 100583328, "step": 582 }, { "epoch": 0.23301358912869705, "grad_norm": 149.603838664552, "learning_rate": 5e-06, "loss": 0.6656, "num_input_tokens_seen": 100755664, "step": 583 }, { "epoch": 0.23301358912869705, "loss": 0.7104263305664062, "loss_ce": 0.024635307490825653, "loss_xval": 0.6875, "num_input_tokens_seen": 100755664, "step": 583 }, { "epoch": 0.23341326938449242, "grad_norm": 315.2272192197468, "learning_rate": 5e-06, "loss": 0.7095, "num_input_tokens_seen": 100925176, "step": 584 }, { "epoch": 0.23341326938449242, "loss": 0.7385757565498352, "loss_ce": 0.03300934657454491, "loss_xval": 0.70703125, "num_input_tokens_seen": 100925176, "step": 584 }, { "epoch": 0.23381294964028776, "grad_norm": 158.70840429315962, "learning_rate": 5e-06, "loss": 1.2368, "num_input_tokens_seen": 101097904, "step": 585 }, { "epoch": 0.23381294964028776, "loss": 1.1981698274612427, "loss_ce": 0.028125843033194542, "loss_xval": 1.171875, "num_input_tokens_seen": 101097904, "step": 585 }, { "epoch": 0.23421262989608313, "grad_norm": 300.90753147063134, "learning_rate": 5e-06, "loss": 0.9731, "num_input_tokens_seen": 101270824, "step": 586 }, { "epoch": 0.23421262989608313, "loss": 1.0399353504180908, "loss_ce": 0.031390391290187836, "loss_xval": 1.0078125, "num_input_tokens_seen": 101270824, "step": 586 }, { "epoch": 0.2346123101518785, "grad_norm": 150.6022752310562, "learning_rate": 5e-06, "loss": 1.2954, "num_input_tokens_seen": 101443832, "step": 587 }, { "epoch": 0.2346123101518785, "loss": 0.8707741498947144, "loss_ce": 0.031174514442682266, "loss_xval": 0.83984375, "num_input_tokens_seen": 101443832, "step": 587 }, { "epoch": 0.23501199040767387, "grad_norm": 277.43328148547425, "learning_rate": 5e-06, "loss": 1.0352, "num_input_tokens_seen": 101616416, "step": 588 }, { "epoch": 0.23501199040767387, "loss": 1.0693809986114502, "loss_ce": 0.0334923230111599, "loss_xval": 1.0390625, "num_input_tokens_seen": 101616416, "step": 588 }, { "epoch": 0.2354116706634692, "grad_norm": 78.20054429498728, "learning_rate": 5e-06, "loss": 0.919, "num_input_tokens_seen": 101789472, "step": 589 }, { "epoch": 0.2354116706634692, "loss": 0.9403672814369202, "loss_ce": 0.02972276508808136, "loss_xval": 0.91015625, "num_input_tokens_seen": 101789472, "step": 589 }, { "epoch": 0.23581135091926458, "grad_norm": 248.8776432354872, "learning_rate": 5e-06, "loss": 1.3982, "num_input_tokens_seen": 101962248, "step": 590 }, { "epoch": 0.23581135091926458, "loss": 1.470240592956543, "loss_ce": 0.030176982283592224, "loss_xval": 1.4375, "num_input_tokens_seen": 101962248, "step": 590 }, { "epoch": 0.23621103117505995, "grad_norm": 34.509014167489795, "learning_rate": 5e-06, "loss": 0.7077, "num_input_tokens_seen": 102131336, "step": 591 }, { "epoch": 0.23621103117505995, "loss": 0.4956533908843994, "loss_ce": 0.03294587880373001, "loss_xval": 0.462890625, "num_input_tokens_seen": 102131336, "step": 591 }, { "epoch": 0.23661071143085532, "grad_norm": 315.0428753524298, "learning_rate": 5e-06, "loss": 0.9751, "num_input_tokens_seen": 102303824, "step": 592 }, { "epoch": 0.23661071143085532, "loss": 1.054516077041626, "loss_ce": 0.03449662774801254, "loss_xval": 1.0234375, "num_input_tokens_seen": 102303824, "step": 592 }, { "epoch": 0.23701039168665067, "grad_norm": 92.7050944230278, "learning_rate": 5e-06, "loss": 1.0229, "num_input_tokens_seen": 102476784, "step": 593 }, { "epoch": 0.23701039168665067, "loss": 1.73143470287323, "loss_ce": 0.03075111284852028, "loss_xval": 1.703125, "num_input_tokens_seen": 102476784, "step": 593 }, { "epoch": 0.23741007194244604, "grad_norm": 121.2204406389868, "learning_rate": 5e-06, "loss": 0.808, "num_input_tokens_seen": 102649976, "step": 594 }, { "epoch": 0.23741007194244604, "loss": 0.783922553062439, "loss_ce": 0.03868328034877777, "loss_xval": 0.74609375, "num_input_tokens_seen": 102649976, "step": 594 }, { "epoch": 0.2378097521982414, "grad_norm": 37.963372753565885, "learning_rate": 5e-06, "loss": 0.8382, "num_input_tokens_seen": 102822584, "step": 595 }, { "epoch": 0.2378097521982414, "loss": 0.7244482040405273, "loss_ce": 0.03298095613718033, "loss_xval": 0.69140625, "num_input_tokens_seen": 102822584, "step": 595 }, { "epoch": 0.23820943245403678, "grad_norm": 169.1225094579379, "learning_rate": 5e-06, "loss": 1.0284, "num_input_tokens_seen": 102995352, "step": 596 }, { "epoch": 0.23820943245403678, "loss": 1.4873781204223633, "loss_ce": 0.03516869992017746, "loss_xval": 1.453125, "num_input_tokens_seen": 102995352, "step": 596 }, { "epoch": 0.23860911270983212, "grad_norm": 56.84926795707111, "learning_rate": 5e-06, "loss": 0.8466, "num_input_tokens_seen": 103167920, "step": 597 }, { "epoch": 0.23860911270983212, "loss": 0.8535851240158081, "loss_ce": 0.0333947092294693, "loss_xval": 0.8203125, "num_input_tokens_seen": 103167920, "step": 597 }, { "epoch": 0.2390087929656275, "grad_norm": 245.5535091122652, "learning_rate": 5e-06, "loss": 0.8312, "num_input_tokens_seen": 103340488, "step": 598 }, { "epoch": 0.2390087929656275, "loss": 1.069108009338379, "loss_ce": 0.027115818113088608, "loss_xval": 1.0390625, "num_input_tokens_seen": 103340488, "step": 598 }, { "epoch": 0.23940847322142286, "grad_norm": 158.88384407024452, "learning_rate": 5e-06, "loss": 0.5671, "num_input_tokens_seen": 103513496, "step": 599 }, { "epoch": 0.23940847322142286, "loss": 0.4917399287223816, "loss_ce": 0.042154960334300995, "loss_xval": 0.44921875, "num_input_tokens_seen": 103513496, "step": 599 }, { "epoch": 0.23980815347721823, "grad_norm": 215.62794739008515, "learning_rate": 5e-06, "loss": 0.8888, "num_input_tokens_seen": 103686592, "step": 600 }, { "epoch": 0.23980815347721823, "loss": 0.6870980262756348, "loss_ce": 0.024866603314876556, "loss_xval": 0.6640625, "num_input_tokens_seen": 103686592, "step": 600 }, { "epoch": 0.24020783373301358, "grad_norm": 233.7424673976639, "learning_rate": 5e-06, "loss": 1.2323, "num_input_tokens_seen": 103859464, "step": 601 }, { "epoch": 0.24020783373301358, "loss": 1.235499382019043, "loss_ce": 0.024195652455091476, "loss_xval": 1.2109375, "num_input_tokens_seen": 103859464, "step": 601 }, { "epoch": 0.24060751398880895, "grad_norm": 191.42755343958055, "learning_rate": 5e-06, "loss": 0.907, "num_input_tokens_seen": 104032384, "step": 602 }, { "epoch": 0.24060751398880895, "loss": 0.9421808123588562, "loss_ce": 0.03629700094461441, "loss_xval": 0.90625, "num_input_tokens_seen": 104032384, "step": 602 }, { "epoch": 0.24100719424460432, "grad_norm": 308.9412242751799, "learning_rate": 5e-06, "loss": 1.2389, "num_input_tokens_seen": 104205312, "step": 603 }, { "epoch": 0.24100719424460432, "loss": 1.3211491107940674, "loss_ce": 0.02378581464290619, "loss_xval": 1.296875, "num_input_tokens_seen": 104205312, "step": 603 }, { "epoch": 0.2414068745003997, "grad_norm": 89.11609070631093, "learning_rate": 5e-06, "loss": 0.731, "num_input_tokens_seen": 104377936, "step": 604 }, { "epoch": 0.2414068745003997, "loss": 0.8285616040229797, "loss_ce": 0.0302217286080122, "loss_xval": 0.796875, "num_input_tokens_seen": 104377936, "step": 604 }, { "epoch": 0.24180655475619503, "grad_norm": 279.00947911880144, "learning_rate": 5e-06, "loss": 1.1599, "num_input_tokens_seen": 104550768, "step": 605 }, { "epoch": 0.24180655475619503, "loss": 1.0528864860534668, "loss_ce": 0.024688273668289185, "loss_xval": 1.03125, "num_input_tokens_seen": 104550768, "step": 605 }, { "epoch": 0.2422062350119904, "grad_norm": 80.1187354122152, "learning_rate": 5e-06, "loss": 0.8784, "num_input_tokens_seen": 104723680, "step": 606 }, { "epoch": 0.2422062350119904, "loss": 0.8573121428489685, "loss_ce": 0.02589123696088791, "loss_xval": 0.83203125, "num_input_tokens_seen": 104723680, "step": 606 }, { "epoch": 0.24260591526778577, "grad_norm": 255.05532627961566, "learning_rate": 5e-06, "loss": 1.1517, "num_input_tokens_seen": 104896272, "step": 607 }, { "epoch": 0.24260591526778577, "loss": 1.3371992111206055, "loss_ce": 0.023966766893863678, "loss_xval": 1.3125, "num_input_tokens_seen": 104896272, "step": 607 }, { "epoch": 0.24300559552358114, "grad_norm": 172.50234084378377, "learning_rate": 5e-06, "loss": 1.2108, "num_input_tokens_seen": 105063968, "step": 608 }, { "epoch": 0.24300559552358114, "loss": 1.5010284185409546, "loss_ce": 0.023855600506067276, "loss_xval": 1.4765625, "num_input_tokens_seen": 105063968, "step": 608 }, { "epoch": 0.2434052757793765, "grad_norm": 392.0650101886134, "learning_rate": 5e-06, "loss": 1.335, "num_input_tokens_seen": 105237008, "step": 609 }, { "epoch": 0.2434052757793765, "loss": 1.3141515254974365, "loss_ce": 0.026797983795404434, "loss_xval": 1.2890625, "num_input_tokens_seen": 105237008, "step": 609 }, { "epoch": 0.24380495603517185, "grad_norm": 61.45983002680283, "learning_rate": 5e-06, "loss": 0.8797, "num_input_tokens_seen": 105409768, "step": 610 }, { "epoch": 0.24380495603517185, "loss": 1.0218791961669922, "loss_ce": 0.027372296899557114, "loss_xval": 0.99609375, "num_input_tokens_seen": 105409768, "step": 610 }, { "epoch": 0.24420463629096723, "grad_norm": 301.269151881736, "learning_rate": 5e-06, "loss": 0.9493, "num_input_tokens_seen": 105582552, "step": 611 }, { "epoch": 0.24420463629096723, "loss": 1.1783733367919922, "loss_ce": 0.026517830789089203, "loss_xval": 1.1484375, "num_input_tokens_seen": 105582552, "step": 611 }, { "epoch": 0.2446043165467626, "grad_norm": 111.22066209378004, "learning_rate": 5e-06, "loss": 1.1837, "num_input_tokens_seen": 105755840, "step": 612 }, { "epoch": 0.2446043165467626, "loss": 1.2885253429412842, "loss_ce": 0.032177697867155075, "loss_xval": 1.2578125, "num_input_tokens_seen": 105755840, "step": 612 }, { "epoch": 0.24500399680255797, "grad_norm": 212.8848788943127, "learning_rate": 5e-06, "loss": 0.9066, "num_input_tokens_seen": 105928728, "step": 613 }, { "epoch": 0.24500399680255797, "loss": 1.120781660079956, "loss_ce": 0.024956412613391876, "loss_xval": 1.09375, "num_input_tokens_seen": 105928728, "step": 613 }, { "epoch": 0.2454036770583533, "grad_norm": 103.49456362295976, "learning_rate": 5e-06, "loss": 0.8015, "num_input_tokens_seen": 106101936, "step": 614 }, { "epoch": 0.2454036770583533, "loss": 0.8075883984565735, "loss_ce": 0.026216331869363785, "loss_xval": 0.78125, "num_input_tokens_seen": 106101936, "step": 614 }, { "epoch": 0.24580335731414868, "grad_norm": 248.325610563766, "learning_rate": 5e-06, "loss": 1.5101, "num_input_tokens_seen": 106275008, "step": 615 }, { "epoch": 0.24580335731414868, "loss": 1.5115883350372314, "loss_ce": 0.02550426870584488, "loss_xval": 1.484375, "num_input_tokens_seen": 106275008, "step": 615 }, { "epoch": 0.24620303756994405, "grad_norm": 167.96429221700566, "learning_rate": 5e-06, "loss": 1.2687, "num_input_tokens_seen": 106447848, "step": 616 }, { "epoch": 0.24620303756994405, "loss": 1.5167852640151978, "loss_ce": 0.03192197158932686, "loss_xval": 1.484375, "num_input_tokens_seen": 106447848, "step": 616 }, { "epoch": 0.24660271782573942, "grad_norm": 298.4988658186804, "learning_rate": 5e-06, "loss": 1.4475, "num_input_tokens_seen": 106620640, "step": 617 }, { "epoch": 0.24660271782573942, "loss": 1.7282776832580566, "loss_ce": 0.03162240982055664, "loss_xval": 1.6953125, "num_input_tokens_seen": 106620640, "step": 617 }, { "epoch": 0.24700239808153476, "grad_norm": 219.23244811627757, "learning_rate": 5e-06, "loss": 1.3819, "num_input_tokens_seen": 106793448, "step": 618 }, { "epoch": 0.24700239808153476, "loss": 0.7871071696281433, "loss_ce": 0.026853220537304878, "loss_xval": 0.76171875, "num_input_tokens_seen": 106793448, "step": 618 }, { "epoch": 0.24740207833733013, "grad_norm": 303.82496611640624, "learning_rate": 5e-06, "loss": 1.0851, "num_input_tokens_seen": 106966416, "step": 619 }, { "epoch": 0.24740207833733013, "loss": 1.2948808670043945, "loss_ce": 0.02705862559378147, "loss_xval": 1.265625, "num_input_tokens_seen": 106966416, "step": 619 }, { "epoch": 0.2478017585931255, "grad_norm": 169.45479385596363, "learning_rate": 5e-06, "loss": 1.7907, "num_input_tokens_seen": 107139480, "step": 620 }, { "epoch": 0.2478017585931255, "loss": 1.1137454509735107, "loss_ce": 0.03098176047205925, "loss_xval": 1.0859375, "num_input_tokens_seen": 107139480, "step": 620 }, { "epoch": 0.24820143884892087, "grad_norm": 280.20960363263634, "learning_rate": 5e-06, "loss": 1.0557, "num_input_tokens_seen": 107312464, "step": 621 }, { "epoch": 0.24820143884892087, "loss": 1.1125739812850952, "loss_ce": 0.031031014397740364, "loss_xval": 1.078125, "num_input_tokens_seen": 107312464, "step": 621 }, { "epoch": 0.24860111910471622, "grad_norm": 52.29310753242722, "learning_rate": 5e-06, "loss": 0.8947, "num_input_tokens_seen": 107485336, "step": 622 }, { "epoch": 0.24860111910471622, "loss": 0.861524760723114, "loss_ce": 0.03401007875800133, "loss_xval": 0.828125, "num_input_tokens_seen": 107485336, "step": 622 }, { "epoch": 0.2490007993605116, "grad_norm": 326.1027649366469, "learning_rate": 5e-06, "loss": 1.1985, "num_input_tokens_seen": 107658200, "step": 623 }, { "epoch": 0.2490007993605116, "loss": 1.141750693321228, "loss_ce": 0.031765300780534744, "loss_xval": 1.109375, "num_input_tokens_seen": 107658200, "step": 623 }, { "epoch": 0.24940047961630696, "grad_norm": 38.95977192063135, "learning_rate": 5e-06, "loss": 0.7603, "num_input_tokens_seen": 107831376, "step": 624 }, { "epoch": 0.24940047961630696, "loss": 0.727970540523529, "loss_ce": 0.02875177562236786, "loss_xval": 0.69921875, "num_input_tokens_seen": 107831376, "step": 624 }, { "epoch": 0.24980015987210233, "grad_norm": 252.07183773955228, "learning_rate": 5e-06, "loss": 1.5353, "num_input_tokens_seen": 108004032, "step": 625 }, { "epoch": 0.24980015987210233, "loss": 1.5708106756210327, "loss_ce": 0.04151376336812973, "loss_xval": 1.53125, "num_input_tokens_seen": 108004032, "step": 625 }, { "epoch": 0.2501998401278977, "grad_norm": 123.5509465218223, "learning_rate": 5e-06, "loss": 0.8309, "num_input_tokens_seen": 108176984, "step": 626 }, { "epoch": 0.2501998401278977, "loss": 0.5989866256713867, "loss_ce": 0.031481776386499405, "loss_xval": 0.56640625, "num_input_tokens_seen": 108176984, "step": 626 }, { "epoch": 0.25059952038369304, "grad_norm": 318.6509948375151, "learning_rate": 5e-06, "loss": 1.5763, "num_input_tokens_seen": 108350144, "step": 627 }, { "epoch": 0.25059952038369304, "loss": 2.1137542724609375, "loss_ce": 0.03538517281413078, "loss_xval": 2.078125, "num_input_tokens_seen": 108350144, "step": 627 }, { "epoch": 0.2509992006394884, "grad_norm": 61.521784410197895, "learning_rate": 5e-06, "loss": 0.9835, "num_input_tokens_seen": 108523376, "step": 628 }, { "epoch": 0.2509992006394884, "loss": 0.8479458093643188, "loss_ce": 0.03227199614048004, "loss_xval": 0.81640625, "num_input_tokens_seen": 108523376, "step": 628 }, { "epoch": 0.2513988808952838, "grad_norm": 126.59752275575777, "learning_rate": 5e-06, "loss": 1.0511, "num_input_tokens_seen": 108696208, "step": 629 }, { "epoch": 0.2513988808952838, "loss": 1.2120076417922974, "loss_ce": 0.02963467314839363, "loss_xval": 1.1796875, "num_input_tokens_seen": 108696208, "step": 629 }, { "epoch": 0.2517985611510791, "grad_norm": 83.31660909579162, "learning_rate": 5e-06, "loss": 0.9889, "num_input_tokens_seen": 108869176, "step": 630 }, { "epoch": 0.2517985611510791, "loss": 1.004129409790039, "loss_ce": 0.03550145775079727, "loss_xval": 0.96875, "num_input_tokens_seen": 108869176, "step": 630 }, { "epoch": 0.2521982414068745, "grad_norm": 114.49495819012182, "learning_rate": 5e-06, "loss": 0.8974, "num_input_tokens_seen": 109042280, "step": 631 }, { "epoch": 0.2521982414068745, "loss": 1.1485271453857422, "loss_ce": 0.0317058339715004, "loss_xval": 1.1171875, "num_input_tokens_seen": 109042280, "step": 631 }, { "epoch": 0.25259792166266987, "grad_norm": 158.50373331009686, "learning_rate": 5e-06, "loss": 0.9505, "num_input_tokens_seen": 109215320, "step": 632 }, { "epoch": 0.25259792166266987, "loss": 0.7758920192718506, "loss_ce": 0.022229932248592377, "loss_xval": 0.75390625, "num_input_tokens_seen": 109215320, "step": 632 }, { "epoch": 0.2529976019184652, "grad_norm": 124.3150224300894, "learning_rate": 5e-06, "loss": 1.041, "num_input_tokens_seen": 109388376, "step": 633 }, { "epoch": 0.2529976019184652, "loss": 0.7999504804611206, "loss_ce": 0.022484708577394485, "loss_xval": 0.77734375, "num_input_tokens_seen": 109388376, "step": 633 }, { "epoch": 0.2533972821742606, "grad_norm": 153.8473346571185, "learning_rate": 5e-06, "loss": 1.0814, "num_input_tokens_seen": 109561424, "step": 634 }, { "epoch": 0.2533972821742606, "loss": 0.7413469552993774, "loss_ce": 0.02229173481464386, "loss_xval": 0.71875, "num_input_tokens_seen": 109561424, "step": 634 }, { "epoch": 0.25379696243005595, "grad_norm": 123.4116823206711, "learning_rate": 5e-06, "loss": 0.6317, "num_input_tokens_seen": 109734584, "step": 635 }, { "epoch": 0.25379696243005595, "loss": 0.49806663393974304, "loss_ce": 0.024433817714452744, "loss_xval": 0.47265625, "num_input_tokens_seen": 109734584, "step": 635 }, { "epoch": 0.2541966426858513, "grad_norm": 155.95322284214282, "learning_rate": 5e-06, "loss": 0.9948, "num_input_tokens_seen": 109907808, "step": 636 }, { "epoch": 0.2541966426858513, "loss": 0.8807600736618042, "loss_ce": 0.022056490182876587, "loss_xval": 0.859375, "num_input_tokens_seen": 109907808, "step": 636 }, { "epoch": 0.2545963229416467, "grad_norm": 36.34285927695085, "learning_rate": 5e-06, "loss": 0.7191, "num_input_tokens_seen": 110080336, "step": 637 }, { "epoch": 0.2545963229416467, "loss": 0.7213298678398132, "loss_ce": 0.021866969764232635, "loss_xval": 0.69921875, "num_input_tokens_seen": 110080336, "step": 637 }, { "epoch": 0.25499600319744203, "grad_norm": 40.14733221133789, "learning_rate": 5e-06, "loss": 0.5536, "num_input_tokens_seen": 110253400, "step": 638 }, { "epoch": 0.25499600319744203, "loss": 0.7994478940963745, "loss_ce": 0.017709653824567795, "loss_xval": 0.78125, "num_input_tokens_seen": 110253400, "step": 638 }, { "epoch": 0.25539568345323743, "grad_norm": 123.57240267301728, "learning_rate": 5e-06, "loss": 0.9722, "num_input_tokens_seen": 110426568, "step": 639 }, { "epoch": 0.25539568345323743, "loss": 0.8483471274375916, "loss_ce": 0.014118612743914127, "loss_xval": 0.8359375, "num_input_tokens_seen": 110426568, "step": 639 }, { "epoch": 0.2557953637090328, "grad_norm": 75.02054811289982, "learning_rate": 5e-06, "loss": 0.7891, "num_input_tokens_seen": 110599520, "step": 640 }, { "epoch": 0.2557953637090328, "loss": 0.7000423669815063, "loss_ce": 0.023406604304909706, "loss_xval": 0.67578125, "num_input_tokens_seen": 110599520, "step": 640 }, { "epoch": 0.2561950439648281, "grad_norm": 91.74212057215723, "learning_rate": 5e-06, "loss": 1.2231, "num_input_tokens_seen": 110772552, "step": 641 }, { "epoch": 0.2561950439648281, "loss": 1.4409394264221191, "loss_ce": 0.014792068861424923, "loss_xval": 1.4296875, "num_input_tokens_seen": 110772552, "step": 641 }, { "epoch": 0.2565947242206235, "grad_norm": 157.31170807411007, "learning_rate": 5e-06, "loss": 0.8605, "num_input_tokens_seen": 110945136, "step": 642 }, { "epoch": 0.2565947242206235, "loss": 0.8512501120567322, "loss_ce": 0.015312610194087029, "loss_xval": 0.8359375, "num_input_tokens_seen": 110945136, "step": 642 }, { "epoch": 0.25699440447641886, "grad_norm": 58.06363263841326, "learning_rate": 5e-06, "loss": 1.0306, "num_input_tokens_seen": 111118304, "step": 643 }, { "epoch": 0.25699440447641886, "loss": 1.2408883571624756, "loss_ce": 0.014325831085443497, "loss_xval": 1.2265625, "num_input_tokens_seen": 111118304, "step": 643 }, { "epoch": 0.2573940847322142, "grad_norm": 83.05291570147797, "learning_rate": 5e-06, "loss": 0.8975, "num_input_tokens_seen": 111291296, "step": 644 }, { "epoch": 0.2573940847322142, "loss": 0.860167920589447, "loss_ce": 0.012267546728253365, "loss_xval": 0.84765625, "num_input_tokens_seen": 111291296, "step": 644 }, { "epoch": 0.2577937649880096, "grad_norm": 106.75725756616248, "learning_rate": 5e-06, "loss": 0.7986, "num_input_tokens_seen": 111464416, "step": 645 }, { "epoch": 0.2577937649880096, "loss": 0.8707510828971863, "loss_ce": 0.014671968296170235, "loss_xval": 0.85546875, "num_input_tokens_seen": 111464416, "step": 645 }, { "epoch": 0.25819344524380494, "grad_norm": 65.34641583085333, "learning_rate": 5e-06, "loss": 0.892, "num_input_tokens_seen": 111637288, "step": 646 }, { "epoch": 0.25819344524380494, "loss": 0.8701699376106262, "loss_ce": 0.022757841274142265, "loss_xval": 0.84765625, "num_input_tokens_seen": 111637288, "step": 646 }, { "epoch": 0.25859312549960034, "grad_norm": 69.58421071925531, "learning_rate": 5e-06, "loss": 0.7172, "num_input_tokens_seen": 111810496, "step": 647 }, { "epoch": 0.25859312549960034, "loss": 0.6108307242393494, "loss_ce": 0.0166535172611475, "loss_xval": 0.59375, "num_input_tokens_seen": 111810496, "step": 647 }, { "epoch": 0.2589928057553957, "grad_norm": 60.44211136131847, "learning_rate": 5e-06, "loss": 0.8919, "num_input_tokens_seen": 111983536, "step": 648 }, { "epoch": 0.2589928057553957, "loss": 1.001638412475586, "loss_ce": 0.021535882726311684, "loss_xval": 0.98046875, "num_input_tokens_seen": 111983536, "step": 648 }, { "epoch": 0.259392486011191, "grad_norm": 37.92595630946876, "learning_rate": 5e-06, "loss": 1.3082, "num_input_tokens_seen": 112156504, "step": 649 }, { "epoch": 0.259392486011191, "loss": 1.5613832473754883, "loss_ce": 0.020733918994665146, "loss_xval": 1.5390625, "num_input_tokens_seen": 112156504, "step": 649 }, { "epoch": 0.2597921662669864, "grad_norm": 28.670635237631853, "learning_rate": 5e-06, "loss": 1.019, "num_input_tokens_seen": 112329664, "step": 650 }, { "epoch": 0.2597921662669864, "loss": 1.3567280769348145, "loss_ce": 0.021767208352684975, "loss_xval": 1.3359375, "num_input_tokens_seen": 112329664, "step": 650 }, { "epoch": 0.26019184652278177, "grad_norm": 62.41074023562404, "learning_rate": 5e-06, "loss": 0.9244, "num_input_tokens_seen": 112502960, "step": 651 }, { "epoch": 0.26019184652278177, "loss": 0.9109457731246948, "loss_ce": 0.013728970661759377, "loss_xval": 0.8984375, "num_input_tokens_seen": 112502960, "step": 651 }, { "epoch": 0.26059152677857716, "grad_norm": 45.00813129699106, "learning_rate": 5e-06, "loss": 0.8637, "num_input_tokens_seen": 112675584, "step": 652 }, { "epoch": 0.26059152677857716, "loss": 0.6911357045173645, "loss_ce": 0.02640179917216301, "loss_xval": 0.6640625, "num_input_tokens_seen": 112675584, "step": 652 }, { "epoch": 0.2609912070343725, "grad_norm": 136.9051208047938, "learning_rate": 5e-06, "loss": 0.947, "num_input_tokens_seen": 112848712, "step": 653 }, { "epoch": 0.2609912070343725, "loss": 1.220213770866394, "loss_ce": 0.015074612572789192, "loss_xval": 1.203125, "num_input_tokens_seen": 112848712, "step": 653 }, { "epoch": 0.26139088729016785, "grad_norm": 191.51181392428464, "learning_rate": 5e-06, "loss": 1.027, "num_input_tokens_seen": 113021672, "step": 654 }, { "epoch": 0.26139088729016785, "loss": 0.8155316114425659, "loss_ce": 0.02170836180448532, "loss_xval": 0.79296875, "num_input_tokens_seen": 113021672, "step": 654 }, { "epoch": 0.26179056754596325, "grad_norm": 59.47364063857337, "learning_rate": 5e-06, "loss": 0.9311, "num_input_tokens_seen": 113194640, "step": 655 }, { "epoch": 0.26179056754596325, "loss": 0.7021055221557617, "loss_ce": 0.0113096684217453, "loss_xval": 0.69140625, "num_input_tokens_seen": 113194640, "step": 655 }, { "epoch": 0.2621902478017586, "grad_norm": 299.2925607489183, "learning_rate": 5e-06, "loss": 1.1994, "num_input_tokens_seen": 113367520, "step": 656 }, { "epoch": 0.2621902478017586, "loss": 1.1251329183578491, "loss_ce": 0.012095760554075241, "loss_xval": 1.109375, "num_input_tokens_seen": 113367520, "step": 656 }, { "epoch": 0.26258992805755393, "grad_norm": 152.9578503535034, "learning_rate": 5e-06, "loss": 0.6672, "num_input_tokens_seen": 113540536, "step": 657 }, { "epoch": 0.26258992805755393, "loss": 0.8924187421798706, "loss_ce": 0.01619800738990307, "loss_xval": 0.875, "num_input_tokens_seen": 113540536, "step": 657 }, { "epoch": 0.26298960831334933, "grad_norm": 174.0581314265997, "learning_rate": 5e-06, "loss": 1.0259, "num_input_tokens_seen": 113713200, "step": 658 }, { "epoch": 0.26298960831334933, "loss": 1.100234031677246, "loss_ce": 0.011733030900359154, "loss_xval": 1.0859375, "num_input_tokens_seen": 113713200, "step": 658 }, { "epoch": 0.2633892885691447, "grad_norm": 84.02652993262616, "learning_rate": 5e-06, "loss": 0.7989, "num_input_tokens_seen": 113886064, "step": 659 }, { "epoch": 0.2633892885691447, "loss": 0.5299695730209351, "loss_ce": 0.014466674998402596, "loss_xval": 0.515625, "num_input_tokens_seen": 113886064, "step": 659 }, { "epoch": 0.2637889688249401, "grad_norm": 236.5341573563948, "learning_rate": 5e-06, "loss": 1.0824, "num_input_tokens_seen": 114058936, "step": 660 }, { "epoch": 0.2637889688249401, "loss": 0.8847863674163818, "loss_ce": 0.013448446989059448, "loss_xval": 0.87109375, "num_input_tokens_seen": 114058936, "step": 660 }, { "epoch": 0.2641886490807354, "grad_norm": 169.64090644183318, "learning_rate": 5e-06, "loss": 1.0582, "num_input_tokens_seen": 114231344, "step": 661 }, { "epoch": 0.2641886490807354, "loss": 1.3163893222808838, "loss_ce": 0.01853768527507782, "loss_xval": 1.296875, "num_input_tokens_seen": 114231344, "step": 661 }, { "epoch": 0.26458832933653076, "grad_norm": 175.88382335589282, "learning_rate": 5e-06, "loss": 0.6119, "num_input_tokens_seen": 114404496, "step": 662 }, { "epoch": 0.26458832933653076, "loss": 0.6016393899917603, "loss_ce": 0.019119868054986, "loss_xval": 0.58203125, "num_input_tokens_seen": 114404496, "step": 662 }, { "epoch": 0.26498800959232616, "grad_norm": 195.8103408122979, "learning_rate": 5e-06, "loss": 1.1647, "num_input_tokens_seen": 114577520, "step": 663 }, { "epoch": 0.26498800959232616, "loss": 0.993999719619751, "loss_ce": 0.018291711807250977, "loss_xval": 0.9765625, "num_input_tokens_seen": 114577520, "step": 663 }, { "epoch": 0.2653876898481215, "grad_norm": 138.23767892074832, "learning_rate": 5e-06, "loss": 1.0253, "num_input_tokens_seen": 114750672, "step": 664 }, { "epoch": 0.2653876898481215, "loss": 1.1113959550857544, "loss_ce": 0.018378403037786484, "loss_xval": 1.09375, "num_input_tokens_seen": 114750672, "step": 664 }, { "epoch": 0.26578737010391684, "grad_norm": 190.91695554448776, "learning_rate": 5e-06, "loss": 1.0631, "num_input_tokens_seen": 114923496, "step": 665 }, { "epoch": 0.26578737010391684, "loss": 0.6147146821022034, "loss_ce": 0.024016443639993668, "loss_xval": 0.58984375, "num_input_tokens_seen": 114923496, "step": 665 }, { "epoch": 0.26618705035971224, "grad_norm": 102.45689102808429, "learning_rate": 5e-06, "loss": 0.8536, "num_input_tokens_seen": 115096112, "step": 666 }, { "epoch": 0.26618705035971224, "loss": 0.6601771116256714, "loss_ce": 0.0206507109105587, "loss_xval": 0.640625, "num_input_tokens_seen": 115096112, "step": 666 }, { "epoch": 0.2665867306155076, "grad_norm": 175.7561493345075, "learning_rate": 5e-06, "loss": 0.9328, "num_input_tokens_seen": 115269536, "step": 667 }, { "epoch": 0.2665867306155076, "loss": 1.2699246406555176, "loss_ce": 0.029201963916420937, "loss_xval": 1.2421875, "num_input_tokens_seen": 115269536, "step": 667 }, { "epoch": 0.266986410871303, "grad_norm": 75.22857527978371, "learning_rate": 5e-06, "loss": 0.661, "num_input_tokens_seen": 115442560, "step": 668 }, { "epoch": 0.266986410871303, "loss": 0.6588489413261414, "loss_ce": 0.030308909714221954, "loss_xval": 0.62890625, "num_input_tokens_seen": 115442560, "step": 668 }, { "epoch": 0.2673860911270983, "grad_norm": 146.67725635888027, "learning_rate": 5e-06, "loss": 0.968, "num_input_tokens_seen": 115615360, "step": 669 }, { "epoch": 0.2673860911270983, "loss": 1.12626314163208, "loss_ce": 0.026165474206209183, "loss_xval": 1.1015625, "num_input_tokens_seen": 115615360, "step": 669 }, { "epoch": 0.26778577138289367, "grad_norm": 133.73687162857127, "learning_rate": 5e-06, "loss": 0.8349, "num_input_tokens_seen": 115788160, "step": 670 }, { "epoch": 0.26778577138289367, "loss": 1.1909589767456055, "loss_ce": 0.02347848378121853, "loss_xval": 1.1640625, "num_input_tokens_seen": 115788160, "step": 670 }, { "epoch": 0.26818545163868907, "grad_norm": 270.0264431031233, "learning_rate": 5e-06, "loss": 1.0488, "num_input_tokens_seen": 115960856, "step": 671 }, { "epoch": 0.26818545163868907, "loss": 0.9442439675331116, "loss_ce": 0.03384360671043396, "loss_xval": 0.91015625, "num_input_tokens_seen": 115960856, "step": 671 }, { "epoch": 0.2685851318944844, "grad_norm": 118.67248768897453, "learning_rate": 5e-06, "loss": 0.9384, "num_input_tokens_seen": 116133480, "step": 672 }, { "epoch": 0.2685851318944844, "loss": 0.7503706216812134, "loss_ce": 0.023441843688488007, "loss_xval": 0.7265625, "num_input_tokens_seen": 116133480, "step": 672 }, { "epoch": 0.26898481215027975, "grad_norm": 410.22210699900944, "learning_rate": 5e-06, "loss": 0.984, "num_input_tokens_seen": 116306376, "step": 673 }, { "epoch": 0.26898481215027975, "loss": 1.0442287921905518, "loss_ce": 0.017373330891132355, "loss_xval": 1.0234375, "num_input_tokens_seen": 116306376, "step": 673 }, { "epoch": 0.26938449240607515, "grad_norm": 49.025436414803835, "learning_rate": 5e-06, "loss": 1.0499, "num_input_tokens_seen": 116479536, "step": 674 }, { "epoch": 0.26938449240607515, "loss": 1.7757625579833984, "loss_ce": 0.01941489428281784, "loss_xval": 1.7578125, "num_input_tokens_seen": 116479536, "step": 674 }, { "epoch": 0.2697841726618705, "grad_norm": 413.2558380877526, "learning_rate": 5e-06, "loss": 0.8974, "num_input_tokens_seen": 116652840, "step": 675 }, { "epoch": 0.2697841726618705, "loss": 1.0395095348358154, "loss_ce": 0.024128668010234833, "loss_xval": 1.015625, "num_input_tokens_seen": 116652840, "step": 675 }, { "epoch": 0.2701838529176659, "grad_norm": 121.24926053527112, "learning_rate": 5e-06, "loss": 1.2369, "num_input_tokens_seen": 116825496, "step": 676 }, { "epoch": 0.2701838529176659, "loss": 1.2461820840835571, "loss_ce": 0.0210844948887825, "loss_xval": 1.2265625, "num_input_tokens_seen": 116825496, "step": 676 }, { "epoch": 0.27058353317346123, "grad_norm": 268.47864300466, "learning_rate": 5e-06, "loss": 0.9889, "num_input_tokens_seen": 116998824, "step": 677 }, { "epoch": 0.27058353317346123, "loss": 0.8895606994628906, "loss_ce": 0.021884921938180923, "loss_xval": 0.8671875, "num_input_tokens_seen": 116998824, "step": 677 }, { "epoch": 0.2709832134292566, "grad_norm": 64.88427270509409, "learning_rate": 5e-06, "loss": 0.6857, "num_input_tokens_seen": 117171936, "step": 678 }, { "epoch": 0.2709832134292566, "loss": 0.778445839881897, "loss_ce": 0.022342335432767868, "loss_xval": 0.7578125, "num_input_tokens_seen": 117171936, "step": 678 }, { "epoch": 0.271382893685052, "grad_norm": 166.62187626988478, "learning_rate": 5e-06, "loss": 0.6886, "num_input_tokens_seen": 117344920, "step": 679 }, { "epoch": 0.271382893685052, "loss": 0.4085671603679657, "loss_ce": 0.023191187530755997, "loss_xval": 0.384765625, "num_input_tokens_seen": 117344920, "step": 679 }, { "epoch": 0.2717825739408473, "grad_norm": 52.26940993035468, "learning_rate": 5e-06, "loss": 1.0123, "num_input_tokens_seen": 117517976, "step": 680 }, { "epoch": 0.2717825739408473, "loss": 1.3288424015045166, "loss_ce": 0.02269016206264496, "loss_xval": 1.3046875, "num_input_tokens_seen": 117517976, "step": 680 }, { "epoch": 0.27218225419664266, "grad_norm": 142.4492173509578, "learning_rate": 5e-06, "loss": 0.7816, "num_input_tokens_seen": 117690720, "step": 681 }, { "epoch": 0.27218225419664266, "loss": 0.7112863063812256, "loss_ce": 0.026471804827451706, "loss_xval": 0.68359375, "num_input_tokens_seen": 117690720, "step": 681 }, { "epoch": 0.27258193445243806, "grad_norm": 70.93340773235381, "learning_rate": 5e-06, "loss": 0.8648, "num_input_tokens_seen": 117863704, "step": 682 }, { "epoch": 0.27258193445243806, "loss": 1.0744261741638184, "loss_ce": 0.02364499494433403, "loss_xval": 1.046875, "num_input_tokens_seen": 117863704, "step": 682 }, { "epoch": 0.2729816147082334, "grad_norm": 65.75630002146995, "learning_rate": 5e-06, "loss": 0.8312, "num_input_tokens_seen": 118036656, "step": 683 }, { "epoch": 0.2729816147082334, "loss": 0.5977785587310791, "loss_ce": 0.023987047374248505, "loss_xval": 0.57421875, "num_input_tokens_seen": 118036656, "step": 683 }, { "epoch": 0.2733812949640288, "grad_norm": 54.55328598333207, "learning_rate": 5e-06, "loss": 0.5361, "num_input_tokens_seen": 118209616, "step": 684 }, { "epoch": 0.2733812949640288, "loss": 0.4402380585670471, "loss_ce": 0.01866826042532921, "loss_xval": 0.421875, "num_input_tokens_seen": 118209616, "step": 684 }, { "epoch": 0.27378097521982414, "grad_norm": 72.43434827664534, "learning_rate": 5e-06, "loss": 0.8812, "num_input_tokens_seen": 118382624, "step": 685 }, { "epoch": 0.27378097521982414, "loss": 0.9849303364753723, "loss_ce": 0.023275673389434814, "loss_xval": 0.9609375, "num_input_tokens_seen": 118382624, "step": 685 }, { "epoch": 0.2741806554756195, "grad_norm": 65.98449745999218, "learning_rate": 5e-06, "loss": 0.9268, "num_input_tokens_seen": 118555344, "step": 686 }, { "epoch": 0.2741806554756195, "loss": 1.1211669445037842, "loss_ce": 0.021313386037945747, "loss_xval": 1.1015625, "num_input_tokens_seen": 118555344, "step": 686 }, { "epoch": 0.2745803357314149, "grad_norm": 63.9892841046574, "learning_rate": 5e-06, "loss": 0.8567, "num_input_tokens_seen": 118728288, "step": 687 }, { "epoch": 0.2745803357314149, "loss": 0.9254956245422363, "loss_ce": 0.020466340705752373, "loss_xval": 0.90625, "num_input_tokens_seen": 118728288, "step": 687 }, { "epoch": 0.2749800159872102, "grad_norm": 56.42397672481398, "learning_rate": 5e-06, "loss": 0.9393, "num_input_tokens_seen": 118900712, "step": 688 }, { "epoch": 0.2749800159872102, "loss": 0.8299415111541748, "loss_ce": 0.01860113814473152, "loss_xval": 0.8125, "num_input_tokens_seen": 118900712, "step": 688 }, { "epoch": 0.2753796962430056, "grad_norm": 91.65977622605617, "learning_rate": 5e-06, "loss": 0.8144, "num_input_tokens_seen": 119073720, "step": 689 }, { "epoch": 0.2753796962430056, "loss": 0.9090508818626404, "loss_ce": 0.016350697726011276, "loss_xval": 0.89453125, "num_input_tokens_seen": 119073720, "step": 689 }, { "epoch": 0.27577937649880097, "grad_norm": 39.025883096673645, "learning_rate": 5e-06, "loss": 0.6471, "num_input_tokens_seen": 119246792, "step": 690 }, { "epoch": 0.27577937649880097, "loss": 0.6969112157821655, "loss_ce": 0.014599241316318512, "loss_xval": 0.68359375, "num_input_tokens_seen": 119246792, "step": 690 }, { "epoch": 0.2761790567545963, "grad_norm": 36.73841496397389, "learning_rate": 5e-06, "loss": 0.6317, "num_input_tokens_seen": 119419560, "step": 691 }, { "epoch": 0.2761790567545963, "loss": 0.9147318005561829, "loss_ce": 0.012082915753126144, "loss_xval": 0.90234375, "num_input_tokens_seen": 119419560, "step": 691 }, { "epoch": 0.2765787370103917, "grad_norm": 168.6230690419483, "learning_rate": 5e-06, "loss": 0.7357, "num_input_tokens_seen": 119592824, "step": 692 }, { "epoch": 0.2765787370103917, "loss": 0.6877519488334656, "loss_ce": 0.014412128366529942, "loss_xval": 0.671875, "num_input_tokens_seen": 119592824, "step": 692 }, { "epoch": 0.27697841726618705, "grad_norm": 52.75849147183071, "learning_rate": 5e-06, "loss": 0.7668, "num_input_tokens_seen": 119762056, "step": 693 }, { "epoch": 0.27697841726618705, "loss": 0.33476799726486206, "loss_ce": 0.009755777195096016, "loss_xval": 0.32421875, "num_input_tokens_seen": 119762056, "step": 693 }, { "epoch": 0.2773780975219824, "grad_norm": 131.1361500389134, "learning_rate": 5e-06, "loss": 0.9371, "num_input_tokens_seen": 119935104, "step": 694 }, { "epoch": 0.2773780975219824, "loss": 1.146599531173706, "loss_ce": 0.009636607021093369, "loss_xval": 1.140625, "num_input_tokens_seen": 119935104, "step": 694 }, { "epoch": 0.2777777777777778, "grad_norm": 47.6259432845342, "learning_rate": 5e-06, "loss": 1.0213, "num_input_tokens_seen": 120107776, "step": 695 }, { "epoch": 0.2777777777777778, "loss": 0.7354253530502319, "loss_ce": 0.011792542412877083, "loss_xval": 0.72265625, "num_input_tokens_seen": 120107776, "step": 695 }, { "epoch": 0.27817745803357313, "grad_norm": 291.1641874227151, "learning_rate": 5e-06, "loss": 0.7937, "num_input_tokens_seen": 120280744, "step": 696 }, { "epoch": 0.27817745803357313, "loss": 0.6080259084701538, "loss_ce": 0.01232281606644392, "loss_xval": 0.59375, "num_input_tokens_seen": 120280744, "step": 696 }, { "epoch": 0.27857713828936853, "grad_norm": 129.99843252720103, "learning_rate": 5e-06, "loss": 0.8997, "num_input_tokens_seen": 120453824, "step": 697 }, { "epoch": 0.27857713828936853, "loss": 0.6489090919494629, "loss_ce": 0.008528226986527443, "loss_xval": 0.640625, "num_input_tokens_seen": 120453824, "step": 697 }, { "epoch": 0.2789768185451639, "grad_norm": 136.31513410825676, "learning_rate": 5e-06, "loss": 0.5905, "num_input_tokens_seen": 120626520, "step": 698 }, { "epoch": 0.2789768185451639, "loss": 0.7713261842727661, "loss_ce": 0.007776360027492046, "loss_xval": 0.76171875, "num_input_tokens_seen": 120626520, "step": 698 }, { "epoch": 0.2793764988009592, "grad_norm": 55.383571736824486, "learning_rate": 5e-06, "loss": 0.9588, "num_input_tokens_seen": 120799808, "step": 699 }, { "epoch": 0.2793764988009592, "loss": 1.009063482284546, "loss_ce": 0.011138629168272018, "loss_xval": 0.99609375, "num_input_tokens_seen": 120799808, "step": 699 }, { "epoch": 0.2797761790567546, "grad_norm": 195.29831466108058, "learning_rate": 5e-06, "loss": 1.1378, "num_input_tokens_seen": 120972440, "step": 700 }, { "epoch": 0.2797761790567546, "loss": 1.147801399230957, "loss_ce": 0.013279901817440987, "loss_xval": 1.1328125, "num_input_tokens_seen": 120972440, "step": 700 }, { "epoch": 0.28017585931254996, "grad_norm": 117.78468354041925, "learning_rate": 5e-06, "loss": 0.6818, "num_input_tokens_seen": 121145656, "step": 701 }, { "epoch": 0.28017585931254996, "loss": 0.387192964553833, "loss_ce": 0.012437107972800732, "loss_xval": 0.375, "num_input_tokens_seen": 121145656, "step": 701 }, { "epoch": 0.2805755395683453, "grad_norm": 349.182381437043, "learning_rate": 5e-06, "loss": 0.9223, "num_input_tokens_seen": 121318464, "step": 702 }, { "epoch": 0.2805755395683453, "loss": 0.9129467606544495, "loss_ce": 0.01377684623003006, "loss_xval": 0.8984375, "num_input_tokens_seen": 121318464, "step": 702 }, { "epoch": 0.2809752198241407, "grad_norm": 78.93412862466627, "learning_rate": 5e-06, "loss": 1.0278, "num_input_tokens_seen": 121491328, "step": 703 }, { "epoch": 0.2809752198241407, "loss": 0.9784796237945557, "loss_ce": 0.014093691483139992, "loss_xval": 0.96484375, "num_input_tokens_seen": 121491328, "step": 703 }, { "epoch": 0.28137490007993604, "grad_norm": 153.8104816648774, "learning_rate": 5e-06, "loss": 0.713, "num_input_tokens_seen": 121664304, "step": 704 }, { "epoch": 0.28137490007993604, "loss": 0.562119722366333, "loss_ce": 0.02134818211197853, "loss_xval": 0.5390625, "num_input_tokens_seen": 121664304, "step": 704 }, { "epoch": 0.28177458033573144, "grad_norm": 62.97326893902962, "learning_rate": 5e-06, "loss": 0.6645, "num_input_tokens_seen": 121837640, "step": 705 }, { "epoch": 0.28177458033573144, "loss": 0.5091035962104797, "loss_ce": 0.014718795195221901, "loss_xval": 0.494140625, "num_input_tokens_seen": 121837640, "step": 705 }, { "epoch": 0.2821742605915268, "grad_norm": 124.77914466366667, "learning_rate": 5e-06, "loss": 0.5219, "num_input_tokens_seen": 122010648, "step": 706 }, { "epoch": 0.2821742605915268, "loss": 0.6143680810928345, "loss_ce": 0.02232704497873783, "loss_xval": 0.59375, "num_input_tokens_seen": 122010648, "step": 706 }, { "epoch": 0.2825739408473221, "grad_norm": 61.81361725933705, "learning_rate": 5e-06, "loss": 0.7228, "num_input_tokens_seen": 122183368, "step": 707 }, { "epoch": 0.2825739408473221, "loss": 0.7822574973106384, "loss_ce": 0.023407384753227234, "loss_xval": 0.7578125, "num_input_tokens_seen": 122183368, "step": 707 }, { "epoch": 0.2829736211031175, "grad_norm": 49.1614349431222, "learning_rate": 5e-06, "loss": 0.9115, "num_input_tokens_seen": 122356296, "step": 708 }, { "epoch": 0.2829736211031175, "loss": 0.7862190008163452, "loss_ce": 0.019556399434804916, "loss_xval": 0.765625, "num_input_tokens_seen": 122356296, "step": 708 }, { "epoch": 0.28337330135891287, "grad_norm": 127.70627885796577, "learning_rate": 5e-06, "loss": 0.785, "num_input_tokens_seen": 122529488, "step": 709 }, { "epoch": 0.28337330135891287, "loss": 1.0741759538650513, "loss_ce": 0.03291618824005127, "loss_xval": 1.0390625, "num_input_tokens_seen": 122529488, "step": 709 }, { "epoch": 0.2837729816147082, "grad_norm": 31.053281397497233, "learning_rate": 5e-06, "loss": 0.8754, "num_input_tokens_seen": 122702752, "step": 710 }, { "epoch": 0.2837729816147082, "loss": 0.7221265435218811, "loss_ce": 0.019550863653421402, "loss_xval": 0.703125, "num_input_tokens_seen": 122702752, "step": 710 }, { "epoch": 0.2841726618705036, "grad_norm": 212.21575411655493, "learning_rate": 5e-06, "loss": 0.8754, "num_input_tokens_seen": 122875696, "step": 711 }, { "epoch": 0.2841726618705036, "loss": 0.996657133102417, "loss_ce": 0.02216985821723938, "loss_xval": 0.97265625, "num_input_tokens_seen": 122875696, "step": 711 }, { "epoch": 0.28457234212629895, "grad_norm": 40.100860368736996, "learning_rate": 5e-06, "loss": 0.7592, "num_input_tokens_seen": 123048768, "step": 712 }, { "epoch": 0.28457234212629895, "loss": 0.8356152772903442, "loss_ce": 0.01981930062174797, "loss_xval": 0.81640625, "num_input_tokens_seen": 123048768, "step": 712 }, { "epoch": 0.28497202238209435, "grad_norm": 214.2157109789503, "learning_rate": 5e-06, "loss": 0.732, "num_input_tokens_seen": 123221664, "step": 713 }, { "epoch": 0.28497202238209435, "loss": 0.5940902829170227, "loss_ce": 0.019993610680103302, "loss_xval": 0.57421875, "num_input_tokens_seen": 123221664, "step": 713 }, { "epoch": 0.2853717026378897, "grad_norm": 40.609562804706826, "learning_rate": 5e-06, "loss": 1.1444, "num_input_tokens_seen": 123394488, "step": 714 }, { "epoch": 0.2853717026378897, "loss": 0.7391673922538757, "loss_ce": 0.028840193524956703, "loss_xval": 0.7109375, "num_input_tokens_seen": 123394488, "step": 714 }, { "epoch": 0.28577138289368503, "grad_norm": 178.02080973482205, "learning_rate": 5e-06, "loss": 0.9773, "num_input_tokens_seen": 123567376, "step": 715 }, { "epoch": 0.28577138289368503, "loss": 0.922229528427124, "loss_ce": 0.018909169360995293, "loss_xval": 0.90234375, "num_input_tokens_seen": 123567376, "step": 715 }, { "epoch": 0.28617106314948043, "grad_norm": 123.04011045066329, "learning_rate": 5e-06, "loss": 1.068, "num_input_tokens_seen": 123740088, "step": 716 }, { "epoch": 0.28617106314948043, "loss": 1.4456578493118286, "loss_ce": 0.016336563974618912, "loss_xval": 1.4296875, "num_input_tokens_seen": 123740088, "step": 716 }, { "epoch": 0.2865707434052758, "grad_norm": 260.6443722826441, "learning_rate": 5e-06, "loss": 0.8816, "num_input_tokens_seen": 123913104, "step": 717 }, { "epoch": 0.2865707434052758, "loss": 0.5610959529876709, "loss_ce": 0.0250852033495903, "loss_xval": 0.53515625, "num_input_tokens_seen": 123913104, "step": 717 }, { "epoch": 0.2869704236610711, "grad_norm": 54.92546204084786, "learning_rate": 5e-06, "loss": 0.8094, "num_input_tokens_seen": 124085896, "step": 718 }, { "epoch": 0.2869704236610711, "loss": 0.9141414761543274, "loss_ce": 0.017046771943569183, "loss_xval": 0.8984375, "num_input_tokens_seen": 124085896, "step": 718 }, { "epoch": 0.2873701039168665, "grad_norm": 201.82756469697114, "learning_rate": 5e-06, "loss": 0.99, "num_input_tokens_seen": 124259080, "step": 719 }, { "epoch": 0.2873701039168665, "loss": 1.0651757717132568, "loss_ce": 0.01634761318564415, "loss_xval": 1.046875, "num_input_tokens_seen": 124259080, "step": 719 }, { "epoch": 0.28776978417266186, "grad_norm": 102.45815081080835, "learning_rate": 5e-06, "loss": 0.8853, "num_input_tokens_seen": 124431976, "step": 720 }, { "epoch": 0.28776978417266186, "loss": 1.0132455825805664, "loss_ce": 0.016541466116905212, "loss_xval": 0.99609375, "num_input_tokens_seen": 124431976, "step": 720 }, { "epoch": 0.28816946442845726, "grad_norm": 160.91023116493113, "learning_rate": 5e-06, "loss": 0.5593, "num_input_tokens_seen": 124604832, "step": 721 }, { "epoch": 0.28816946442845726, "loss": 0.4822537302970886, "loss_ce": 0.01203891821205616, "loss_xval": 0.470703125, "num_input_tokens_seen": 124604832, "step": 721 }, { "epoch": 0.2885691446842526, "grad_norm": 72.91741185752112, "learning_rate": 5e-06, "loss": 0.8368, "num_input_tokens_seen": 124778048, "step": 722 }, { "epoch": 0.2885691446842526, "loss": 1.0014230012893677, "loss_ce": 0.017902549356222153, "loss_xval": 0.984375, "num_input_tokens_seen": 124778048, "step": 722 }, { "epoch": 0.28896882494004794, "grad_norm": 146.8668026594306, "learning_rate": 5e-06, "loss": 0.8567, "num_input_tokens_seen": 124951344, "step": 723 }, { "epoch": 0.28896882494004794, "loss": 0.8188395500183105, "loss_ce": 0.015738962218165398, "loss_xval": 0.8046875, "num_input_tokens_seen": 124951344, "step": 723 }, { "epoch": 0.28936850519584334, "grad_norm": 42.10566547583542, "learning_rate": 5e-06, "loss": 0.5991, "num_input_tokens_seen": 125124264, "step": 724 }, { "epoch": 0.28936850519584334, "loss": 0.6889089345932007, "loss_ce": 0.01135767251253128, "loss_xval": 0.67578125, "num_input_tokens_seen": 125124264, "step": 724 }, { "epoch": 0.2897681854516387, "grad_norm": 123.25802216263138, "learning_rate": 5e-06, "loss": 0.7095, "num_input_tokens_seen": 125297072, "step": 725 }, { "epoch": 0.2897681854516387, "loss": 0.25660455226898193, "loss_ce": 0.010632868856191635, "loss_xval": 0.24609375, "num_input_tokens_seen": 125297072, "step": 725 }, { "epoch": 0.290167865707434, "grad_norm": 68.66541377559568, "learning_rate": 5e-06, "loss": 1.1072, "num_input_tokens_seen": 125469880, "step": 726 }, { "epoch": 0.290167865707434, "loss": 1.066777229309082, "loss_ce": 0.011845514178276062, "loss_xval": 1.0546875, "num_input_tokens_seen": 125469880, "step": 726 }, { "epoch": 0.2905675459632294, "grad_norm": 118.39291047454554, "learning_rate": 5e-06, "loss": 1.0021, "num_input_tokens_seen": 125642760, "step": 727 }, { "epoch": 0.2905675459632294, "loss": 1.0062450170516968, "loss_ce": 0.020649326965212822, "loss_xval": 0.984375, "num_input_tokens_seen": 125642760, "step": 727 }, { "epoch": 0.29096722621902477, "grad_norm": 133.06920888386995, "learning_rate": 5e-06, "loss": 1.0436, "num_input_tokens_seen": 125815440, "step": 728 }, { "epoch": 0.29096722621902477, "loss": 1.2720887660980225, "loss_ce": 0.014886559918522835, "loss_xval": 1.2578125, "num_input_tokens_seen": 125815440, "step": 728 }, { "epoch": 0.29136690647482016, "grad_norm": 41.10436043549906, "learning_rate": 5e-06, "loss": 0.9073, "num_input_tokens_seen": 125988424, "step": 729 }, { "epoch": 0.29136690647482016, "loss": 1.0338069200515747, "loss_ce": 0.01378735899925232, "loss_xval": 1.0234375, "num_input_tokens_seen": 125988424, "step": 729 }, { "epoch": 0.2917665867306155, "grad_norm": 171.41761601925776, "learning_rate": 5e-06, "loss": 1.1713, "num_input_tokens_seen": 126161680, "step": 730 }, { "epoch": 0.2917665867306155, "loss": 1.4746264219284058, "loss_ce": 0.011247565969824791, "loss_xval": 1.4609375, "num_input_tokens_seen": 126161680, "step": 730 }, { "epoch": 0.29216626698641085, "grad_norm": 30.728524368886863, "learning_rate": 5e-06, "loss": 0.9324, "num_input_tokens_seen": 126334240, "step": 731 }, { "epoch": 0.29216626698641085, "loss": 1.2535715103149414, "loss_ce": 0.01419171690940857, "loss_xval": 1.2421875, "num_input_tokens_seen": 126334240, "step": 731 }, { "epoch": 0.29256594724220625, "grad_norm": 154.08953622194392, "learning_rate": 5e-06, "loss": 0.6463, "num_input_tokens_seen": 126506840, "step": 732 }, { "epoch": 0.29256594724220625, "loss": 0.7345225811004639, "loss_ce": 0.013819379732012749, "loss_xval": 0.71875, "num_input_tokens_seen": 126506840, "step": 732 }, { "epoch": 0.2929656274980016, "grad_norm": 67.98532313091332, "learning_rate": 5e-06, "loss": 0.5375, "num_input_tokens_seen": 126679928, "step": 733 }, { "epoch": 0.2929656274980016, "loss": 0.6890181303024292, "loss_ce": 0.013786174356937408, "loss_xval": 0.67578125, "num_input_tokens_seen": 126679928, "step": 733 }, { "epoch": 0.293365307753797, "grad_norm": 169.58988174707864, "learning_rate": 5e-06, "loss": 0.8897, "num_input_tokens_seen": 126852704, "step": 734 }, { "epoch": 0.293365307753797, "loss": 0.8869085311889648, "loss_ce": 0.011359157972037792, "loss_xval": 0.875, "num_input_tokens_seen": 126852704, "step": 734 }, { "epoch": 0.29376498800959233, "grad_norm": 188.55727116539376, "learning_rate": 5e-06, "loss": 0.655, "num_input_tokens_seen": 127025936, "step": 735 }, { "epoch": 0.29376498800959233, "loss": 0.6508488655090332, "loss_ce": 0.009491443634033203, "loss_xval": 0.640625, "num_input_tokens_seen": 127025936, "step": 735 }, { "epoch": 0.2941646682653877, "grad_norm": 57.679259705456175, "learning_rate": 5e-06, "loss": 1.2767, "num_input_tokens_seen": 127198680, "step": 736 }, { "epoch": 0.2941646682653877, "loss": 1.423262596130371, "loss_ce": 0.00944419577717781, "loss_xval": 1.4140625, "num_input_tokens_seen": 127198680, "step": 736 }, { "epoch": 0.2945643485211831, "grad_norm": 145.57825597022912, "learning_rate": 5e-06, "loss": 0.8213, "num_input_tokens_seen": 127371416, "step": 737 }, { "epoch": 0.2945643485211831, "loss": 1.0838699340820312, "loss_ce": 0.01673116721212864, "loss_xval": 1.0703125, "num_input_tokens_seen": 127371416, "step": 737 }, { "epoch": 0.2949640287769784, "grad_norm": 49.478090664102595, "learning_rate": 5e-06, "loss": 0.893, "num_input_tokens_seen": 127544496, "step": 738 }, { "epoch": 0.2949640287769784, "loss": 1.1251657009124756, "loss_ce": 0.012983132153749466, "loss_xval": 1.109375, "num_input_tokens_seen": 127544496, "step": 738 }, { "epoch": 0.29536370903277376, "grad_norm": 152.29658505097626, "learning_rate": 5e-06, "loss": 0.6981, "num_input_tokens_seen": 127713600, "step": 739 }, { "epoch": 0.29536370903277376, "loss": 0.8325961828231812, "loss_ce": 0.02095065638422966, "loss_xval": 0.8125, "num_input_tokens_seen": 127713600, "step": 739 }, { "epoch": 0.29576338928856916, "grad_norm": 104.36305781820683, "learning_rate": 5e-06, "loss": 0.9132, "num_input_tokens_seen": 127886552, "step": 740 }, { "epoch": 0.29576338928856916, "loss": 0.8980221152305603, "loss_ce": 0.019421041011810303, "loss_xval": 0.87890625, "num_input_tokens_seen": 127886552, "step": 740 }, { "epoch": 0.2961630695443645, "grad_norm": 358.56742391146184, "learning_rate": 5e-06, "loss": 1.2376, "num_input_tokens_seen": 128059336, "step": 741 }, { "epoch": 0.2961630695443645, "loss": 1.2877583503723145, "loss_ce": 0.014748061075806618, "loss_xval": 1.2734375, "num_input_tokens_seen": 128059336, "step": 741 }, { "epoch": 0.2965627498001599, "grad_norm": 38.999705304706644, "learning_rate": 5e-06, "loss": 0.7955, "num_input_tokens_seen": 128232384, "step": 742 }, { "epoch": 0.2965627498001599, "loss": 0.7341563701629639, "loss_ce": 0.022913720458745956, "loss_xval": 0.7109375, "num_input_tokens_seen": 128232384, "step": 742 }, { "epoch": 0.29696243005595524, "grad_norm": 58.58273127409154, "learning_rate": 5e-06, "loss": 0.6488, "num_input_tokens_seen": 128405632, "step": 743 }, { "epoch": 0.29696243005595524, "loss": 0.8151225447654724, "loss_ce": 0.016538549214601517, "loss_xval": 0.796875, "num_input_tokens_seen": 128405632, "step": 743 }, { "epoch": 0.2973621103117506, "grad_norm": 34.3307729926042, "learning_rate": 5e-06, "loss": 0.8303, "num_input_tokens_seen": 128578240, "step": 744 }, { "epoch": 0.2973621103117506, "loss": 0.7622973322868347, "loss_ce": 0.012358361855149269, "loss_xval": 0.75, "num_input_tokens_seen": 128578240, "step": 744 }, { "epoch": 0.297761790567546, "grad_norm": 80.36115843481713, "learning_rate": 5e-06, "loss": 0.7475, "num_input_tokens_seen": 128751168, "step": 745 }, { "epoch": 0.297761790567546, "loss": 0.5837757587432861, "loss_ce": 0.01822400838136673, "loss_xval": 0.56640625, "num_input_tokens_seen": 128751168, "step": 745 }, { "epoch": 0.2981614708233413, "grad_norm": 92.27821846652243, "learning_rate": 5e-06, "loss": 1.0467, "num_input_tokens_seen": 128923664, "step": 746 }, { "epoch": 0.2981614708233413, "loss": 0.9637579917907715, "loss_ce": 0.014783459715545177, "loss_xval": 0.94921875, "num_input_tokens_seen": 128923664, "step": 746 }, { "epoch": 0.29856115107913667, "grad_norm": 59.953210838770005, "learning_rate": 5e-06, "loss": 1.1645, "num_input_tokens_seen": 129096520, "step": 747 }, { "epoch": 0.29856115107913667, "loss": 0.559451162815094, "loss_ce": 0.011294430121779442, "loss_xval": 0.546875, "num_input_tokens_seen": 129096520, "step": 747 }, { "epoch": 0.29896083133493206, "grad_norm": 54.717171712560024, "learning_rate": 5e-06, "loss": 0.6601, "num_input_tokens_seen": 129269376, "step": 748 }, { "epoch": 0.29896083133493206, "loss": 0.5837520956993103, "loss_ce": 0.01343961339443922, "loss_xval": 0.5703125, "num_input_tokens_seen": 129269376, "step": 748 }, { "epoch": 0.2993605115907274, "grad_norm": 40.816773316862005, "learning_rate": 5e-06, "loss": 0.6647, "num_input_tokens_seen": 129441928, "step": 749 }, { "epoch": 0.2993605115907274, "loss": 0.7464295029640198, "loss_ce": 0.020355278626084328, "loss_xval": 0.7265625, "num_input_tokens_seen": 129441928, "step": 749 }, { "epoch": 0.2997601918465228, "grad_norm": 129.2032035605195, "learning_rate": 5e-06, "loss": 0.694, "num_input_tokens_seen": 129614136, "step": 750 }, { "epoch": 0.2997601918465228, "eval_websight_new_IoU": 0.32972943782806396, "eval_websight_new_MAE_all": 0.03321713022887707, "eval_websight_new_MAE_h": 0.03598089702427387, "eval_websight_new_MAE_w": 0.05507303401827812, "eval_websight_new_MAE_x": 0.021736985072493553, "eval_websight_new_MAE_y": 0.020077602006495, "eval_websight_new_NUM_probability": 0.9082909226417542, "eval_websight_new_inside_bbox": 0.5902777910232544, "eval_websight_new_loss": 0.3363611698150635, "eval_websight_new_loss_ce": 0.00987301068380475, "eval_websight_new_loss_xval": 0.2787322998046875, "eval_websight_new_runtime": 59.6643, "eval_websight_new_samples_per_second": 0.838, "eval_websight_new_steps_per_second": 0.034, "num_input_tokens_seen": 129614136, "step": 750 }, { "epoch": 0.2997601918465228, "eval_seeclick_IoU": 0.21530038118362427, "eval_seeclick_MAE_all": 0.0899505689740181, "eval_seeclick_MAE_h": 0.03901367634534836, "eval_seeclick_MAE_w": 0.1384214162826538, "eval_seeclick_MAE_x": 0.11315473914146423, "eval_seeclick_MAE_y": 0.0692124255001545, "eval_seeclick_NUM_probability": 0.8880393803119659, "eval_seeclick_inside_bbox": 0.3229166716337204, "eval_seeclick_loss": 2.2942774295806885, "eval_seeclick_loss_ce": 0.026869087480008602, "eval_seeclick_loss_xval": 2.2388916015625, "eval_seeclick_runtime": 89.7723, "eval_seeclick_samples_per_second": 0.557, "eval_seeclick_steps_per_second": 0.022, "num_input_tokens_seen": 129614136, "step": 750 }, { "epoch": 0.2997601918465228, "eval_icons_IoU": 0.09595663845539093, "eval_icons_MAE_all": 0.035994925536215305, "eval_icons_MAE_h": 0.0310601107776165, "eval_icons_MAE_w": 0.02391492947936058, "eval_icons_MAE_x": 0.05677058733999729, "eval_icons_MAE_y": 0.03223407082259655, "eval_icons_NUM_probability": 0.9089525938034058, "eval_icons_inside_bbox": 0.2048611119389534, "eval_icons_loss": 0.2528549134731293, "eval_icons_loss_ce": 0.012350890785455704, "eval_icons_loss_xval": 0.222991943359375, "eval_icons_runtime": 82.7604, "eval_icons_samples_per_second": 0.604, "eval_icons_steps_per_second": 0.024, "num_input_tokens_seen": 129614136, "step": 750 }, { "epoch": 0.2997601918465228, "loss": 0.3234509825706482, "loss_ce": 0.014796189963817596, "loss_xval": 0.30859375, "num_input_tokens_seen": 129614136, "step": 750 }, { "epoch": 0.30015987210231815, "grad_norm": 50.44016771405421, "learning_rate": 5e-06, "loss": 0.6259, "num_input_tokens_seen": 129786896, "step": 751 }, { "epoch": 0.30015987210231815, "loss": 0.6966589689254761, "loss_ce": 0.015018315985798836, "loss_xval": 0.6796875, "num_input_tokens_seen": 129786896, "step": 751 }, { "epoch": 0.3005595523581135, "grad_norm": 188.04641922934573, "learning_rate": 5e-06, "loss": 1.1675, "num_input_tokens_seen": 129959752, "step": 752 }, { "epoch": 0.3005595523581135, "loss": 1.1777859926223755, "loss_ce": 0.016409026458859444, "loss_xval": 1.1640625, "num_input_tokens_seen": 129959752, "step": 752 }, { "epoch": 0.3009592326139089, "grad_norm": 64.08122838016891, "learning_rate": 5e-06, "loss": 1.021, "num_input_tokens_seen": 130132688, "step": 753 }, { "epoch": 0.3009592326139089, "loss": 0.7875679731369019, "loss_ce": 0.011078734882175922, "loss_xval": 0.77734375, "num_input_tokens_seen": 130132688, "step": 753 }, { "epoch": 0.30135891286970423, "grad_norm": 238.25273070676948, "learning_rate": 5e-06, "loss": 0.9882, "num_input_tokens_seen": 130306096, "step": 754 }, { "epoch": 0.30135891286970423, "loss": 1.330396056175232, "loss_ce": 0.009595339186489582, "loss_xval": 1.3203125, "num_input_tokens_seen": 130306096, "step": 754 }, { "epoch": 0.3017585931254996, "grad_norm": 98.65332854764719, "learning_rate": 5e-06, "loss": 0.8745, "num_input_tokens_seen": 130479152, "step": 755 }, { "epoch": 0.3017585931254996, "loss": 0.6690840125083923, "loss_ce": 0.009049820713698864, "loss_xval": 0.66015625, "num_input_tokens_seen": 130479152, "step": 755 }, { "epoch": 0.302158273381295, "grad_norm": 279.6737307195592, "learning_rate": 5e-06, "loss": 0.735, "num_input_tokens_seen": 130651792, "step": 756 }, { "epoch": 0.302158273381295, "loss": 0.8260841965675354, "loss_ce": 0.008701398968696594, "loss_xval": 0.81640625, "num_input_tokens_seen": 130651792, "step": 756 }, { "epoch": 0.3025579536370903, "grad_norm": 168.73328107773594, "learning_rate": 5e-06, "loss": 0.7155, "num_input_tokens_seen": 130824800, "step": 757 }, { "epoch": 0.3025579536370903, "loss": 0.9893529415130615, "loss_ce": 0.009982806630432606, "loss_xval": 0.98046875, "num_input_tokens_seen": 130824800, "step": 757 }, { "epoch": 0.3029576338928857, "grad_norm": 229.3989099063003, "learning_rate": 5e-06, "loss": 0.7724, "num_input_tokens_seen": 130997424, "step": 758 }, { "epoch": 0.3029576338928857, "loss": 1.1186637878417969, "loss_ce": 0.011974346823990345, "loss_xval": 1.109375, "num_input_tokens_seen": 130997424, "step": 758 }, { "epoch": 0.30335731414868106, "grad_norm": 138.1632528777344, "learning_rate": 5e-06, "loss": 0.7744, "num_input_tokens_seen": 131170472, "step": 759 }, { "epoch": 0.30335731414868106, "loss": 0.7636563777923584, "loss_ce": 0.0134122334420681, "loss_xval": 0.75, "num_input_tokens_seen": 131170472, "step": 759 }, { "epoch": 0.3037569944044764, "grad_norm": 226.06896222225566, "learning_rate": 5e-06, "loss": 0.9072, "num_input_tokens_seen": 131343872, "step": 760 }, { "epoch": 0.3037569944044764, "loss": 0.6551499962806702, "loss_ce": 0.022825779393315315, "loss_xval": 0.6328125, "num_input_tokens_seen": 131343872, "step": 760 }, { "epoch": 0.3041566746602718, "grad_norm": 140.47231030423117, "learning_rate": 5e-06, "loss": 0.801, "num_input_tokens_seen": 131516864, "step": 761 }, { "epoch": 0.3041566746602718, "loss": 0.7796874642372131, "loss_ce": 0.018823187798261642, "loss_xval": 0.76171875, "num_input_tokens_seen": 131516864, "step": 761 }, { "epoch": 0.30455635491606714, "grad_norm": 191.03867527797138, "learning_rate": 5e-06, "loss": 0.8019, "num_input_tokens_seen": 131686080, "step": 762 }, { "epoch": 0.30455635491606714, "loss": 0.8312917947769165, "loss_ce": 0.01952417567372322, "loss_xval": 0.8125, "num_input_tokens_seen": 131686080, "step": 762 }, { "epoch": 0.3049560351718625, "grad_norm": 99.15375963784552, "learning_rate": 5e-06, "loss": 0.9744, "num_input_tokens_seen": 131859384, "step": 763 }, { "epoch": 0.3049560351718625, "loss": 1.0681936740875244, "loss_ce": 0.016435783356428146, "loss_xval": 1.0546875, "num_input_tokens_seen": 131859384, "step": 763 }, { "epoch": 0.3053557154276579, "grad_norm": 213.65930477816116, "learning_rate": 5e-06, "loss": 1.204, "num_input_tokens_seen": 132032312, "step": 764 }, { "epoch": 0.3053557154276579, "loss": 1.4312927722930908, "loss_ce": 0.019061321392655373, "loss_xval": 1.4140625, "num_input_tokens_seen": 132032312, "step": 764 }, { "epoch": 0.3057553956834532, "grad_norm": 108.68993738295507, "learning_rate": 5e-06, "loss": 0.61, "num_input_tokens_seen": 132205336, "step": 765 }, { "epoch": 0.3057553956834532, "loss": 0.6518961787223816, "loss_ce": 0.026041686534881592, "loss_xval": 0.625, "num_input_tokens_seen": 132205336, "step": 765 }, { "epoch": 0.3061550759392486, "grad_norm": 211.70893336363255, "learning_rate": 5e-06, "loss": 0.8624, "num_input_tokens_seen": 132378208, "step": 766 }, { "epoch": 0.3061550759392486, "loss": 0.7774863243103027, "loss_ce": 0.019185544922947884, "loss_xval": 0.7578125, "num_input_tokens_seen": 132378208, "step": 766 }, { "epoch": 0.30655475619504396, "grad_norm": 67.90632571223537, "learning_rate": 5e-06, "loss": 0.7338, "num_input_tokens_seen": 132551608, "step": 767 }, { "epoch": 0.30655475619504396, "loss": 0.9820546507835388, "loss_ce": 0.020140592008829117, "loss_xval": 0.9609375, "num_input_tokens_seen": 132551608, "step": 767 }, { "epoch": 0.3069544364508393, "grad_norm": 333.3981316273364, "learning_rate": 5e-06, "loss": 0.83, "num_input_tokens_seen": 132724568, "step": 768 }, { "epoch": 0.3069544364508393, "loss": 0.8645689487457275, "loss_ce": 0.020635826513171196, "loss_xval": 0.84375, "num_input_tokens_seen": 132724568, "step": 768 }, { "epoch": 0.3073541167066347, "grad_norm": 83.04924236141541, "learning_rate": 5e-06, "loss": 0.6825, "num_input_tokens_seen": 132897376, "step": 769 }, { "epoch": 0.3073541167066347, "loss": 0.8747704029083252, "loss_ce": 0.036544110625982285, "loss_xval": 0.83984375, "num_input_tokens_seen": 132897376, "step": 769 }, { "epoch": 0.30775379696243005, "grad_norm": 98.58312316090154, "learning_rate": 5e-06, "loss": 0.933, "num_input_tokens_seen": 133070104, "step": 770 }, { "epoch": 0.30775379696243005, "loss": 0.47699296474456787, "loss_ce": 0.19757401943206787, "loss_xval": 0.279296875, "num_input_tokens_seen": 133070104, "step": 770 }, { "epoch": 0.30815347721822545, "grad_norm": 67.93244599542409, "learning_rate": 5e-06, "loss": 1.1321, "num_input_tokens_seen": 133243488, "step": 771 }, { "epoch": 0.30815347721822545, "loss": 1.0369318723678589, "loss_ce": 0.13879956305027008, "loss_xval": 0.8984375, "num_input_tokens_seen": 133243488, "step": 771 }, { "epoch": 0.3085531574740208, "grad_norm": 46.27524938342765, "learning_rate": 5e-06, "loss": 0.842, "num_input_tokens_seen": 133412784, "step": 772 }, { "epoch": 0.3085531574740208, "loss": 0.8269345164299011, "loss_ce": 0.09854095429182053, "loss_xval": 0.7265625, "num_input_tokens_seen": 133412784, "step": 772 }, { "epoch": 0.30895283772981613, "grad_norm": 50.41952425911126, "learning_rate": 5e-06, "loss": 1.1107, "num_input_tokens_seen": 133585896, "step": 773 }, { "epoch": 0.30895283772981613, "loss": 1.146827220916748, "loss_ce": 0.07016701996326447, "loss_xval": 1.078125, "num_input_tokens_seen": 133585896, "step": 773 }, { "epoch": 0.30935251798561153, "grad_norm": 82.74316039825493, "learning_rate": 5e-06, "loss": 0.6717, "num_input_tokens_seen": 133758544, "step": 774 }, { "epoch": 0.30935251798561153, "loss": 0.5270382165908813, "loss_ce": 0.08367883414030075, "loss_xval": 0.443359375, "num_input_tokens_seen": 133758544, "step": 774 }, { "epoch": 0.3097521982414069, "grad_norm": 66.82965857737175, "learning_rate": 5e-06, "loss": 0.9872, "num_input_tokens_seen": 133931776, "step": 775 }, { "epoch": 0.3097521982414069, "loss": 1.059539794921875, "loss_ce": 0.07809457927942276, "loss_xval": 0.98046875, "num_input_tokens_seen": 133931776, "step": 775 }, { "epoch": 0.3101518784972022, "grad_norm": 61.167284008978314, "learning_rate": 5e-06, "loss": 0.7877, "num_input_tokens_seen": 134104704, "step": 776 }, { "epoch": 0.3101518784972022, "loss": 0.8881216049194336, "loss_ce": 0.0730580985546112, "loss_xval": 0.81640625, "num_input_tokens_seen": 134104704, "step": 776 }, { "epoch": 0.3105515587529976, "grad_norm": 84.1144974470978, "learning_rate": 5e-06, "loss": 0.7481, "num_input_tokens_seen": 134277464, "step": 777 }, { "epoch": 0.3105515587529976, "loss": 0.6984782218933105, "loss_ce": 0.047355152666568756, "loss_xval": 0.65234375, "num_input_tokens_seen": 134277464, "step": 777 }, { "epoch": 0.31095123900879296, "grad_norm": 115.97957410951801, "learning_rate": 5e-06, "loss": 0.9494, "num_input_tokens_seen": 134450816, "step": 778 }, { "epoch": 0.31095123900879296, "loss": 1.0162138938903809, "loss_ce": 0.046731531620025635, "loss_xval": 0.96875, "num_input_tokens_seen": 134450816, "step": 778 }, { "epoch": 0.31135091926458835, "grad_norm": 103.83504655014406, "learning_rate": 5e-06, "loss": 0.5651, "num_input_tokens_seen": 134624024, "step": 779 }, { "epoch": 0.31135091926458835, "loss": 0.5172841548919678, "loss_ce": 0.02302144654095173, "loss_xval": 0.494140625, "num_input_tokens_seen": 134624024, "step": 779 }, { "epoch": 0.3117505995203837, "grad_norm": 63.90837696559783, "learning_rate": 5e-06, "loss": 0.7547, "num_input_tokens_seen": 134797272, "step": 780 }, { "epoch": 0.3117505995203837, "loss": 0.7474457025527954, "loss_ce": 0.026986707001924515, "loss_xval": 0.71875, "num_input_tokens_seen": 134797272, "step": 780 }, { "epoch": 0.31215027977617904, "grad_norm": 115.9985412220161, "learning_rate": 5e-06, "loss": 0.9481, "num_input_tokens_seen": 134970320, "step": 781 }, { "epoch": 0.31215027977617904, "loss": 0.7179272174835205, "loss_ce": 0.01968502625823021, "loss_xval": 0.69921875, "num_input_tokens_seen": 134970320, "step": 781 }, { "epoch": 0.31254996003197444, "grad_norm": 41.252406618208745, "learning_rate": 5e-06, "loss": 0.7365, "num_input_tokens_seen": 135143296, "step": 782 }, { "epoch": 0.31254996003197444, "loss": 0.676671028137207, "loss_ce": 0.025548022240400314, "loss_xval": 0.65234375, "num_input_tokens_seen": 135143296, "step": 782 }, { "epoch": 0.3129496402877698, "grad_norm": 102.07811583540216, "learning_rate": 5e-06, "loss": 1.3992, "num_input_tokens_seen": 135316624, "step": 783 }, { "epoch": 0.3129496402877698, "loss": 1.6752166748046875, "loss_ce": 0.03459162265062332, "loss_xval": 1.640625, "num_input_tokens_seen": 135316624, "step": 783 }, { "epoch": 0.3133493205435651, "grad_norm": 112.62388613696993, "learning_rate": 5e-06, "loss": 0.8903, "num_input_tokens_seen": 135489416, "step": 784 }, { "epoch": 0.3133493205435651, "loss": 0.9434410929679871, "loss_ce": 0.029622741043567657, "loss_xval": 0.9140625, "num_input_tokens_seen": 135489416, "step": 784 }, { "epoch": 0.3137490007993605, "grad_norm": 81.92869001107958, "learning_rate": 5e-06, "loss": 0.5826, "num_input_tokens_seen": 135662392, "step": 785 }, { "epoch": 0.3137490007993605, "loss": 0.8590636253356934, "loss_ce": 0.032403476536273956, "loss_xval": 0.828125, "num_input_tokens_seen": 135662392, "step": 785 }, { "epoch": 0.31414868105515587, "grad_norm": 56.93984188783562, "learning_rate": 5e-06, "loss": 0.6952, "num_input_tokens_seen": 135835648, "step": 786 }, { "epoch": 0.31414868105515587, "loss": 0.7936071157455444, "loss_ce": 0.03201046958565712, "loss_xval": 0.76171875, "num_input_tokens_seen": 135835648, "step": 786 }, { "epoch": 0.31454836131095126, "grad_norm": 123.08995481669876, "learning_rate": 5e-06, "loss": 1.0795, "num_input_tokens_seen": 136008360, "step": 787 }, { "epoch": 0.31454836131095126, "loss": 1.2340142726898193, "loss_ce": 0.022832613438367844, "loss_xval": 1.2109375, "num_input_tokens_seen": 136008360, "step": 787 }, { "epoch": 0.3149480415667466, "grad_norm": 192.62508663565438, "learning_rate": 5e-06, "loss": 0.7918, "num_input_tokens_seen": 136180904, "step": 788 }, { "epoch": 0.3149480415667466, "loss": 0.6366986036300659, "loss_ce": 0.04831964522600174, "loss_xval": 0.58984375, "num_input_tokens_seen": 136180904, "step": 788 }, { "epoch": 0.31534772182254195, "grad_norm": 51.10447508784947, "learning_rate": 5e-06, "loss": 0.8927, "num_input_tokens_seen": 136353552, "step": 789 }, { "epoch": 0.31534772182254195, "loss": 1.050945520401001, "loss_ce": 0.05485168844461441, "loss_xval": 0.99609375, "num_input_tokens_seen": 136353552, "step": 789 }, { "epoch": 0.31574740207833735, "grad_norm": 154.61700039241353, "learning_rate": 5e-06, "loss": 0.8251, "num_input_tokens_seen": 136526584, "step": 790 }, { "epoch": 0.31574740207833735, "loss": 0.8927372694015503, "loss_ce": 0.0441044420003891, "loss_xval": 0.84765625, "num_input_tokens_seen": 136526584, "step": 790 }, { "epoch": 0.3161470823341327, "grad_norm": 42.16296097175742, "learning_rate": 5e-06, "loss": 0.7295, "num_input_tokens_seen": 136699328, "step": 791 }, { "epoch": 0.3161470823341327, "loss": 0.9556566476821899, "loss_ce": 0.04513419792056084, "loss_xval": 0.91015625, "num_input_tokens_seen": 136699328, "step": 791 }, { "epoch": 0.31654676258992803, "grad_norm": 138.874910538935, "learning_rate": 5e-06, "loss": 0.6915, "num_input_tokens_seen": 136871856, "step": 792 }, { "epoch": 0.31654676258992803, "loss": 0.6887627840042114, "loss_ce": 0.03727353364229202, "loss_xval": 0.65234375, "num_input_tokens_seen": 136871856, "step": 792 }, { "epoch": 0.31694644284572343, "grad_norm": 65.9961884504279, "learning_rate": 5e-06, "loss": 0.6059, "num_input_tokens_seen": 137045064, "step": 793 }, { "epoch": 0.31694644284572343, "loss": 0.5695608854293823, "loss_ce": 0.041973013430833817, "loss_xval": 0.52734375, "num_input_tokens_seen": 137045064, "step": 793 }, { "epoch": 0.3173461231015188, "grad_norm": 129.30759832098593, "learning_rate": 5e-06, "loss": 0.8537, "num_input_tokens_seen": 137218184, "step": 794 }, { "epoch": 0.3173461231015188, "loss": 0.6901719570159912, "loss_ce": 0.03502054512500763, "loss_xval": 0.65625, "num_input_tokens_seen": 137218184, "step": 794 }, { "epoch": 0.31774580335731417, "grad_norm": 86.71836721194123, "learning_rate": 5e-06, "loss": 0.7562, "num_input_tokens_seen": 137391216, "step": 795 }, { "epoch": 0.31774580335731417, "loss": 0.7299758195877075, "loss_ce": 0.03719630092382431, "loss_xval": 0.69140625, "num_input_tokens_seen": 137391216, "step": 795 }, { "epoch": 0.3181454836131095, "grad_norm": 57.12461147367074, "learning_rate": 5e-06, "loss": 0.7832, "num_input_tokens_seen": 137564144, "step": 796 }, { "epoch": 0.3181454836131095, "loss": 0.5537126064300537, "loss_ce": 0.03189250826835632, "loss_xval": 0.5234375, "num_input_tokens_seen": 137564144, "step": 796 }, { "epoch": 0.31854516386890486, "grad_norm": 65.79780600817213, "learning_rate": 5e-06, "loss": 0.7925, "num_input_tokens_seen": 137737192, "step": 797 }, { "epoch": 0.31854516386890486, "loss": 0.9935466051101685, "loss_ce": 0.029099617153406143, "loss_xval": 0.96484375, "num_input_tokens_seen": 137737192, "step": 797 }, { "epoch": 0.31894484412470026, "grad_norm": 105.16048221090652, "learning_rate": 5e-06, "loss": 0.5815, "num_input_tokens_seen": 137910264, "step": 798 }, { "epoch": 0.31894484412470026, "loss": 0.6643279790878296, "loss_ce": 0.03017270937561989, "loss_xval": 0.6328125, "num_input_tokens_seen": 137910264, "step": 798 }, { "epoch": 0.3193445243804956, "grad_norm": 72.96586018987574, "learning_rate": 5e-06, "loss": 1.1874, "num_input_tokens_seen": 138083392, "step": 799 }, { "epoch": 0.3193445243804956, "loss": 1.442209243774414, "loss_ce": 0.024972904473543167, "loss_xval": 1.4140625, "num_input_tokens_seen": 138083392, "step": 799 }, { "epoch": 0.31974420463629094, "grad_norm": 157.17421656167332, "learning_rate": 5e-06, "loss": 0.9201, "num_input_tokens_seen": 138256112, "step": 800 }, { "epoch": 0.31974420463629094, "loss": 1.115356206893921, "loss_ce": 0.024047698825597763, "loss_xval": 1.09375, "num_input_tokens_seen": 138256112, "step": 800 }, { "epoch": 0.32014388489208634, "grad_norm": 110.192018937514, "learning_rate": 5e-06, "loss": 1.0826, "num_input_tokens_seen": 138428960, "step": 801 }, { "epoch": 0.32014388489208634, "loss": 1.336624264717102, "loss_ce": 0.06312566250562668, "loss_xval": 1.2734375, "num_input_tokens_seen": 138428960, "step": 801 }, { "epoch": 0.3205435651478817, "grad_norm": 151.99977831225056, "learning_rate": 5e-06, "loss": 0.8894, "num_input_tokens_seen": 138601992, "step": 802 }, { "epoch": 0.3205435651478817, "loss": 0.6037086844444275, "loss_ce": 0.02289813756942749, "loss_xval": 0.58203125, "num_input_tokens_seen": 138601992, "step": 802 }, { "epoch": 0.3209432454036771, "grad_norm": 52.06306805397873, "learning_rate": 5e-06, "loss": 0.7899, "num_input_tokens_seen": 138774584, "step": 803 }, { "epoch": 0.3209432454036771, "loss": 0.744976282119751, "loss_ce": 0.019451454281806946, "loss_xval": 0.7265625, "num_input_tokens_seen": 138774584, "step": 803 }, { "epoch": 0.3213429256594724, "grad_norm": 39.91507915716981, "learning_rate": 5e-06, "loss": 0.6145, "num_input_tokens_seen": 138947832, "step": 804 }, { "epoch": 0.3213429256594724, "loss": 0.7825067043304443, "loss_ce": 0.03006529062986374, "loss_xval": 0.75390625, "num_input_tokens_seen": 138947832, "step": 804 }, { "epoch": 0.32174260591526777, "grad_norm": 75.5906555802562, "learning_rate": 5e-06, "loss": 0.7645, "num_input_tokens_seen": 139120760, "step": 805 }, { "epoch": 0.32174260591526777, "loss": 0.8884168267250061, "loss_ce": 0.0256849005818367, "loss_xval": 0.86328125, "num_input_tokens_seen": 139120760, "step": 805 }, { "epoch": 0.32214228617106316, "grad_norm": 146.3475272918251, "learning_rate": 5e-06, "loss": 0.7265, "num_input_tokens_seen": 139293776, "step": 806 }, { "epoch": 0.32214228617106316, "loss": 0.8622835874557495, "loss_ce": 0.02243983931839466, "loss_xval": 0.83984375, "num_input_tokens_seen": 139293776, "step": 806 }, { "epoch": 0.3225419664268585, "grad_norm": 149.26645866928035, "learning_rate": 5e-06, "loss": 0.6571, "num_input_tokens_seen": 139466240, "step": 807 }, { "epoch": 0.3225419664268585, "loss": 0.9698929190635681, "loss_ce": 0.018721019849181175, "loss_xval": 0.953125, "num_input_tokens_seen": 139466240, "step": 807 }, { "epoch": 0.3229416466826539, "grad_norm": 85.13022329777696, "learning_rate": 5e-06, "loss": 0.7868, "num_input_tokens_seen": 139639432, "step": 808 }, { "epoch": 0.3229416466826539, "loss": 0.3776288628578186, "loss_ce": 0.029606396332383156, "loss_xval": 0.34765625, "num_input_tokens_seen": 139639432, "step": 808 }, { "epoch": 0.32334132693844925, "grad_norm": 124.8610805709589, "learning_rate": 5e-06, "loss": 0.51, "num_input_tokens_seen": 139813008, "step": 809 }, { "epoch": 0.32334132693844925, "loss": 0.42539799213409424, "loss_ce": 0.028364313766360283, "loss_xval": 0.396484375, "num_input_tokens_seen": 139813008, "step": 809 }, { "epoch": 0.3237410071942446, "grad_norm": 35.664202919806826, "learning_rate": 5e-06, "loss": 0.9645, "num_input_tokens_seen": 139982784, "step": 810 }, { "epoch": 0.3237410071942446, "loss": 0.6386287212371826, "loss_ce": 0.026446137577295303, "loss_xval": 0.61328125, "num_input_tokens_seen": 139982784, "step": 810 }, { "epoch": 0.32414068745004, "grad_norm": 63.87833532222325, "learning_rate": 5e-06, "loss": 0.6453, "num_input_tokens_seen": 140156176, "step": 811 }, { "epoch": 0.32414068745004, "loss": 0.4492732882499695, "loss_ce": 0.025262057781219482, "loss_xval": 0.423828125, "num_input_tokens_seen": 140156176, "step": 811 }, { "epoch": 0.32454036770583533, "grad_norm": 67.34889598567726, "learning_rate": 5e-06, "loss": 0.8529, "num_input_tokens_seen": 140329472, "step": 812 }, { "epoch": 0.32454036770583533, "loss": 0.3586152493953705, "loss_ce": 0.028720222413539886, "loss_xval": 0.330078125, "num_input_tokens_seen": 140329472, "step": 812 }, { "epoch": 0.3249400479616307, "grad_norm": 70.12852323782404, "learning_rate": 5e-06, "loss": 0.5924, "num_input_tokens_seen": 140502504, "step": 813 }, { "epoch": 0.3249400479616307, "loss": 0.7642978429794312, "loss_ce": 0.025772511959075928, "loss_xval": 0.73828125, "num_input_tokens_seen": 140502504, "step": 813 }, { "epoch": 0.32533972821742607, "grad_norm": 28.086180346559747, "learning_rate": 5e-06, "loss": 0.4791, "num_input_tokens_seen": 140675400, "step": 814 }, { "epoch": 0.32533972821742607, "loss": 0.7082566022872925, "loss_ce": 0.015293995849788189, "loss_xval": 0.69140625, "num_input_tokens_seen": 140675400, "step": 814 }, { "epoch": 0.3257394084732214, "grad_norm": 101.79711405925198, "learning_rate": 5e-06, "loss": 1.158, "num_input_tokens_seen": 140848184, "step": 815 }, { "epoch": 0.3257394084732214, "loss": 0.7473459839820862, "loss_ce": 0.015107257291674614, "loss_xval": 0.73046875, "num_input_tokens_seen": 140848184, "step": 815 }, { "epoch": 0.3261390887290168, "grad_norm": 193.45790211450287, "learning_rate": 5e-06, "loss": 0.8277, "num_input_tokens_seen": 141021312, "step": 816 }, { "epoch": 0.3261390887290168, "loss": 0.8551950454711914, "loss_ce": 0.013276074081659317, "loss_xval": 0.84375, "num_input_tokens_seen": 141021312, "step": 816 }, { "epoch": 0.32653876898481216, "grad_norm": 123.7203597173954, "learning_rate": 5e-06, "loss": 0.8849, "num_input_tokens_seen": 141194280, "step": 817 }, { "epoch": 0.32653876898481216, "loss": 0.8089841604232788, "loss_ce": 0.014916693791747093, "loss_xval": 0.79296875, "num_input_tokens_seen": 141194280, "step": 817 }, { "epoch": 0.3269384492406075, "grad_norm": 123.03492826004478, "learning_rate": 5e-06, "loss": 1.1935, "num_input_tokens_seen": 141367040, "step": 818 }, { "epoch": 0.3269384492406075, "loss": 1.457615852355957, "loss_ce": 0.01865091174840927, "loss_xval": 1.4375, "num_input_tokens_seen": 141367040, "step": 818 }, { "epoch": 0.3273381294964029, "grad_norm": 196.81361407257646, "learning_rate": 5e-06, "loss": 0.8593, "num_input_tokens_seen": 141540360, "step": 819 }, { "epoch": 0.3273381294964029, "loss": 0.7007966041564941, "loss_ce": 0.01708076149225235, "loss_xval": 0.68359375, "num_input_tokens_seen": 141540360, "step": 819 }, { "epoch": 0.32773780975219824, "grad_norm": 58.596043671656304, "learning_rate": 5e-06, "loss": 0.895, "num_input_tokens_seen": 141713328, "step": 820 }, { "epoch": 0.32773780975219824, "loss": 0.9711037278175354, "loss_ce": 0.01468280702829361, "loss_xval": 0.95703125, "num_input_tokens_seen": 141713328, "step": 820 }, { "epoch": 0.3281374900079936, "grad_norm": 191.31795115874968, "learning_rate": 5e-06, "loss": 0.8616, "num_input_tokens_seen": 141886664, "step": 821 }, { "epoch": 0.3281374900079936, "loss": 0.8032501935958862, "loss_ce": 0.010769794695079327, "loss_xval": 0.79296875, "num_input_tokens_seen": 141886664, "step": 821 }, { "epoch": 0.328537170263789, "grad_norm": 64.81683384211863, "learning_rate": 5e-06, "loss": 0.5945, "num_input_tokens_seen": 142059672, "step": 822 }, { "epoch": 0.328537170263789, "loss": 0.43535593152046204, "loss_ce": 0.02971627749502659, "loss_xval": 0.40625, "num_input_tokens_seen": 142059672, "step": 822 }, { "epoch": 0.3289368505195843, "grad_norm": 184.53548859363346, "learning_rate": 5e-06, "loss": 0.7032, "num_input_tokens_seen": 142232456, "step": 823 }, { "epoch": 0.3289368505195843, "loss": 0.6750579476356506, "loss_ce": 0.06104426458477974, "loss_xval": 0.61328125, "num_input_tokens_seen": 142232456, "step": 823 }, { "epoch": 0.3293365307753797, "grad_norm": 118.97174765110988, "learning_rate": 5e-06, "loss": 0.9834, "num_input_tokens_seen": 142405272, "step": 824 }, { "epoch": 0.3293365307753797, "loss": 1.2425031661987305, "loss_ce": 0.051829393953084946, "loss_xval": 1.1875, "num_input_tokens_seen": 142405272, "step": 824 }, { "epoch": 0.32973621103117506, "grad_norm": 296.29048064640887, "learning_rate": 5e-06, "loss": 0.9834, "num_input_tokens_seen": 142578256, "step": 825 }, { "epoch": 0.32973621103117506, "loss": 1.2711055278778076, "loss_ce": 0.02940632961690426, "loss_xval": 1.2421875, "num_input_tokens_seen": 142578256, "step": 825 }, { "epoch": 0.3301358912869704, "grad_norm": 51.48298081505895, "learning_rate": 5e-06, "loss": 0.762, "num_input_tokens_seen": 142751040, "step": 826 }, { "epoch": 0.3301358912869704, "loss": 0.8536327481269836, "loss_ce": 0.021448887884616852, "loss_xval": 0.83203125, "num_input_tokens_seen": 142751040, "step": 826 }, { "epoch": 0.3305355715427658, "grad_norm": 318.4751539614244, "learning_rate": 5e-06, "loss": 1.316, "num_input_tokens_seen": 142924288, "step": 827 }, { "epoch": 0.3305355715427658, "loss": 0.8164308071136475, "loss_ce": 0.023950327187776566, "loss_xval": 0.79296875, "num_input_tokens_seen": 142924288, "step": 827 }, { "epoch": 0.33093525179856115, "grad_norm": 54.08071398162099, "learning_rate": 5e-06, "loss": 0.5481, "num_input_tokens_seen": 143096912, "step": 828 }, { "epoch": 0.33093525179856115, "loss": 0.5444949269294739, "loss_ce": 0.028564732521772385, "loss_xval": 0.515625, "num_input_tokens_seen": 143096912, "step": 828 }, { "epoch": 0.3313349320543565, "grad_norm": 244.38305682024998, "learning_rate": 5e-06, "loss": 0.8846, "num_input_tokens_seen": 143270104, "step": 829 }, { "epoch": 0.3313349320543565, "loss": 1.1831023693084717, "loss_ce": 0.05102230980992317, "loss_xval": 1.1328125, "num_input_tokens_seen": 143270104, "step": 829 }, { "epoch": 0.3317346123101519, "grad_norm": 78.02138265678809, "learning_rate": 5e-06, "loss": 1.0717, "num_input_tokens_seen": 143442832, "step": 830 }, { "epoch": 0.3317346123101519, "loss": 1.199249267578125, "loss_ce": 0.02102671191096306, "loss_xval": 1.1796875, "num_input_tokens_seen": 143442832, "step": 830 }, { "epoch": 0.33213429256594723, "grad_norm": 244.2166319233294, "learning_rate": 5e-06, "loss": 0.9575, "num_input_tokens_seen": 143615664, "step": 831 }, { "epoch": 0.33213429256594723, "loss": 0.8177416324615479, "loss_ce": 0.02526114135980606, "loss_xval": 0.79296875, "num_input_tokens_seen": 143615664, "step": 831 }, { "epoch": 0.33253397282174263, "grad_norm": 202.87305428352477, "learning_rate": 5e-06, "loss": 1.0996, "num_input_tokens_seen": 143788624, "step": 832 }, { "epoch": 0.33253397282174263, "loss": 0.7815216183662415, "loss_ce": 0.024685688316822052, "loss_xval": 0.7578125, "num_input_tokens_seen": 143788624, "step": 832 }, { "epoch": 0.33293365307753797, "grad_norm": 267.1015074921022, "learning_rate": 5e-06, "loss": 0.8148, "num_input_tokens_seen": 143961728, "step": 833 }, { "epoch": 0.33293365307753797, "loss": 1.0413076877593994, "loss_ce": 0.030077166855335236, "loss_xval": 1.0078125, "num_input_tokens_seen": 143961728, "step": 833 }, { "epoch": 0.3333333333333333, "grad_norm": 249.0188358069812, "learning_rate": 5e-06, "loss": 0.6692, "num_input_tokens_seen": 144134792, "step": 834 }, { "epoch": 0.3333333333333333, "loss": 0.577985405921936, "loss_ce": 0.025495212525129318, "loss_xval": 0.55078125, "num_input_tokens_seen": 144134792, "step": 834 }, { "epoch": 0.3337330135891287, "grad_norm": 168.98599906136002, "learning_rate": 5e-06, "loss": 0.7907, "num_input_tokens_seen": 144307776, "step": 835 }, { "epoch": 0.3337330135891287, "loss": 0.7566037774085999, "loss_ce": 0.03101782500743866, "loss_xval": 0.7265625, "num_input_tokens_seen": 144307776, "step": 835 }, { "epoch": 0.33413269384492406, "grad_norm": 223.09060693099514, "learning_rate": 5e-06, "loss": 0.8478, "num_input_tokens_seen": 144480840, "step": 836 }, { "epoch": 0.33413269384492406, "loss": 0.6522843241691589, "loss_ce": 0.03289957344532013, "loss_xval": 0.62109375, "num_input_tokens_seen": 144480840, "step": 836 }, { "epoch": 0.3345323741007194, "grad_norm": 26.914313057404094, "learning_rate": 5e-06, "loss": 0.3649, "num_input_tokens_seen": 144654280, "step": 837 }, { "epoch": 0.3345323741007194, "loss": 0.3446645140647888, "loss_ce": 0.02459615468978882, "loss_xval": 0.3203125, "num_input_tokens_seen": 144654280, "step": 837 }, { "epoch": 0.3349320543565148, "grad_norm": 180.4789733229281, "learning_rate": 5e-06, "loss": 0.9611, "num_input_tokens_seen": 144827136, "step": 838 }, { "epoch": 0.3349320543565148, "loss": 0.5780286192893982, "loss_ce": 0.02566044218838215, "loss_xval": 0.55078125, "num_input_tokens_seen": 144827136, "step": 838 }, { "epoch": 0.33533173461231014, "grad_norm": 90.74097333134976, "learning_rate": 5e-06, "loss": 0.6284, "num_input_tokens_seen": 145000568, "step": 839 }, { "epoch": 0.33533173461231014, "loss": 0.7638094425201416, "loss_ce": 0.0294344425201416, "loss_xval": 0.734375, "num_input_tokens_seen": 145000568, "step": 839 }, { "epoch": 0.33573141486810554, "grad_norm": 112.5969382238021, "learning_rate": 5e-06, "loss": 0.8517, "num_input_tokens_seen": 145173128, "step": 840 }, { "epoch": 0.33573141486810554, "loss": 0.4753795266151428, "loss_ce": 0.025672491639852524, "loss_xval": 0.44921875, "num_input_tokens_seen": 145173128, "step": 840 }, { "epoch": 0.3361310951239009, "grad_norm": 160.69367514258622, "learning_rate": 5e-06, "loss": 0.9031, "num_input_tokens_seen": 145345760, "step": 841 }, { "epoch": 0.3361310951239009, "loss": 1.1150989532470703, "loss_ce": 0.022447630763053894, "loss_xval": 1.09375, "num_input_tokens_seen": 145345760, "step": 841 }, { "epoch": 0.3365307753796962, "grad_norm": 83.89344588868427, "learning_rate": 5e-06, "loss": 0.6187, "num_input_tokens_seen": 145518800, "step": 842 }, { "epoch": 0.3365307753796962, "loss": 0.8424967527389526, "loss_ce": 0.020902525633573532, "loss_xval": 0.8203125, "num_input_tokens_seen": 145518800, "step": 842 }, { "epoch": 0.3369304556354916, "grad_norm": 128.21717519579028, "learning_rate": 5e-06, "loss": 0.7996, "num_input_tokens_seen": 145691888, "step": 843 }, { "epoch": 0.3369304556354916, "loss": 0.5368384718894958, "loss_ce": 0.01852792128920555, "loss_xval": 0.51953125, "num_input_tokens_seen": 145691888, "step": 843 }, { "epoch": 0.33733013589128696, "grad_norm": 27.261036954593337, "learning_rate": 5e-06, "loss": 0.5289, "num_input_tokens_seen": 145865024, "step": 844 }, { "epoch": 0.33733013589128696, "loss": 0.6117612719535828, "loss_ce": 0.02216166816651821, "loss_xval": 0.58984375, "num_input_tokens_seen": 145865024, "step": 844 }, { "epoch": 0.3377298161470823, "grad_norm": 148.51182247944908, "learning_rate": 5e-06, "loss": 0.8651, "num_input_tokens_seen": 146037824, "step": 845 }, { "epoch": 0.3377298161470823, "loss": 0.6133238077163696, "loss_ce": 0.017864754423499107, "loss_xval": 0.59375, "num_input_tokens_seen": 146037824, "step": 845 }, { "epoch": 0.3381294964028777, "grad_norm": 71.55720025246332, "learning_rate": 5e-06, "loss": 0.5425, "num_input_tokens_seen": 146210432, "step": 846 }, { "epoch": 0.3381294964028777, "loss": 0.8297093510627747, "loss_ce": 0.014645876362919807, "loss_xval": 0.81640625, "num_input_tokens_seen": 146210432, "step": 846 }, { "epoch": 0.33852917665867305, "grad_norm": 97.5892415923415, "learning_rate": 5e-06, "loss": 0.6328, "num_input_tokens_seen": 146383528, "step": 847 }, { "epoch": 0.33852917665867305, "loss": 0.6266300082206726, "loss_ce": 0.0211612731218338, "loss_xval": 0.60546875, "num_input_tokens_seen": 146383528, "step": 847 }, { "epoch": 0.33892885691446845, "grad_norm": 153.6287910779392, "learning_rate": 5e-06, "loss": 0.6496, "num_input_tokens_seen": 146555992, "step": 848 }, { "epoch": 0.33892885691446845, "loss": 0.628968358039856, "loss_ce": 0.012330153957009315, "loss_xval": 0.6171875, "num_input_tokens_seen": 146555992, "step": 848 }, { "epoch": 0.3393285371702638, "grad_norm": 119.93950349993199, "learning_rate": 5e-06, "loss": 0.47, "num_input_tokens_seen": 146728976, "step": 849 }, { "epoch": 0.3393285371702638, "loss": 0.5641285181045532, "loss_ce": 0.013103111647069454, "loss_xval": 0.55078125, "num_input_tokens_seen": 146728976, "step": 849 }, { "epoch": 0.33972821742605913, "grad_norm": 151.5642271469956, "learning_rate": 5e-06, "loss": 0.828, "num_input_tokens_seen": 146901664, "step": 850 }, { "epoch": 0.33972821742605913, "loss": 0.6333939433097839, "loss_ce": 0.009858794510364532, "loss_xval": 0.625, "num_input_tokens_seen": 146901664, "step": 850 }, { "epoch": 0.34012789768185453, "grad_norm": 63.2888069151697, "learning_rate": 5e-06, "loss": 0.5584, "num_input_tokens_seen": 147074640, "step": 851 }, { "epoch": 0.34012789768185453, "loss": 0.6273359656333923, "loss_ce": 0.010331545025110245, "loss_xval": 0.6171875, "num_input_tokens_seen": 147074640, "step": 851 }, { "epoch": 0.3405275779376499, "grad_norm": 133.0194702803221, "learning_rate": 5e-06, "loss": 0.9838, "num_input_tokens_seen": 147247328, "step": 852 }, { "epoch": 0.3405275779376499, "loss": 1.4191014766693115, "loss_ce": 0.011447655037045479, "loss_xval": 1.40625, "num_input_tokens_seen": 147247328, "step": 852 }, { "epoch": 0.34092725819344527, "grad_norm": 67.27535330850881, "learning_rate": 5e-06, "loss": 0.4894, "num_input_tokens_seen": 147420576, "step": 853 }, { "epoch": 0.34092725819344527, "loss": 0.5238457918167114, "loss_ce": 0.008770117536187172, "loss_xval": 0.515625, "num_input_tokens_seen": 147420576, "step": 853 }, { "epoch": 0.3413269384492406, "grad_norm": 155.66643248889466, "learning_rate": 5e-06, "loss": 0.7028, "num_input_tokens_seen": 147593552, "step": 854 }, { "epoch": 0.3413269384492406, "loss": 0.6921306252479553, "loss_ce": 0.020133551210165024, "loss_xval": 0.671875, "num_input_tokens_seen": 147593552, "step": 854 }, { "epoch": 0.34172661870503596, "grad_norm": 62.82339145784998, "learning_rate": 5e-06, "loss": 0.4399, "num_input_tokens_seen": 147766336, "step": 855 }, { "epoch": 0.34172661870503596, "loss": 0.4368290901184082, "loss_ce": 0.008331773802638054, "loss_xval": 0.427734375, "num_input_tokens_seen": 147766336, "step": 855 }, { "epoch": 0.34212629896083135, "grad_norm": 169.0372459131588, "learning_rate": 5e-06, "loss": 1.1117, "num_input_tokens_seen": 147939288, "step": 856 }, { "epoch": 0.34212629896083135, "loss": 1.1313327550888062, "loss_ce": 0.01048314105719328, "loss_xval": 1.1171875, "num_input_tokens_seen": 147939288, "step": 856 }, { "epoch": 0.3425259792166267, "grad_norm": 160.87608512503493, "learning_rate": 5e-06, "loss": 0.9064, "num_input_tokens_seen": 148112320, "step": 857 }, { "epoch": 0.3425259792166267, "loss": 1.256239891052246, "loss_ce": 0.008559215813875198, "loss_xval": 1.25, "num_input_tokens_seen": 148112320, "step": 857 }, { "epoch": 0.34292565947242204, "grad_norm": 105.44623396035678, "learning_rate": 5e-06, "loss": 0.7052, "num_input_tokens_seen": 148285280, "step": 858 }, { "epoch": 0.34292565947242204, "loss": 0.8094460368156433, "loss_ce": 0.010129651054739952, "loss_xval": 0.80078125, "num_input_tokens_seen": 148285280, "step": 858 }, { "epoch": 0.34332533972821744, "grad_norm": 120.42122131819227, "learning_rate": 5e-06, "loss": 0.737, "num_input_tokens_seen": 148457720, "step": 859 }, { "epoch": 0.34332533972821744, "loss": 0.742914080619812, "loss_ce": 0.009515605866909027, "loss_xval": 0.734375, "num_input_tokens_seen": 148457720, "step": 859 }, { "epoch": 0.3437250199840128, "grad_norm": 130.49015429141713, "learning_rate": 5e-06, "loss": 0.7364, "num_input_tokens_seen": 148631048, "step": 860 }, { "epoch": 0.3437250199840128, "loss": 1.0588502883911133, "loss_ce": 0.011364908888936043, "loss_xval": 1.046875, "num_input_tokens_seen": 148631048, "step": 860 }, { "epoch": 0.3441247002398082, "grad_norm": 136.66847470521824, "learning_rate": 5e-06, "loss": 0.8036, "num_input_tokens_seen": 148804048, "step": 861 }, { "epoch": 0.3441247002398082, "loss": 0.9435184597969055, "loss_ce": 0.010290941223502159, "loss_xval": 0.93359375, "num_input_tokens_seen": 148804048, "step": 861 }, { "epoch": 0.3445243804956035, "grad_norm": 150.40577419616318, "learning_rate": 5e-06, "loss": 0.8159, "num_input_tokens_seen": 148977112, "step": 862 }, { "epoch": 0.3445243804956035, "loss": 1.0028644800186157, "loss_ce": 0.016902528703212738, "loss_xval": 0.984375, "num_input_tokens_seen": 148977112, "step": 862 }, { "epoch": 0.34492406075139886, "grad_norm": 59.347926899056496, "learning_rate": 5e-06, "loss": 0.6159, "num_input_tokens_seen": 149149576, "step": 863 }, { "epoch": 0.34492406075139886, "loss": 0.5505508184432983, "loss_ce": 0.01075590681284666, "loss_xval": 0.5390625, "num_input_tokens_seen": 149149576, "step": 863 }, { "epoch": 0.34532374100719426, "grad_norm": 19.849931423983264, "learning_rate": 5e-06, "loss": 0.3624, "num_input_tokens_seen": 149322448, "step": 864 }, { "epoch": 0.34532374100719426, "loss": 0.39450711011886597, "loss_ce": 0.02286403253674507, "loss_xval": 0.37109375, "num_input_tokens_seen": 149322448, "step": 864 }, { "epoch": 0.3457234212629896, "grad_norm": 93.95032815890265, "learning_rate": 5e-06, "loss": 0.7113, "num_input_tokens_seen": 149495400, "step": 865 }, { "epoch": 0.3457234212629896, "loss": 0.6392983794212341, "loss_ce": 0.007950708270072937, "loss_xval": 0.6328125, "num_input_tokens_seen": 149495400, "step": 865 }, { "epoch": 0.34612310151878495, "grad_norm": 47.376116576369185, "learning_rate": 5e-06, "loss": 0.5306, "num_input_tokens_seen": 149668592, "step": 866 }, { "epoch": 0.34612310151878495, "loss": 0.8008941411972046, "loss_ce": 0.008505244739353657, "loss_xval": 0.79296875, "num_input_tokens_seen": 149668592, "step": 866 }, { "epoch": 0.34652278177458035, "grad_norm": 79.5279904225274, "learning_rate": 5e-06, "loss": 0.7948, "num_input_tokens_seen": 149841352, "step": 867 }, { "epoch": 0.34652278177458035, "loss": 0.7384788990020752, "loss_ce": 0.010390488430857658, "loss_xval": 0.7265625, "num_input_tokens_seen": 149841352, "step": 867 }, { "epoch": 0.3469224620303757, "grad_norm": 135.31620195017132, "learning_rate": 5e-06, "loss": 0.9246, "num_input_tokens_seen": 150014072, "step": 868 }, { "epoch": 0.3469224620303757, "loss": 1.0314879417419434, "loss_ce": 0.010796924121677876, "loss_xval": 1.0234375, "num_input_tokens_seen": 150014072, "step": 868 }, { "epoch": 0.3473221422861711, "grad_norm": 127.44588322169365, "learning_rate": 5e-06, "loss": 0.932, "num_input_tokens_seen": 150187008, "step": 869 }, { "epoch": 0.3473221422861711, "loss": 0.8235166668891907, "loss_ce": 0.01760845072567463, "loss_xval": 0.8046875, "num_input_tokens_seen": 150187008, "step": 869 }, { "epoch": 0.34772182254196643, "grad_norm": 135.87113372177404, "learning_rate": 5e-06, "loss": 0.7667, "num_input_tokens_seen": 150359864, "step": 870 }, { "epoch": 0.34772182254196643, "loss": 0.5125239491462708, "loss_ce": 0.008007319644093513, "loss_xval": 0.50390625, "num_input_tokens_seen": 150359864, "step": 870 }, { "epoch": 0.3481215027977618, "grad_norm": 146.7548911832383, "learning_rate": 5e-06, "loss": 0.736, "num_input_tokens_seen": 150532848, "step": 871 }, { "epoch": 0.3481215027977618, "loss": 0.9291001558303833, "loss_ce": 0.009239314123988152, "loss_xval": 0.91796875, "num_input_tokens_seen": 150532848, "step": 871 }, { "epoch": 0.34852118305355717, "grad_norm": 133.2402111732672, "learning_rate": 5e-06, "loss": 0.8006, "num_input_tokens_seen": 150705504, "step": 872 }, { "epoch": 0.34852118305355717, "loss": 0.8471918106079102, "loss_ce": 0.01296328753232956, "loss_xval": 0.8359375, "num_input_tokens_seen": 150705504, "step": 872 }, { "epoch": 0.3489208633093525, "grad_norm": 110.66209021170407, "learning_rate": 5e-06, "loss": 0.7153, "num_input_tokens_seen": 150878208, "step": 873 }, { "epoch": 0.3489208633093525, "loss": 0.5486522316932678, "loss_ce": 0.01294666901230812, "loss_xval": 0.53515625, "num_input_tokens_seen": 150878208, "step": 873 }, { "epoch": 0.34932054356514786, "grad_norm": 188.24093809918526, "learning_rate": 5e-06, "loss": 0.5398, "num_input_tokens_seen": 151051624, "step": 874 }, { "epoch": 0.34932054356514786, "loss": 0.646808922290802, "loss_ce": 0.008014976046979427, "loss_xval": 0.640625, "num_input_tokens_seen": 151051624, "step": 874 }, { "epoch": 0.34972022382094325, "grad_norm": 64.80165189770501, "learning_rate": 5e-06, "loss": 0.5948, "num_input_tokens_seen": 151224760, "step": 875 }, { "epoch": 0.34972022382094325, "loss": 0.7177197933197021, "loss_ce": 0.008735395967960358, "loss_xval": 0.7109375, "num_input_tokens_seen": 151224760, "step": 875 }, { "epoch": 0.3501199040767386, "grad_norm": 217.1609995782766, "learning_rate": 5e-06, "loss": 0.8025, "num_input_tokens_seen": 151397592, "step": 876 }, { "epoch": 0.3501199040767386, "loss": 0.7121882438659668, "loss_ce": 0.014678522013127804, "loss_xval": 0.69921875, "num_input_tokens_seen": 151397592, "step": 876 }, { "epoch": 0.350519584332534, "grad_norm": 36.431244186683855, "learning_rate": 5e-06, "loss": 0.8813, "num_input_tokens_seen": 151570816, "step": 877 }, { "epoch": 0.350519584332534, "loss": 0.89690101146698, "loss_ce": 0.010609478689730167, "loss_xval": 0.88671875, "num_input_tokens_seen": 151570816, "step": 877 }, { "epoch": 0.35091926458832934, "grad_norm": 177.67298531218174, "learning_rate": 5e-06, "loss": 0.7987, "num_input_tokens_seen": 151744144, "step": 878 }, { "epoch": 0.35091926458832934, "loss": 0.9523735046386719, "loss_ce": 0.013164570555090904, "loss_xval": 0.9375, "num_input_tokens_seen": 151744144, "step": 878 }, { "epoch": 0.3513189448441247, "grad_norm": 71.39991832784374, "learning_rate": 5e-06, "loss": 0.5297, "num_input_tokens_seen": 151917624, "step": 879 }, { "epoch": 0.3513189448441247, "loss": 0.5013617277145386, "loss_ce": 0.012103933840990067, "loss_xval": 0.48828125, "num_input_tokens_seen": 151917624, "step": 879 }, { "epoch": 0.3517186250999201, "grad_norm": 70.96128356278321, "learning_rate": 5e-06, "loss": 0.6057, "num_input_tokens_seen": 152090752, "step": 880 }, { "epoch": 0.3517186250999201, "loss": 0.4656725227832794, "loss_ce": 0.016270659863948822, "loss_xval": 0.44921875, "num_input_tokens_seen": 152090752, "step": 880 }, { "epoch": 0.3521183053557154, "grad_norm": 82.47791554411735, "learning_rate": 5e-06, "loss": 0.6792, "num_input_tokens_seen": 152263832, "step": 881 }, { "epoch": 0.3521183053557154, "loss": 0.7818809747695923, "loss_ce": 0.015645675361156464, "loss_xval": 0.765625, "num_input_tokens_seen": 152263832, "step": 881 }, { "epoch": 0.35251798561151076, "grad_norm": 60.893015026895945, "learning_rate": 5e-06, "loss": 0.6402, "num_input_tokens_seen": 152436992, "step": 882 }, { "epoch": 0.35251798561151076, "loss": 0.6701950430870056, "loss_ce": 0.012968515045940876, "loss_xval": 0.65625, "num_input_tokens_seen": 152436992, "step": 882 }, { "epoch": 0.35291766586730616, "grad_norm": 104.03511210850309, "learning_rate": 5e-06, "loss": 0.6192, "num_input_tokens_seen": 152609648, "step": 883 }, { "epoch": 0.35291766586730616, "loss": 0.603535532951355, "loss_ce": 0.01662144437432289, "loss_xval": 0.5859375, "num_input_tokens_seen": 152609648, "step": 883 }, { "epoch": 0.3533173461231015, "grad_norm": 22.54287001703602, "learning_rate": 5e-06, "loss": 0.4427, "num_input_tokens_seen": 152782776, "step": 884 }, { "epoch": 0.3533173461231015, "loss": 0.3957730233669281, "loss_ce": 0.016134345903992653, "loss_xval": 0.37890625, "num_input_tokens_seen": 152782776, "step": 884 }, { "epoch": 0.3537170263788969, "grad_norm": 84.24382922528733, "learning_rate": 5e-06, "loss": 0.6745, "num_input_tokens_seen": 152955672, "step": 885 }, { "epoch": 0.3537170263788969, "loss": 0.5475041270256042, "loss_ce": 0.012897195294499397, "loss_xval": 0.53515625, "num_input_tokens_seen": 152955672, "step": 885 }, { "epoch": 0.35411670663469225, "grad_norm": 88.2327166019281, "learning_rate": 5e-06, "loss": 0.539, "num_input_tokens_seen": 153128288, "step": 886 }, { "epoch": 0.35411670663469225, "loss": 0.502876877784729, "loss_ce": 0.01227623037993908, "loss_xval": 0.490234375, "num_input_tokens_seen": 153128288, "step": 886 }, { "epoch": 0.3545163868904876, "grad_norm": 128.72303277494018, "learning_rate": 5e-06, "loss": 0.7006, "num_input_tokens_seen": 153300984, "step": 887 }, { "epoch": 0.3545163868904876, "loss": 0.760562539100647, "loss_ce": 0.010928753763437271, "loss_xval": 0.75, "num_input_tokens_seen": 153300984, "step": 887 }, { "epoch": 0.354916067146283, "grad_norm": 54.2554350644671, "learning_rate": 5e-06, "loss": 0.5323, "num_input_tokens_seen": 153474224, "step": 888 }, { "epoch": 0.354916067146283, "loss": 0.6280389428138733, "loss_ce": 0.010485243052244186, "loss_xval": 0.6171875, "num_input_tokens_seen": 153474224, "step": 888 }, { "epoch": 0.35531574740207833, "grad_norm": 129.45656745633136, "learning_rate": 5e-06, "loss": 0.7172, "num_input_tokens_seen": 153647280, "step": 889 }, { "epoch": 0.35531574740207833, "loss": 0.9391533136367798, "loss_ce": 0.008367151021957397, "loss_xval": 0.9296875, "num_input_tokens_seen": 153647280, "step": 889 }, { "epoch": 0.35571542765787373, "grad_norm": 44.89891618018207, "learning_rate": 5e-06, "loss": 0.4303, "num_input_tokens_seen": 153820864, "step": 890 }, { "epoch": 0.35571542765787373, "loss": 0.4807822108268738, "loss_ce": 0.012703584507107735, "loss_xval": 0.46875, "num_input_tokens_seen": 153820864, "step": 890 }, { "epoch": 0.35611510791366907, "grad_norm": 116.72687188293727, "learning_rate": 5e-06, "loss": 0.5185, "num_input_tokens_seen": 153994288, "step": 891 }, { "epoch": 0.35611510791366907, "loss": 0.284774512052536, "loss_ce": 0.010848723351955414, "loss_xval": 0.2734375, "num_input_tokens_seen": 153994288, "step": 891 }, { "epoch": 0.3565147881694644, "grad_norm": 26.176387267105216, "learning_rate": 5e-06, "loss": 0.3183, "num_input_tokens_seen": 154167208, "step": 892 }, { "epoch": 0.3565147881694644, "loss": 0.1847338080406189, "loss_ce": 0.009440846741199493, "loss_xval": 0.17578125, "num_input_tokens_seen": 154167208, "step": 892 }, { "epoch": 0.3569144684252598, "grad_norm": 165.85099821106596, "learning_rate": 5e-06, "loss": 0.6986, "num_input_tokens_seen": 154339944, "step": 893 }, { "epoch": 0.3569144684252598, "loss": 0.6996178030967712, "loss_ce": 0.008089495822787285, "loss_xval": 0.69140625, "num_input_tokens_seen": 154339944, "step": 893 }, { "epoch": 0.35731414868105515, "grad_norm": 60.57304398442198, "learning_rate": 5e-06, "loss": 0.6213, "num_input_tokens_seen": 154513144, "step": 894 }, { "epoch": 0.35731414868105515, "loss": 0.6562942266464233, "loss_ce": 0.0067581310868263245, "loss_xval": 0.6484375, "num_input_tokens_seen": 154513144, "step": 894 }, { "epoch": 0.3577138289368505, "grad_norm": 88.21158150555097, "learning_rate": 5e-06, "loss": 0.9994, "num_input_tokens_seen": 154686024, "step": 895 }, { "epoch": 0.3577138289368505, "loss": 0.9512639045715332, "loss_ce": 0.006012419238686562, "loss_xval": 0.9453125, "num_input_tokens_seen": 154686024, "step": 895 }, { "epoch": 0.3581135091926459, "grad_norm": 55.98236696389787, "learning_rate": 5e-06, "loss": 0.8411, "num_input_tokens_seen": 154858752, "step": 896 }, { "epoch": 0.3581135091926459, "loss": 0.9027426242828369, "loss_ce": 0.008821753785014153, "loss_xval": 0.89453125, "num_input_tokens_seen": 154858752, "step": 896 }, { "epoch": 0.35851318944844124, "grad_norm": 86.30176471176888, "learning_rate": 5e-06, "loss": 0.5935, "num_input_tokens_seen": 155031920, "step": 897 }, { "epoch": 0.35851318944844124, "loss": 0.8349786400794983, "loss_ce": 0.012407823465764523, "loss_xval": 0.82421875, "num_input_tokens_seen": 155031920, "step": 897 }, { "epoch": 0.35891286970423664, "grad_norm": 47.41906607681754, "learning_rate": 5e-06, "loss": 0.5413, "num_input_tokens_seen": 155205008, "step": 898 }, { "epoch": 0.35891286970423664, "loss": 0.5134440660476685, "loss_ce": 0.007950928062200546, "loss_xval": 0.50390625, "num_input_tokens_seen": 155205008, "step": 898 }, { "epoch": 0.359312549960032, "grad_norm": 30.61976486692005, "learning_rate": 5e-06, "loss": 0.6285, "num_input_tokens_seen": 155378224, "step": 899 }, { "epoch": 0.359312549960032, "loss": 0.762615442276001, "loss_ce": 0.012127195484936237, "loss_xval": 0.75, "num_input_tokens_seen": 155378224, "step": 899 }, { "epoch": 0.3597122302158273, "grad_norm": 43.585381340210624, "learning_rate": 5e-06, "loss": 0.4863, "num_input_tokens_seen": 155551440, "step": 900 }, { "epoch": 0.3597122302158273, "loss": 0.48361706733703613, "loss_ce": 0.008275268599390984, "loss_xval": 0.474609375, "num_input_tokens_seen": 155551440, "step": 900 }, { "epoch": 0.3601119104716227, "grad_norm": 30.9989474941419, "learning_rate": 5e-06, "loss": 0.4965, "num_input_tokens_seen": 155724368, "step": 901 }, { "epoch": 0.3601119104716227, "loss": 0.49682554602622986, "loss_ce": 0.006957381498068571, "loss_xval": 0.490234375, "num_input_tokens_seen": 155724368, "step": 901 }, { "epoch": 0.36051159072741806, "grad_norm": 113.42580772320302, "learning_rate": 5e-06, "loss": 0.6446, "num_input_tokens_seen": 155897552, "step": 902 }, { "epoch": 0.36051159072741806, "loss": 0.9667686223983765, "loss_ce": 0.007417993154376745, "loss_xval": 0.9609375, "num_input_tokens_seen": 155897552, "step": 902 }, { "epoch": 0.3609112709832134, "grad_norm": 101.77745338904332, "learning_rate": 5e-06, "loss": 0.4489, "num_input_tokens_seen": 156070800, "step": 903 }, { "epoch": 0.3609112709832134, "loss": 0.30879414081573486, "loss_ce": 0.004655956290662289, "loss_xval": 0.3046875, "num_input_tokens_seen": 156070800, "step": 903 }, { "epoch": 0.3613109512390088, "grad_norm": 98.20446936043231, "learning_rate": 5e-06, "loss": 0.7204, "num_input_tokens_seen": 156243696, "step": 904 }, { "epoch": 0.3613109512390088, "loss": 0.5402124524116516, "loss_ce": 0.008504673838615417, "loss_xval": 0.53125, "num_input_tokens_seen": 156243696, "step": 904 }, { "epoch": 0.36171063149480415, "grad_norm": 110.94970569500595, "learning_rate": 5e-06, "loss": 0.5071, "num_input_tokens_seen": 156416472, "step": 905 }, { "epoch": 0.36171063149480415, "loss": 0.33861905336380005, "loss_ce": 0.012080967426300049, "loss_xval": 0.326171875, "num_input_tokens_seen": 156416472, "step": 905 }, { "epoch": 0.36211031175059955, "grad_norm": 125.05593579343002, "learning_rate": 5e-06, "loss": 0.4928, "num_input_tokens_seen": 156589520, "step": 906 }, { "epoch": 0.36211031175059955, "loss": 0.6595361232757568, "loss_ce": 0.008992912247776985, "loss_xval": 0.65234375, "num_input_tokens_seen": 156589520, "step": 906 }, { "epoch": 0.3625099920063949, "grad_norm": 73.10342927626562, "learning_rate": 5e-06, "loss": 0.8154, "num_input_tokens_seen": 156762224, "step": 907 }, { "epoch": 0.3625099920063949, "loss": 0.6952431797981262, "loss_ce": 0.009085968136787415, "loss_xval": 0.6875, "num_input_tokens_seen": 156762224, "step": 907 }, { "epoch": 0.36290967226219023, "grad_norm": 126.39497282050058, "learning_rate": 5e-06, "loss": 1.1672, "num_input_tokens_seen": 156935288, "step": 908 }, { "epoch": 0.36290967226219023, "loss": 0.9857476949691772, "loss_ce": 0.006133475806564093, "loss_xval": 0.98046875, "num_input_tokens_seen": 156935288, "step": 908 }, { "epoch": 0.36330935251798563, "grad_norm": 54.826078470566614, "learning_rate": 5e-06, "loss": 0.504, "num_input_tokens_seen": 157108208, "step": 909 }, { "epoch": 0.36330935251798563, "loss": 0.29077398777008057, "loss_ce": 0.0048853312619030476, "loss_xval": 0.28515625, "num_input_tokens_seen": 157108208, "step": 909 }, { "epoch": 0.36370903277378097, "grad_norm": 71.30519374447884, "learning_rate": 5e-06, "loss": 0.6028, "num_input_tokens_seen": 157281176, "step": 910 }, { "epoch": 0.36370903277378097, "loss": 0.48019158840179443, "loss_ce": 0.007047041319310665, "loss_xval": 0.47265625, "num_input_tokens_seen": 157281176, "step": 910 }, { "epoch": 0.3641087130295763, "grad_norm": 52.76501098451776, "learning_rate": 5e-06, "loss": 0.6989, "num_input_tokens_seen": 157454056, "step": 911 }, { "epoch": 0.3641087130295763, "loss": 0.8536124229431152, "loss_ce": 0.005712021142244339, "loss_xval": 0.84765625, "num_input_tokens_seen": 157454056, "step": 911 }, { "epoch": 0.3645083932853717, "grad_norm": 70.87644756607571, "learning_rate": 5e-06, "loss": 0.5284, "num_input_tokens_seen": 157626944, "step": 912 }, { "epoch": 0.3645083932853717, "loss": 0.7316247224807739, "loss_ce": 0.004940135404467583, "loss_xval": 0.7265625, "num_input_tokens_seen": 157626944, "step": 912 }, { "epoch": 0.36490807354116706, "grad_norm": 102.12641205437491, "learning_rate": 5e-06, "loss": 0.7, "num_input_tokens_seen": 157800248, "step": 913 }, { "epoch": 0.36490807354116706, "loss": 0.8764115571975708, "loss_ce": 0.00440229382365942, "loss_xval": 0.87109375, "num_input_tokens_seen": 157800248, "step": 913 }, { "epoch": 0.36530775379696245, "grad_norm": 79.45654741039287, "learning_rate": 5e-06, "loss": 0.6452, "num_input_tokens_seen": 157973304, "step": 914 }, { "epoch": 0.36530775379696245, "loss": 0.3669845163822174, "loss_ce": 0.009074367582798004, "loss_xval": 0.357421875, "num_input_tokens_seen": 157973304, "step": 914 }, { "epoch": 0.3657074340527578, "grad_norm": 124.30904914641106, "learning_rate": 5e-06, "loss": 0.7281, "num_input_tokens_seen": 158146416, "step": 915 }, { "epoch": 0.3657074340527578, "loss": 0.82874596118927, "loss_ce": 0.009165898896753788, "loss_xval": 0.8203125, "num_input_tokens_seen": 158146416, "step": 915 }, { "epoch": 0.36610711430855314, "grad_norm": 42.101453883892475, "learning_rate": 5e-06, "loss": 0.6469, "num_input_tokens_seen": 158319176, "step": 916 }, { "epoch": 0.36610711430855314, "loss": 0.7069438695907593, "loss_ce": 0.017673827707767487, "loss_xval": 0.6875, "num_input_tokens_seen": 158319176, "step": 916 }, { "epoch": 0.36650679456434854, "grad_norm": 162.51287996615943, "learning_rate": 5e-06, "loss": 0.6696, "num_input_tokens_seen": 158492232, "step": 917 }, { "epoch": 0.36650679456434854, "loss": 0.90346360206604, "loss_ce": 0.005209219641983509, "loss_xval": 0.8984375, "num_input_tokens_seen": 158492232, "step": 917 }, { "epoch": 0.3669064748201439, "grad_norm": 100.54356211089177, "learning_rate": 5e-06, "loss": 0.5464, "num_input_tokens_seen": 158665144, "step": 918 }, { "epoch": 0.3669064748201439, "loss": 0.5094872713088989, "loss_ce": 0.011562451720237732, "loss_xval": 0.498046875, "num_input_tokens_seen": 158665144, "step": 918 }, { "epoch": 0.3673061550759392, "grad_norm": 76.05379399514754, "learning_rate": 5e-06, "loss": 0.7202, "num_input_tokens_seen": 158838016, "step": 919 }, { "epoch": 0.3673061550759392, "loss": 0.7858811616897583, "loss_ce": 0.008903573267161846, "loss_xval": 0.77734375, "num_input_tokens_seen": 158838016, "step": 919 }, { "epoch": 0.3677058353317346, "grad_norm": 90.74075761256418, "learning_rate": 5e-06, "loss": 0.9269, "num_input_tokens_seen": 159011192, "step": 920 }, { "epoch": 0.3677058353317346, "loss": 0.9359513521194458, "loss_ce": 0.00675212824717164, "loss_xval": 0.9296875, "num_input_tokens_seen": 159011192, "step": 920 }, { "epoch": 0.36810551558752996, "grad_norm": 45.087533766808335, "learning_rate": 5e-06, "loss": 0.683, "num_input_tokens_seen": 159184136, "step": 921 }, { "epoch": 0.36810551558752996, "loss": 0.9866589307785034, "loss_ce": 0.005701903253793716, "loss_xval": 0.98046875, "num_input_tokens_seen": 159184136, "step": 921 }, { "epoch": 0.36850519584332536, "grad_norm": 107.55288335570066, "learning_rate": 5e-06, "loss": 0.676, "num_input_tokens_seen": 159356600, "step": 922 }, { "epoch": 0.36850519584332536, "loss": 0.4224510192871094, "loss_ce": 0.007106784265488386, "loss_xval": 0.416015625, "num_input_tokens_seen": 159356600, "step": 922 }, { "epoch": 0.3689048760991207, "grad_norm": 54.68109555714196, "learning_rate": 5e-06, "loss": 0.7, "num_input_tokens_seen": 159529752, "step": 923 }, { "epoch": 0.3689048760991207, "loss": 0.4360928535461426, "loss_ce": 0.011532355099916458, "loss_xval": 0.423828125, "num_input_tokens_seen": 159529752, "step": 923 }, { "epoch": 0.36930455635491605, "grad_norm": 42.085657682472316, "learning_rate": 5e-06, "loss": 0.5586, "num_input_tokens_seen": 159702560, "step": 924 }, { "epoch": 0.36930455635491605, "loss": 0.6730107069015503, "loss_ce": 0.005652267951518297, "loss_xval": 0.66796875, "num_input_tokens_seen": 159702560, "step": 924 }, { "epoch": 0.36970423661071145, "grad_norm": 47.95856995776537, "learning_rate": 5e-06, "loss": 1.0306, "num_input_tokens_seen": 159875480, "step": 925 }, { "epoch": 0.36970423661071145, "loss": 1.4013919830322266, "loss_ce": 0.005395848304033279, "loss_xval": 1.3984375, "num_input_tokens_seen": 159875480, "step": 925 }, { "epoch": 0.3701039168665068, "grad_norm": 63.74419610065917, "learning_rate": 5e-06, "loss": 0.8646, "num_input_tokens_seen": 160048464, "step": 926 }, { "epoch": 0.3701039168665068, "loss": 0.7054557800292969, "loss_ce": 0.008556396700441837, "loss_xval": 0.6953125, "num_input_tokens_seen": 160048464, "step": 926 }, { "epoch": 0.37050359712230213, "grad_norm": 120.90791495364564, "learning_rate": 5e-06, "loss": 0.6159, "num_input_tokens_seen": 160217688, "step": 927 }, { "epoch": 0.37050359712230213, "loss": 0.663061261177063, "loss_ce": 0.006322955247014761, "loss_xval": 0.65625, "num_input_tokens_seen": 160217688, "step": 927 }, { "epoch": 0.37090327737809753, "grad_norm": 30.76703590033193, "learning_rate": 5e-06, "loss": 0.5309, "num_input_tokens_seen": 160390592, "step": 928 }, { "epoch": 0.37090327737809753, "loss": 0.5543885231018066, "loss_ce": 0.0052857049740850925, "loss_xval": 0.55078125, "num_input_tokens_seen": 160390592, "step": 928 }, { "epoch": 0.37130295763389287, "grad_norm": 113.83660820851645, "learning_rate": 5e-06, "loss": 0.5308, "num_input_tokens_seen": 160563232, "step": 929 }, { "epoch": 0.37130295763389287, "loss": 0.59651780128479, "loss_ce": 0.00856616348028183, "loss_xval": 0.58984375, "num_input_tokens_seen": 160563232, "step": 929 }, { "epoch": 0.37170263788968827, "grad_norm": 47.41293456779943, "learning_rate": 5e-06, "loss": 0.708, "num_input_tokens_seen": 160736496, "step": 930 }, { "epoch": 0.37170263788968827, "loss": 0.9490618705749512, "loss_ce": 0.00680114608258009, "loss_xval": 0.94140625, "num_input_tokens_seen": 160736496, "step": 930 }, { "epoch": 0.3721023181454836, "grad_norm": 158.06844665795316, "learning_rate": 5e-06, "loss": 0.644, "num_input_tokens_seen": 160909248, "step": 931 }, { "epoch": 0.3721023181454836, "loss": 0.5230578184127808, "loss_ce": 0.005845884792506695, "loss_xval": 0.515625, "num_input_tokens_seen": 160909248, "step": 931 }, { "epoch": 0.37250199840127896, "grad_norm": 51.036944593601305, "learning_rate": 5e-06, "loss": 0.7556, "num_input_tokens_seen": 161082424, "step": 932 }, { "epoch": 0.37250199840127896, "loss": 0.789161205291748, "loss_ce": 0.00980327744036913, "loss_xval": 0.78125, "num_input_tokens_seen": 161082424, "step": 932 }, { "epoch": 0.37290167865707435, "grad_norm": 64.86944261600239, "learning_rate": 5e-06, "loss": 0.5996, "num_input_tokens_seen": 161251584, "step": 933 }, { "epoch": 0.37290167865707435, "loss": 0.6159493923187256, "loss_ce": 0.007917143404483795, "loss_xval": 0.609375, "num_input_tokens_seen": 161251584, "step": 933 }, { "epoch": 0.3733013589128697, "grad_norm": 70.73385265509664, "learning_rate": 5e-06, "loss": 0.5539, "num_input_tokens_seen": 161424536, "step": 934 }, { "epoch": 0.3733013589128697, "loss": 0.46938377618789673, "loss_ce": 0.007774879224598408, "loss_xval": 0.4609375, "num_input_tokens_seen": 161424536, "step": 934 }, { "epoch": 0.3737010391686651, "grad_norm": 91.39872145012366, "learning_rate": 5e-06, "loss": 0.5084, "num_input_tokens_seen": 161597208, "step": 935 }, { "epoch": 0.3737010391686651, "loss": 0.5436455011367798, "loss_ce": 0.005315391346812248, "loss_xval": 0.5390625, "num_input_tokens_seen": 161597208, "step": 935 }, { "epoch": 0.37410071942446044, "grad_norm": 34.91207932565985, "learning_rate": 5e-06, "loss": 0.6474, "num_input_tokens_seen": 161770632, "step": 936 }, { "epoch": 0.37410071942446044, "loss": 0.6301032304763794, "loss_ce": 0.008216038346290588, "loss_xval": 0.62109375, "num_input_tokens_seen": 161770632, "step": 936 }, { "epoch": 0.3745003996802558, "grad_norm": 64.44416456277416, "learning_rate": 5e-06, "loss": 0.5532, "num_input_tokens_seen": 161943488, "step": 937 }, { "epoch": 0.3745003996802558, "loss": 0.6335973739624023, "loss_ce": 0.007498729042708874, "loss_xval": 0.625, "num_input_tokens_seen": 161943488, "step": 937 }, { "epoch": 0.3749000799360512, "grad_norm": 40.232602977833224, "learning_rate": 5e-06, "loss": 0.6844, "num_input_tokens_seen": 162116368, "step": 938 }, { "epoch": 0.3749000799360512, "loss": 0.6123180389404297, "loss_ce": 0.005567554850131273, "loss_xval": 0.60546875, "num_input_tokens_seen": 162116368, "step": 938 }, { "epoch": 0.3752997601918465, "grad_norm": 53.75798255647753, "learning_rate": 5e-06, "loss": 0.7675, "num_input_tokens_seen": 162289448, "step": 939 }, { "epoch": 0.3752997601918465, "loss": 0.9261175990104675, "loss_ce": 0.014862729236483574, "loss_xval": 0.91015625, "num_input_tokens_seen": 162289448, "step": 939 }, { "epoch": 0.37569944044764186, "grad_norm": 39.58720080674575, "learning_rate": 5e-06, "loss": 0.8083, "num_input_tokens_seen": 162462168, "step": 940 }, { "epoch": 0.37569944044764186, "loss": 0.8281588554382324, "loss_ce": 0.00760222552344203, "loss_xval": 0.8203125, "num_input_tokens_seen": 162462168, "step": 940 }, { "epoch": 0.37609912070343726, "grad_norm": 83.49540063053209, "learning_rate": 5e-06, "loss": 0.5544, "num_input_tokens_seen": 162635232, "step": 941 }, { "epoch": 0.37609912070343726, "loss": 0.6313626766204834, "loss_ce": 0.008254722692072392, "loss_xval": 0.625, "num_input_tokens_seen": 162635232, "step": 941 }, { "epoch": 0.3764988009592326, "grad_norm": 25.10804339460941, "learning_rate": 5e-06, "loss": 0.4647, "num_input_tokens_seen": 162808504, "step": 942 }, { "epoch": 0.3764988009592326, "loss": 0.6199823021888733, "loss_ce": 0.006884154863655567, "loss_xval": 0.61328125, "num_input_tokens_seen": 162808504, "step": 942 }, { "epoch": 0.376898481215028, "grad_norm": 117.86457604419098, "learning_rate": 5e-06, "loss": 0.4543, "num_input_tokens_seen": 162981520, "step": 943 }, { "epoch": 0.376898481215028, "loss": 0.6796769499778748, "loss_ce": 0.008473317138850689, "loss_xval": 0.671875, "num_input_tokens_seen": 162981520, "step": 943 }, { "epoch": 0.37729816147082335, "grad_norm": 86.42324771771638, "learning_rate": 5e-06, "loss": 0.7931, "num_input_tokens_seen": 163154416, "step": 944 }, { "epoch": 0.37729816147082335, "loss": 0.7076585292816162, "loss_ce": 0.005021838005632162, "loss_xval": 0.703125, "num_input_tokens_seen": 163154416, "step": 944 }, { "epoch": 0.3776978417266187, "grad_norm": 88.55505827409151, "learning_rate": 5e-06, "loss": 0.6935, "num_input_tokens_seen": 163327568, "step": 945 }, { "epoch": 0.3776978417266187, "loss": 0.789901852607727, "loss_ce": 0.00938426237553358, "loss_xval": 0.78125, "num_input_tokens_seen": 163327568, "step": 945 }, { "epoch": 0.3780975219824141, "grad_norm": 59.41303258965943, "learning_rate": 5e-06, "loss": 0.4882, "num_input_tokens_seen": 163500632, "step": 946 }, { "epoch": 0.3780975219824141, "loss": 0.64866042137146, "loss_ce": 0.006875764578580856, "loss_xval": 0.640625, "num_input_tokens_seen": 163500632, "step": 946 }, { "epoch": 0.37849720223820943, "grad_norm": 79.47629063266275, "learning_rate": 5e-06, "loss": 0.6103, "num_input_tokens_seen": 163673792, "step": 947 }, { "epoch": 0.37849720223820943, "loss": 0.34968358278274536, "loss_ce": 0.004804443567991257, "loss_xval": 0.345703125, "num_input_tokens_seen": 163673792, "step": 947 }, { "epoch": 0.37889688249400477, "grad_norm": 85.59131590620106, "learning_rate": 5e-06, "loss": 0.7057, "num_input_tokens_seen": 163846624, "step": 948 }, { "epoch": 0.37889688249400477, "loss": 0.6663787961006165, "loss_ce": 0.004757732152938843, "loss_xval": 0.66015625, "num_input_tokens_seen": 163846624, "step": 948 }, { "epoch": 0.37929656274980017, "grad_norm": 76.63253940661868, "learning_rate": 5e-06, "loss": 0.6282, "num_input_tokens_seen": 164019440, "step": 949 }, { "epoch": 0.37929656274980017, "loss": 0.7174029350280762, "loss_ce": 0.006465459242463112, "loss_xval": 0.7109375, "num_input_tokens_seen": 164019440, "step": 949 }, { "epoch": 0.3796962430055955, "grad_norm": 108.31628507448607, "learning_rate": 5e-06, "loss": 0.5647, "num_input_tokens_seen": 164192424, "step": 950 }, { "epoch": 0.3796962430055955, "loss": 0.6702345609664917, "loss_ce": 0.005500704515725374, "loss_xval": 0.6640625, "num_input_tokens_seen": 164192424, "step": 950 }, { "epoch": 0.3800959232613909, "grad_norm": 187.11072677320212, "learning_rate": 5e-06, "loss": 0.7636, "num_input_tokens_seen": 164365360, "step": 951 }, { "epoch": 0.3800959232613909, "loss": 0.6925037503242493, "loss_ce": 0.007933435961604118, "loss_xval": 0.68359375, "num_input_tokens_seen": 164365360, "step": 951 }, { "epoch": 0.38049560351718625, "grad_norm": 103.05122060180793, "learning_rate": 5e-06, "loss": 0.5753, "num_input_tokens_seen": 164538256, "step": 952 }, { "epoch": 0.38049560351718625, "loss": 0.4882552921772003, "loss_ce": 0.007603436708450317, "loss_xval": 0.48046875, "num_input_tokens_seen": 164538256, "step": 952 }, { "epoch": 0.3808952837729816, "grad_norm": 96.4099328188074, "learning_rate": 5e-06, "loss": 0.4646, "num_input_tokens_seen": 164711880, "step": 953 }, { "epoch": 0.3808952837729816, "loss": 0.3607138395309448, "loss_ce": 0.008541014045476913, "loss_xval": 0.3515625, "num_input_tokens_seen": 164711880, "step": 953 }, { "epoch": 0.381294964028777, "grad_norm": 67.6984785917304, "learning_rate": 5e-06, "loss": 0.8889, "num_input_tokens_seen": 164881448, "step": 954 }, { "epoch": 0.381294964028777, "loss": 0.927307665348053, "loss_ce": 0.01062064804136753, "loss_xval": 0.91796875, "num_input_tokens_seen": 164881448, "step": 954 }, { "epoch": 0.38169464428457234, "grad_norm": 153.94541262913765, "learning_rate": 5e-06, "loss": 0.8191, "num_input_tokens_seen": 165054160, "step": 955 }, { "epoch": 0.38169464428457234, "loss": 0.47236108779907227, "loss_ce": 0.006662844214588404, "loss_xval": 0.46484375, "num_input_tokens_seen": 165054160, "step": 955 }, { "epoch": 0.3820943245403677, "grad_norm": 149.72575420742487, "learning_rate": 5e-06, "loss": 1.0601, "num_input_tokens_seen": 165226944, "step": 956 }, { "epoch": 0.3820943245403677, "loss": 1.1687389612197876, "loss_ce": 0.00907099712640047, "loss_xval": 1.15625, "num_input_tokens_seen": 165226944, "step": 956 }, { "epoch": 0.3824940047961631, "grad_norm": 44.450558669277505, "learning_rate": 5e-06, "loss": 0.4047, "num_input_tokens_seen": 165399800, "step": 957 }, { "epoch": 0.3824940047961631, "loss": 0.38355177640914917, "loss_ce": 0.006476604379713535, "loss_xval": 0.376953125, "num_input_tokens_seen": 165399800, "step": 957 }, { "epoch": 0.3828936850519584, "grad_norm": 106.25175160747101, "learning_rate": 5e-06, "loss": 0.4297, "num_input_tokens_seen": 165572760, "step": 958 }, { "epoch": 0.3828936850519584, "loss": 0.5556713938713074, "loss_ce": 0.011237800121307373, "loss_xval": 0.54296875, "num_input_tokens_seen": 165572760, "step": 958 }, { "epoch": 0.3832933653077538, "grad_norm": 49.87171905029353, "learning_rate": 5e-06, "loss": 0.6636, "num_input_tokens_seen": 165745872, "step": 959 }, { "epoch": 0.3832933653077538, "loss": 0.41619065403938293, "loss_ce": 0.006583709269762039, "loss_xval": 0.41015625, "num_input_tokens_seen": 165745872, "step": 959 }, { "epoch": 0.38369304556354916, "grad_norm": 132.66081692696326, "learning_rate": 5e-06, "loss": 0.9105, "num_input_tokens_seen": 165918744, "step": 960 }, { "epoch": 0.38369304556354916, "loss": 0.9205524325370789, "loss_ce": 0.011311713606119156, "loss_xval": 0.91015625, "num_input_tokens_seen": 165918744, "step": 960 }, { "epoch": 0.3840927258193445, "grad_norm": 73.04408709563512, "learning_rate": 5e-06, "loss": 0.5809, "num_input_tokens_seen": 166087696, "step": 961 }, { "epoch": 0.3840927258193445, "loss": 0.701846718788147, "loss_ce": 0.005496594589203596, "loss_xval": 0.6953125, "num_input_tokens_seen": 166087696, "step": 961 }, { "epoch": 0.3844924060751399, "grad_norm": 142.34342052216527, "learning_rate": 5e-06, "loss": 0.8367, "num_input_tokens_seen": 166260736, "step": 962 }, { "epoch": 0.3844924060751399, "loss": 1.0870518684387207, "loss_ce": 0.007217873819172382, "loss_xval": 1.078125, "num_input_tokens_seen": 166260736, "step": 962 }, { "epoch": 0.38489208633093525, "grad_norm": 91.56652258008455, "learning_rate": 5e-06, "loss": 0.6936, "num_input_tokens_seen": 166433480, "step": 963 }, { "epoch": 0.38489208633093525, "loss": 0.757168710231781, "loss_ce": 0.005947999190539122, "loss_xval": 0.75, "num_input_tokens_seen": 166433480, "step": 963 }, { "epoch": 0.3852917665867306, "grad_norm": 112.22507236008505, "learning_rate": 5e-06, "loss": 0.4131, "num_input_tokens_seen": 166606920, "step": 964 }, { "epoch": 0.3852917665867306, "loss": 0.2249833643436432, "loss_ce": 0.007179419510066509, "loss_xval": 0.2177734375, "num_input_tokens_seen": 166606920, "step": 964 }, { "epoch": 0.385691446842526, "grad_norm": 75.91188836670992, "learning_rate": 5e-06, "loss": 0.4977, "num_input_tokens_seen": 166779976, "step": 965 }, { "epoch": 0.385691446842526, "loss": 0.3778020143508911, "loss_ce": 0.007257565855979919, "loss_xval": 0.37109375, "num_input_tokens_seen": 166779976, "step": 965 }, { "epoch": 0.38609112709832133, "grad_norm": 72.22940290020605, "learning_rate": 5e-06, "loss": 0.583, "num_input_tokens_seen": 166953200, "step": 966 }, { "epoch": 0.38609112709832133, "loss": 0.6213172674179077, "loss_ce": 0.011698130518198013, "loss_xval": 0.609375, "num_input_tokens_seen": 166953200, "step": 966 }, { "epoch": 0.3864908073541167, "grad_norm": 40.4774269583089, "learning_rate": 5e-06, "loss": 0.8808, "num_input_tokens_seen": 167126160, "step": 967 }, { "epoch": 0.3864908073541167, "loss": 1.3061549663543701, "loss_ce": 0.008913781493902206, "loss_xval": 1.296875, "num_input_tokens_seen": 167126160, "step": 967 }, { "epoch": 0.38689048760991207, "grad_norm": 77.76546008195128, "learning_rate": 5e-06, "loss": 0.6034, "num_input_tokens_seen": 167299016, "step": 968 }, { "epoch": 0.38689048760991207, "loss": 0.4178801476955414, "loss_ce": 0.005404568277299404, "loss_xval": 0.412109375, "num_input_tokens_seen": 167299016, "step": 968 }, { "epoch": 0.3872901678657074, "grad_norm": 50.08422450179112, "learning_rate": 5e-06, "loss": 0.8172, "num_input_tokens_seen": 167472080, "step": 969 }, { "epoch": 0.3872901678657074, "loss": 0.6857173442840576, "loss_ce": 0.008837435394525528, "loss_xval": 0.67578125, "num_input_tokens_seen": 167472080, "step": 969 }, { "epoch": 0.3876898481215028, "grad_norm": 115.02853547549552, "learning_rate": 5e-06, "loss": 1.2746, "num_input_tokens_seen": 167645184, "step": 970 }, { "epoch": 0.3876898481215028, "loss": 0.7067731618881226, "loss_ce": 0.007554412819445133, "loss_xval": 0.69921875, "num_input_tokens_seen": 167645184, "step": 970 }, { "epoch": 0.38808952837729815, "grad_norm": 114.25307598804022, "learning_rate": 5e-06, "loss": 0.6847, "num_input_tokens_seen": 167817888, "step": 971 }, { "epoch": 0.38808952837729815, "loss": 0.8140785694122314, "loss_ce": 0.006705489940941334, "loss_xval": 0.80859375, "num_input_tokens_seen": 167817888, "step": 971 }, { "epoch": 0.38848920863309355, "grad_norm": 58.200766823999736, "learning_rate": 5e-06, "loss": 0.8283, "num_input_tokens_seen": 167990920, "step": 972 }, { "epoch": 0.38848920863309355, "loss": 0.551996111869812, "loss_ce": 0.007928753271698952, "loss_xval": 0.54296875, "num_input_tokens_seen": 167990920, "step": 972 }, { "epoch": 0.3888888888888889, "grad_norm": 58.649617728544364, "learning_rate": 5e-06, "loss": 0.4376, "num_input_tokens_seen": 168163488, "step": 973 }, { "epoch": 0.3888888888888889, "loss": 0.2681065797805786, "loss_ce": 0.0070592425763607025, "loss_xval": 0.26171875, "num_input_tokens_seen": 168163488, "step": 973 }, { "epoch": 0.38928856914468424, "grad_norm": 80.19293169274987, "learning_rate": 5e-06, "loss": 0.6363, "num_input_tokens_seen": 168336336, "step": 974 }, { "epoch": 0.38928856914468424, "loss": 0.7013700008392334, "loss_ce": 0.006606808863580227, "loss_xval": 0.6953125, "num_input_tokens_seen": 168336336, "step": 974 }, { "epoch": 0.38968824940047964, "grad_norm": 89.07859540994681, "learning_rate": 5e-06, "loss": 0.4365, "num_input_tokens_seen": 168509680, "step": 975 }, { "epoch": 0.38968824940047964, "loss": 0.40148645639419556, "loss_ce": 0.006741571240127087, "loss_xval": 0.39453125, "num_input_tokens_seen": 168509680, "step": 975 }, { "epoch": 0.390087929656275, "grad_norm": 63.38435699178039, "learning_rate": 5e-06, "loss": 0.6486, "num_input_tokens_seen": 168682024, "step": 976 }, { "epoch": 0.390087929656275, "loss": 0.48744258284568787, "loss_ce": 0.00575310830026865, "loss_xval": 0.482421875, "num_input_tokens_seen": 168682024, "step": 976 }, { "epoch": 0.3904876099120703, "grad_norm": 37.56895127097929, "learning_rate": 5e-06, "loss": 0.8508, "num_input_tokens_seen": 168855064, "step": 977 }, { "epoch": 0.3904876099120703, "loss": 0.7805444002151489, "loss_ce": 0.010647003538906574, "loss_xval": 0.76953125, "num_input_tokens_seen": 168855064, "step": 977 }, { "epoch": 0.3908872901678657, "grad_norm": 38.66001191702328, "learning_rate": 5e-06, "loss": 0.6123, "num_input_tokens_seen": 169028096, "step": 978 }, { "epoch": 0.3908872901678657, "loss": 0.5990478992462158, "loss_ce": 0.009570390917360783, "loss_xval": 0.58984375, "num_input_tokens_seen": 169028096, "step": 978 }, { "epoch": 0.39128697042366106, "grad_norm": 43.557533168818516, "learning_rate": 5e-06, "loss": 0.6573, "num_input_tokens_seen": 169201024, "step": 979 }, { "epoch": 0.39128697042366106, "loss": 0.6643787026405334, "loss_ce": 0.005992514081299305, "loss_xval": 0.66015625, "num_input_tokens_seen": 169201024, "step": 979 }, { "epoch": 0.39168665067945646, "grad_norm": 57.037574608846434, "learning_rate": 5e-06, "loss": 0.553, "num_input_tokens_seen": 169372448, "step": 980 }, { "epoch": 0.39168665067945646, "loss": 0.7914978265762329, "loss_ce": 0.015252649784088135, "loss_xval": 0.77734375, "num_input_tokens_seen": 169372448, "step": 980 }, { "epoch": 0.3920863309352518, "grad_norm": 26.713226016677673, "learning_rate": 5e-06, "loss": 0.4622, "num_input_tokens_seen": 169545336, "step": 981 }, { "epoch": 0.3920863309352518, "loss": 0.5592265725135803, "loss_ce": 0.005027370527386665, "loss_xval": 0.5546875, "num_input_tokens_seen": 169545336, "step": 981 }, { "epoch": 0.39248601119104715, "grad_norm": 30.376410347757318, "learning_rate": 5e-06, "loss": 0.4451, "num_input_tokens_seen": 169718480, "step": 982 }, { "epoch": 0.39248601119104715, "loss": 0.326797217130661, "loss_ce": 0.00709507055580616, "loss_xval": 0.3203125, "num_input_tokens_seen": 169718480, "step": 982 }, { "epoch": 0.39288569144684254, "grad_norm": 42.70059848914429, "learning_rate": 5e-06, "loss": 0.65, "num_input_tokens_seen": 169891376, "step": 983 }, { "epoch": 0.39288569144684254, "loss": 0.6005296111106873, "loss_ce": 0.004796041641384363, "loss_xval": 0.59765625, "num_input_tokens_seen": 169891376, "step": 983 }, { "epoch": 0.3932853717026379, "grad_norm": 61.85368887460458, "learning_rate": 5e-06, "loss": 0.7171, "num_input_tokens_seen": 170064280, "step": 984 }, { "epoch": 0.3932853717026379, "loss": 0.3366258144378662, "loss_ce": 0.0034348834306001663, "loss_xval": 0.333984375, "num_input_tokens_seen": 170064280, "step": 984 }, { "epoch": 0.39368505195843323, "grad_norm": 27.359901358980956, "learning_rate": 5e-06, "loss": 0.5081, "num_input_tokens_seen": 170237664, "step": 985 }, { "epoch": 0.39368505195843323, "loss": 0.4442784786224365, "loss_ce": 0.0073583247140049934, "loss_xval": 0.4375, "num_input_tokens_seen": 170237664, "step": 985 }, { "epoch": 0.39408473221422863, "grad_norm": 161.64024654863002, "learning_rate": 5e-06, "loss": 0.8349, "num_input_tokens_seen": 170411016, "step": 986 }, { "epoch": 0.39408473221422863, "loss": 0.765992283821106, "loss_ce": 0.005799395032227039, "loss_xval": 0.76171875, "num_input_tokens_seen": 170411016, "step": 986 }, { "epoch": 0.39448441247002397, "grad_norm": 77.11201946826561, "learning_rate": 5e-06, "loss": 1.1603, "num_input_tokens_seen": 170583888, "step": 987 }, { "epoch": 0.39448441247002397, "loss": 1.365664005279541, "loss_ce": 0.007936842739582062, "loss_xval": 1.359375, "num_input_tokens_seen": 170583888, "step": 987 }, { "epoch": 0.39488409272581937, "grad_norm": 126.12675711597637, "learning_rate": 5e-06, "loss": 0.9069, "num_input_tokens_seen": 170756584, "step": 988 }, { "epoch": 0.39488409272581937, "loss": 0.7058815956115723, "loss_ce": 0.008860129863023758, "loss_xval": 0.6953125, "num_input_tokens_seen": 170756584, "step": 988 }, { "epoch": 0.3952837729816147, "grad_norm": 176.69341122468677, "learning_rate": 5e-06, "loss": 0.8091, "num_input_tokens_seen": 170929272, "step": 989 }, { "epoch": 0.3952837729816147, "loss": 0.7472469806671143, "loss_ce": 0.0038387635722756386, "loss_xval": 0.7421875, "num_input_tokens_seen": 170929272, "step": 989 }, { "epoch": 0.39568345323741005, "grad_norm": 100.80639940703247, "learning_rate": 5e-06, "loss": 0.4486, "num_input_tokens_seen": 171102064, "step": 990 }, { "epoch": 0.39568345323741005, "loss": 0.34538111090660095, "loss_ce": 0.00645289896056056, "loss_xval": 0.33984375, "num_input_tokens_seen": 171102064, "step": 990 }, { "epoch": 0.39608313349320545, "grad_norm": 127.70680274006587, "learning_rate": 5e-06, "loss": 0.4639, "num_input_tokens_seen": 171275256, "step": 991 }, { "epoch": 0.39608313349320545, "loss": 0.6334390044212341, "loss_ce": 0.006730004213750362, "loss_xval": 0.625, "num_input_tokens_seen": 171275256, "step": 991 }, { "epoch": 0.3964828137490008, "grad_norm": 98.65398770343047, "learning_rate": 5e-06, "loss": 0.7308, "num_input_tokens_seen": 171448104, "step": 992 }, { "epoch": 0.3964828137490008, "loss": 0.7053290009498596, "loss_ce": 0.009894400835037231, "loss_xval": 0.6953125, "num_input_tokens_seen": 171448104, "step": 992 }, { "epoch": 0.39688249400479614, "grad_norm": 104.68550005705072, "learning_rate": 5e-06, "loss": 0.2795, "num_input_tokens_seen": 171621304, "step": 993 }, { "epoch": 0.39688249400479614, "loss": 0.2923963665962219, "loss_ce": 0.005653205327689648, "loss_xval": 0.287109375, "num_input_tokens_seen": 171621304, "step": 993 }, { "epoch": 0.39728217426059154, "grad_norm": 151.12207326320515, "learning_rate": 5e-06, "loss": 0.8025, "num_input_tokens_seen": 171794016, "step": 994 }, { "epoch": 0.39728217426059154, "loss": 0.7504175901412964, "loss_ce": 0.005300438497215509, "loss_xval": 0.74609375, "num_input_tokens_seen": 171794016, "step": 994 }, { "epoch": 0.3976818545163869, "grad_norm": 87.55290667068692, "learning_rate": 5e-06, "loss": 0.5974, "num_input_tokens_seen": 171966768, "step": 995 }, { "epoch": 0.3976818545163869, "loss": 0.3750014305114746, "loss_ce": 0.005433551035821438, "loss_xval": 0.369140625, "num_input_tokens_seen": 171966768, "step": 995 }, { "epoch": 0.3980815347721823, "grad_norm": 52.76431280420055, "learning_rate": 5e-06, "loss": 0.5791, "num_input_tokens_seen": 172139856, "step": 996 }, { "epoch": 0.3980815347721823, "loss": 0.5788711309432983, "loss_ce": 0.009596217423677444, "loss_xval": 0.5703125, "num_input_tokens_seen": 172139856, "step": 996 }, { "epoch": 0.3984812150279776, "grad_norm": 61.69947757143887, "learning_rate": 5e-06, "loss": 0.5662, "num_input_tokens_seen": 172312544, "step": 997 }, { "epoch": 0.3984812150279776, "loss": 0.50725257396698, "loss_ce": 0.010182302445173264, "loss_xval": 0.49609375, "num_input_tokens_seen": 172312544, "step": 997 }, { "epoch": 0.39888089528377296, "grad_norm": 34.32354136252659, "learning_rate": 5e-06, "loss": 0.8179, "num_input_tokens_seen": 172485560, "step": 998 }, { "epoch": 0.39888089528377296, "loss": 0.9845725893974304, "loss_ce": 0.023391013965010643, "loss_xval": 0.9609375, "num_input_tokens_seen": 172485560, "step": 998 }, { "epoch": 0.39928057553956836, "grad_norm": 84.83106537615475, "learning_rate": 5e-06, "loss": 0.5806, "num_input_tokens_seen": 172658848, "step": 999 }, { "epoch": 0.39928057553956836, "loss": 0.8320725560188293, "loss_ce": 0.005900641903281212, "loss_xval": 0.828125, "num_input_tokens_seen": 172658848, "step": 999 }, { "epoch": 0.3996802557953637, "grad_norm": 80.35046570322501, "learning_rate": 5e-06, "loss": 0.603, "num_input_tokens_seen": 172831616, "step": 1000 }, { "epoch": 0.3996802557953637, "eval_websight_new_IoU": 0.3092806488275528, "eval_websight_new_MAE_all": 0.024964885786175728, "eval_websight_new_MAE_h": 0.009109157603234053, "eval_websight_new_MAE_w": 0.04664035141468048, "eval_websight_new_MAE_x": 0.025324680842459202, "eval_websight_new_MAE_y": 0.018785354681313038, "eval_websight_new_NUM_probability": 0.9444170296192169, "eval_websight_new_inside_bbox": 0.6996527910232544, "eval_websight_new_loss": 0.33494770526885986, "eval_websight_new_loss_ce": 0.006523952353745699, "eval_websight_new_loss_xval": 0.26861572265625, "eval_websight_new_runtime": 56.6826, "eval_websight_new_samples_per_second": 0.882, "eval_websight_new_steps_per_second": 0.035, "num_input_tokens_seen": 172831616, "step": 1000 }, { "epoch": 0.3996802557953637, "eval_seeclick_IoU": 0.23224642127752304, "eval_seeclick_MAE_all": 0.07489410787820816, "eval_seeclick_MAE_h": 0.02226562239229679, "eval_seeclick_MAE_w": 0.11477012187242508, "eval_seeclick_MAE_x": 0.0983852706849575, "eval_seeclick_MAE_y": 0.06415541097521782, "eval_seeclick_NUM_probability": 0.9417648315429688, "eval_seeclick_inside_bbox": 0.4444444477558136, "eval_seeclick_loss": 1.5326517820358276, "eval_seeclick_loss_ce": 0.020226879976689816, "eval_seeclick_loss_xval": 1.391357421875, "eval_seeclick_runtime": 84.8257, "eval_seeclick_samples_per_second": 0.589, "eval_seeclick_steps_per_second": 0.024, "num_input_tokens_seen": 172831616, "step": 1000 }, { "epoch": 0.3996802557953637, "eval_icons_IoU": 0.061911119148135185, "eval_icons_MAE_all": 0.028313827700912952, "eval_icons_MAE_h": 0.006960721453651786, "eval_icons_MAE_w": 0.008420140482485294, "eval_icons_MAE_x": 0.05678635463118553, "eval_icons_MAE_y": 0.04108810052275658, "eval_icons_NUM_probability": 0.9464539885520935, "eval_icons_inside_bbox": 0.09027777798473835, "eval_icons_loss": 0.38697123527526855, "eval_icons_loss_ce": 0.006282810820266604, "eval_icons_loss_xval": 0.310516357421875, "eval_icons_runtime": 83.1499, "eval_icons_samples_per_second": 0.601, "eval_icons_steps_per_second": 0.024, "num_input_tokens_seen": 172831616, "step": 1000 }, { "epoch": 0.3996802557953637, "loss": 0.23844069242477417, "loss_ce": 0.00641553895547986, "loss_xval": 0.232421875, "num_input_tokens_seen": 172831616, "step": 1000 }, { "epoch": 0.40007993605115905, "grad_norm": 26.22909617587164, "learning_rate": 5e-06, "loss": 0.5293, "num_input_tokens_seen": 173004832, "step": 1001 }, { "epoch": 0.40007993605115905, "loss": 0.5112817287445068, "loss_ce": 0.008443554863333702, "loss_xval": 0.50390625, "num_input_tokens_seen": 173004832, "step": 1001 }, { "epoch": 0.40047961630695444, "grad_norm": 37.57565610535053, "learning_rate": 5e-06, "loss": 0.6173, "num_input_tokens_seen": 173178376, "step": 1002 }, { "epoch": 0.40047961630695444, "loss": 0.6873658299446106, "loss_ce": 0.0071595776826143265, "loss_xval": 0.6796875, "num_input_tokens_seen": 173178376, "step": 1002 }, { "epoch": 0.4008792965627498, "grad_norm": 52.448242126332076, "learning_rate": 5e-06, "loss": 0.5896, "num_input_tokens_seen": 173351128, "step": 1003 }, { "epoch": 0.4008792965627498, "loss": 0.6299257278442383, "loss_ce": 0.007367200218141079, "loss_xval": 0.62109375, "num_input_tokens_seen": 173351128, "step": 1003 }, { "epoch": 0.4012789768185452, "grad_norm": 59.20638229901705, "learning_rate": 5e-06, "loss": 0.5644, "num_input_tokens_seen": 173524176, "step": 1004 }, { "epoch": 0.4012789768185452, "loss": 0.7194583415985107, "loss_ce": 0.006445643957704306, "loss_xval": 0.71484375, "num_input_tokens_seen": 173524176, "step": 1004 }, { "epoch": 0.40167865707434053, "grad_norm": 36.58616370768613, "learning_rate": 5e-06, "loss": 0.9425, "num_input_tokens_seen": 173696832, "step": 1005 }, { "epoch": 0.40167865707434053, "loss": 1.347395896911621, "loss_ce": 0.0066977087408304214, "loss_xval": 1.34375, "num_input_tokens_seen": 173696832, "step": 1005 }, { "epoch": 0.40207833733013587, "grad_norm": 108.83892770229245, "learning_rate": 5e-06, "loss": 0.9874, "num_input_tokens_seen": 173869728, "step": 1006 }, { "epoch": 0.40207833733013587, "loss": 0.9585855007171631, "loss_ce": 0.004667055793106556, "loss_xval": 0.953125, "num_input_tokens_seen": 173869728, "step": 1006 }, { "epoch": 0.40247801758593127, "grad_norm": 29.57215095837664, "learning_rate": 5e-06, "loss": 0.6105, "num_input_tokens_seen": 174042616, "step": 1007 }, { "epoch": 0.40247801758593127, "loss": 0.6520742177963257, "loss_ce": 0.006871582940220833, "loss_xval": 0.64453125, "num_input_tokens_seen": 174042616, "step": 1007 }, { "epoch": 0.4028776978417266, "grad_norm": 153.84706487430793, "learning_rate": 5e-06, "loss": 0.657, "num_input_tokens_seen": 174215280, "step": 1008 }, { "epoch": 0.4028776978417266, "loss": 0.7333929538726807, "loss_ce": 0.012934006750583649, "loss_xval": 0.71875, "num_input_tokens_seen": 174215280, "step": 1008 }, { "epoch": 0.403277378097522, "grad_norm": 114.61737036027186, "learning_rate": 5e-06, "loss": 0.7579, "num_input_tokens_seen": 174388264, "step": 1009 }, { "epoch": 0.403277378097522, "loss": 0.525598406791687, "loss_ce": 0.006250268779695034, "loss_xval": 0.51953125, "num_input_tokens_seen": 174388264, "step": 1009 }, { "epoch": 0.40367705835331735, "grad_norm": 124.58554146445353, "learning_rate": 5e-06, "loss": 0.5623, "num_input_tokens_seen": 174561424, "step": 1010 }, { "epoch": 0.40367705835331735, "loss": 0.7112394571304321, "loss_ce": 0.012814194895327091, "loss_xval": 0.69921875, "num_input_tokens_seen": 174561424, "step": 1010 }, { "epoch": 0.4040767386091127, "grad_norm": 97.38895828742109, "learning_rate": 5e-06, "loss": 0.3995, "num_input_tokens_seen": 174734472, "step": 1011 }, { "epoch": 0.4040767386091127, "loss": 0.4527726471424103, "loss_ce": 0.0054460205137729645, "loss_xval": 0.447265625, "num_input_tokens_seen": 174734472, "step": 1011 }, { "epoch": 0.4044764188649081, "grad_norm": 39.357984950176494, "learning_rate": 5e-06, "loss": 0.6717, "num_input_tokens_seen": 174907320, "step": 1012 }, { "epoch": 0.4044764188649081, "loss": 0.7403974533081055, "loss_ce": 0.009654035791754723, "loss_xval": 0.73046875, "num_input_tokens_seen": 174907320, "step": 1012 }, { "epoch": 0.40487609912070344, "grad_norm": 43.49113942292695, "learning_rate": 5e-06, "loss": 0.648, "num_input_tokens_seen": 175080416, "step": 1013 }, { "epoch": 0.40487609912070344, "loss": 0.5626762509346008, "loss_ce": 0.009941885247826576, "loss_xval": 0.5546875, "num_input_tokens_seen": 175080416, "step": 1013 }, { "epoch": 0.4052757793764988, "grad_norm": 134.41500315087782, "learning_rate": 5e-06, "loss": 0.9411, "num_input_tokens_seen": 175252992, "step": 1014 }, { "epoch": 0.4052757793764988, "loss": 0.6051626801490784, "loss_ce": 0.004698799457401037, "loss_xval": 0.6015625, "num_input_tokens_seen": 175252992, "step": 1014 }, { "epoch": 0.4056754596322942, "grad_norm": 91.34613154118188, "learning_rate": 5e-06, "loss": 0.4441, "num_input_tokens_seen": 175425760, "step": 1015 }, { "epoch": 0.4056754596322942, "loss": 0.5284594297409058, "loss_ce": 0.006120562553405762, "loss_xval": 0.5234375, "num_input_tokens_seen": 175425760, "step": 1015 }, { "epoch": 0.4060751398880895, "grad_norm": 62.797149514317915, "learning_rate": 5e-06, "loss": 0.9974, "num_input_tokens_seen": 175598784, "step": 1016 }, { "epoch": 0.4060751398880895, "loss": 0.6914917826652527, "loss_ce": 0.0069824811071157455, "loss_xval": 0.68359375, "num_input_tokens_seen": 175598784, "step": 1016 }, { "epoch": 0.4064748201438849, "grad_norm": 68.01439644089561, "learning_rate": 5e-06, "loss": 0.6034, "num_input_tokens_seen": 175771752, "step": 1017 }, { "epoch": 0.4064748201438849, "loss": 0.7401469945907593, "loss_ce": 0.005466855131089687, "loss_xval": 0.734375, "num_input_tokens_seen": 175771752, "step": 1017 }, { "epoch": 0.40687450039968026, "grad_norm": 33.194237726138894, "learning_rate": 5e-06, "loss": 0.4104, "num_input_tokens_seen": 175944448, "step": 1018 }, { "epoch": 0.40687450039968026, "loss": 0.4032331705093384, "loss_ce": 0.004368394613265991, "loss_xval": 0.3984375, "num_input_tokens_seen": 175944448, "step": 1018 }, { "epoch": 0.4072741806554756, "grad_norm": 32.31623305090731, "learning_rate": 5e-06, "loss": 0.5535, "num_input_tokens_seen": 176117224, "step": 1019 }, { "epoch": 0.4072741806554756, "loss": 0.6844021081924438, "loss_ce": 0.004104219377040863, "loss_xval": 0.6796875, "num_input_tokens_seen": 176117224, "step": 1019 }, { "epoch": 0.407673860911271, "grad_norm": 69.88987283607364, "learning_rate": 5e-06, "loss": 0.5382, "num_input_tokens_seen": 176290288, "step": 1020 }, { "epoch": 0.407673860911271, "loss": 0.6937678456306458, "loss_ce": 0.006511982996016741, "loss_xval": 0.6875, "num_input_tokens_seen": 176290288, "step": 1020 }, { "epoch": 0.40807354116706634, "grad_norm": 49.12054277220987, "learning_rate": 5e-06, "loss": 0.7288, "num_input_tokens_seen": 176463360, "step": 1021 }, { "epoch": 0.40807354116706634, "loss": 0.8968067169189453, "loss_ce": 0.005205155350267887, "loss_xval": 0.890625, "num_input_tokens_seen": 176463360, "step": 1021 }, { "epoch": 0.4084732214228617, "grad_norm": 96.62921131568778, "learning_rate": 5e-06, "loss": 0.8582, "num_input_tokens_seen": 176636272, "step": 1022 }, { "epoch": 0.4084732214228617, "loss": 0.6188912987709045, "loss_ce": 0.004145242273807526, "loss_xval": 0.61328125, "num_input_tokens_seen": 176636272, "step": 1022 }, { "epoch": 0.4088729016786571, "grad_norm": 104.19761497471133, "learning_rate": 5e-06, "loss": 0.5969, "num_input_tokens_seen": 176809416, "step": 1023 }, { "epoch": 0.4088729016786571, "loss": 0.7266084551811218, "loss_ce": 0.004013280384242535, "loss_xval": 0.72265625, "num_input_tokens_seen": 176809416, "step": 1023 }, { "epoch": 0.40927258193445243, "grad_norm": 25.052045025077565, "learning_rate": 5e-06, "loss": 0.652, "num_input_tokens_seen": 176982512, "step": 1024 }, { "epoch": 0.40927258193445243, "loss": 0.4747720956802368, "loss_ce": 0.0037638223730027676, "loss_xval": 0.470703125, "num_input_tokens_seen": 176982512, "step": 1024 }, { "epoch": 0.4096722621902478, "grad_norm": 55.184447426888354, "learning_rate": 5e-06, "loss": 0.909, "num_input_tokens_seen": 177155544, "step": 1025 }, { "epoch": 0.4096722621902478, "loss": 1.211665153503418, "loss_ce": 0.008418156765401363, "loss_xval": 1.203125, "num_input_tokens_seen": 177155544, "step": 1025 }, { "epoch": 0.41007194244604317, "grad_norm": 51.43456903101215, "learning_rate": 5e-06, "loss": 0.4022, "num_input_tokens_seen": 177328144, "step": 1026 }, { "epoch": 0.41007194244604317, "loss": 0.3644227683544159, "loss_ce": 0.003643968142569065, "loss_xval": 0.361328125, "num_input_tokens_seen": 177328144, "step": 1026 }, { "epoch": 0.4104716227018385, "grad_norm": 38.349924787831824, "learning_rate": 5e-06, "loss": 0.6099, "num_input_tokens_seen": 177500576, "step": 1027 }, { "epoch": 0.4104716227018385, "loss": 0.46429070830345154, "loss_ce": 0.005428393371403217, "loss_xval": 0.458984375, "num_input_tokens_seen": 177500576, "step": 1027 }, { "epoch": 0.4108713029576339, "grad_norm": 81.99938062743149, "learning_rate": 5e-06, "loss": 0.4963, "num_input_tokens_seen": 177673800, "step": 1028 }, { "epoch": 0.4108713029576339, "loss": 0.27311575412750244, "loss_ce": 0.0070024700835347176, "loss_xval": 0.265625, "num_input_tokens_seen": 177673800, "step": 1028 }, { "epoch": 0.41127098321342925, "grad_norm": 60.9113505265921, "learning_rate": 5e-06, "loss": 0.6987, "num_input_tokens_seen": 177846720, "step": 1029 }, { "epoch": 0.41127098321342925, "loss": 0.8026575446128845, "loss_ce": 0.0034326824825257063, "loss_xval": 0.80078125, "num_input_tokens_seen": 177846720, "step": 1029 }, { "epoch": 0.4116706634692246, "grad_norm": 134.37290555496102, "learning_rate": 5e-06, "loss": 0.7379, "num_input_tokens_seen": 178019584, "step": 1030 }, { "epoch": 0.4116706634692246, "loss": 0.6104703545570374, "loss_ce": 0.006893688812851906, "loss_xval": 0.60546875, "num_input_tokens_seen": 178019584, "step": 1030 }, { "epoch": 0.41207034372502, "grad_norm": 116.92587445948755, "learning_rate": 5e-06, "loss": 0.5363, "num_input_tokens_seen": 178192928, "step": 1031 }, { "epoch": 0.41207034372502, "loss": 0.3673360347747803, "loss_ce": 0.002956158248707652, "loss_xval": 0.365234375, "num_input_tokens_seen": 178192928, "step": 1031 }, { "epoch": 0.41247002398081534, "grad_norm": 75.81443523599133, "learning_rate": 5e-06, "loss": 0.5552, "num_input_tokens_seen": 178365992, "step": 1032 }, { "epoch": 0.41247002398081534, "loss": 0.5825966596603394, "loss_ce": 0.00911034271121025, "loss_xval": 0.57421875, "num_input_tokens_seen": 178365992, "step": 1032 }, { "epoch": 0.41286970423661074, "grad_norm": 153.80297019873962, "learning_rate": 5e-06, "loss": 0.6214, "num_input_tokens_seen": 178539008, "step": 1033 }, { "epoch": 0.41286970423661074, "loss": 0.5666282773017883, "loss_ce": 0.007546260487288237, "loss_xval": 0.55859375, "num_input_tokens_seen": 178539008, "step": 1033 }, { "epoch": 0.4132693844924061, "grad_norm": 49.204697005620176, "learning_rate": 5e-06, "loss": 0.9595, "num_input_tokens_seen": 178711328, "step": 1034 }, { "epoch": 0.4132693844924061, "loss": 0.6298288106918335, "loss_ce": 0.01019988302141428, "loss_xval": 0.62109375, "num_input_tokens_seen": 178711328, "step": 1034 }, { "epoch": 0.4136690647482014, "grad_norm": 76.08084598570717, "learning_rate": 5e-06, "loss": 0.4532, "num_input_tokens_seen": 178884104, "step": 1035 }, { "epoch": 0.4136690647482014, "loss": 0.4612717032432556, "loss_ce": 0.012480195611715317, "loss_xval": 0.44921875, "num_input_tokens_seen": 178884104, "step": 1035 }, { "epoch": 0.4140687450039968, "grad_norm": 34.52904789226501, "learning_rate": 5e-06, "loss": 0.4644, "num_input_tokens_seen": 179057048, "step": 1036 }, { "epoch": 0.4140687450039968, "loss": 0.6084589958190918, "loss_ce": 0.006713386625051498, "loss_xval": 0.6015625, "num_input_tokens_seen": 179057048, "step": 1036 }, { "epoch": 0.41446842525979216, "grad_norm": 107.83231397269536, "learning_rate": 5e-06, "loss": 0.4973, "num_input_tokens_seen": 179230376, "step": 1037 }, { "epoch": 0.41446842525979216, "loss": 0.3456187844276428, "loss_ce": 0.005652973428368568, "loss_xval": 0.33984375, "num_input_tokens_seen": 179230376, "step": 1037 }, { "epoch": 0.4148681055155875, "grad_norm": 68.86144066104148, "learning_rate": 5e-06, "loss": 0.6048, "num_input_tokens_seen": 179403320, "step": 1038 }, { "epoch": 0.4148681055155875, "loss": 0.9780701398849487, "loss_ce": 0.040020860731601715, "loss_xval": 0.9375, "num_input_tokens_seen": 179403320, "step": 1038 }, { "epoch": 0.4152677857713829, "grad_norm": 29.2133368946288, "learning_rate": 5e-06, "loss": 0.64, "num_input_tokens_seen": 179576152, "step": 1039 }, { "epoch": 0.4152677857713829, "loss": 0.7152938842773438, "loss_ce": 0.005577098578214645, "loss_xval": 0.7109375, "num_input_tokens_seen": 179576152, "step": 1039 }, { "epoch": 0.41566746602717825, "grad_norm": 65.40652651781713, "learning_rate": 5e-06, "loss": 0.6278, "num_input_tokens_seen": 179748936, "step": 1040 }, { "epoch": 0.41566746602717825, "loss": 0.7276248931884766, "loss_ce": 0.004358314909040928, "loss_xval": 0.72265625, "num_input_tokens_seen": 179748936, "step": 1040 }, { "epoch": 0.41606714628297364, "grad_norm": 118.90984830416035, "learning_rate": 5e-06, "loss": 0.4819, "num_input_tokens_seen": 179922080, "step": 1041 }, { "epoch": 0.41606714628297364, "loss": 0.38193365931510925, "loss_ce": 0.013891654089093208, "loss_xval": 0.3671875, "num_input_tokens_seen": 179922080, "step": 1041 }, { "epoch": 0.416466826538769, "grad_norm": 56.73656107069961, "learning_rate": 5e-06, "loss": 0.5628, "num_input_tokens_seen": 180094792, "step": 1042 }, { "epoch": 0.416466826538769, "loss": 0.624241292476654, "loss_ce": 0.005711013451218605, "loss_xval": 0.6171875, "num_input_tokens_seen": 180094792, "step": 1042 }, { "epoch": 0.41686650679456433, "grad_norm": 161.1185485615042, "learning_rate": 5e-06, "loss": 0.5426, "num_input_tokens_seen": 180267832, "step": 1043 }, { "epoch": 0.41686650679456433, "loss": 0.41229158639907837, "loss_ce": 0.003966381307691336, "loss_xval": 0.408203125, "num_input_tokens_seen": 180267832, "step": 1043 }, { "epoch": 0.4172661870503597, "grad_norm": 78.0546420015716, "learning_rate": 5e-06, "loss": 0.4282, "num_input_tokens_seen": 180440816, "step": 1044 }, { "epoch": 0.4172661870503597, "loss": 0.4995594322681427, "loss_ce": 0.020921722054481506, "loss_xval": 0.478515625, "num_input_tokens_seen": 180440816, "step": 1044 }, { "epoch": 0.41766586730615507, "grad_norm": 115.85859991185649, "learning_rate": 5e-06, "loss": 0.7434, "num_input_tokens_seen": 180613760, "step": 1045 }, { "epoch": 0.41766586730615507, "loss": 0.4128772020339966, "loss_ce": 0.005528563167899847, "loss_xval": 0.408203125, "num_input_tokens_seen": 180613760, "step": 1045 }, { "epoch": 0.4180655475619504, "grad_norm": 122.32619412285479, "learning_rate": 5e-06, "loss": 0.7231, "num_input_tokens_seen": 180786392, "step": 1046 }, { "epoch": 0.4180655475619504, "loss": 0.42909038066864014, "loss_ce": 0.00416360329836607, "loss_xval": 0.42578125, "num_input_tokens_seen": 180786392, "step": 1046 }, { "epoch": 0.4184652278177458, "grad_norm": 74.33985091881948, "learning_rate": 5e-06, "loss": 0.6912, "num_input_tokens_seen": 180959520, "step": 1047 }, { "epoch": 0.4184652278177458, "loss": 0.4434017837047577, "loss_ce": 0.006145905703306198, "loss_xval": 0.4375, "num_input_tokens_seen": 180959520, "step": 1047 }, { "epoch": 0.41886490807354115, "grad_norm": 52.273984052094164, "learning_rate": 5e-06, "loss": 0.7557, "num_input_tokens_seen": 181132584, "step": 1048 }, { "epoch": 0.41886490807354115, "loss": 0.7552452087402344, "loss_ce": 0.006557449232786894, "loss_xval": 0.75, "num_input_tokens_seen": 181132584, "step": 1048 }, { "epoch": 0.41926458832933655, "grad_norm": 115.38168323853732, "learning_rate": 5e-06, "loss": 0.5297, "num_input_tokens_seen": 181305200, "step": 1049 }, { "epoch": 0.41926458832933655, "loss": 0.35102906823158264, "loss_ce": 0.04145876318216324, "loss_xval": 0.30859375, "num_input_tokens_seen": 181305200, "step": 1049 }, { "epoch": 0.4196642685851319, "grad_norm": 129.43468680045206, "learning_rate": 5e-06, "loss": 0.7044, "num_input_tokens_seen": 181477832, "step": 1050 }, { "epoch": 0.4196642685851319, "loss": 0.7682023048400879, "loss_ce": 0.005873220041394234, "loss_xval": 0.76171875, "num_input_tokens_seen": 181477832, "step": 1050 }, { "epoch": 0.42006394884092724, "grad_norm": 84.3901351931311, "learning_rate": 5e-06, "loss": 0.5263, "num_input_tokens_seen": 181650856, "step": 1051 }, { "epoch": 0.42006394884092724, "loss": 0.20175260305404663, "loss_ce": 0.005524573847651482, "loss_xval": 0.1962890625, "num_input_tokens_seen": 181650856, "step": 1051 }, { "epoch": 0.42046362909672264, "grad_norm": 105.05979567930446, "learning_rate": 5e-06, "loss": 0.6799, "num_input_tokens_seen": 181823832, "step": 1052 }, { "epoch": 0.42046362909672264, "loss": 0.5329502820968628, "loss_ce": 0.007681742776185274, "loss_xval": 0.5234375, "num_input_tokens_seen": 181823832, "step": 1052 }, { "epoch": 0.420863309352518, "grad_norm": 141.7226731667635, "learning_rate": 5e-06, "loss": 0.6034, "num_input_tokens_seen": 181997064, "step": 1053 }, { "epoch": 0.420863309352518, "loss": 0.8590031862258911, "loss_ce": 0.007928947918117046, "loss_xval": 0.8515625, "num_input_tokens_seen": 181997064, "step": 1053 }, { "epoch": 0.4212629896083134, "grad_norm": 97.79244733216197, "learning_rate": 5e-06, "loss": 0.5302, "num_input_tokens_seen": 182169840, "step": 1054 }, { "epoch": 0.4212629896083134, "loss": 0.6665828227996826, "loss_ce": 0.008318647742271423, "loss_xval": 0.66015625, "num_input_tokens_seen": 182169840, "step": 1054 }, { "epoch": 0.4216626698641087, "grad_norm": 118.68763350662337, "learning_rate": 5e-06, "loss": 0.9278, "num_input_tokens_seen": 182342800, "step": 1055 }, { "epoch": 0.4216626698641087, "loss": 1.1125082969665527, "loss_ce": 0.01094580627977848, "loss_xval": 1.1015625, "num_input_tokens_seen": 182342800, "step": 1055 }, { "epoch": 0.42206235011990406, "grad_norm": 80.77252748394068, "learning_rate": 5e-06, "loss": 0.5525, "num_input_tokens_seen": 182515448, "step": 1056 }, { "epoch": 0.42206235011990406, "loss": 0.5583893656730652, "loss_ce": 0.007119842804968357, "loss_xval": 0.55078125, "num_input_tokens_seen": 182515448, "step": 1056 }, { "epoch": 0.42246203037569946, "grad_norm": 102.19393718382399, "learning_rate": 5e-06, "loss": 0.3007, "num_input_tokens_seen": 182688632, "step": 1057 }, { "epoch": 0.42246203037569946, "loss": 0.28939294815063477, "loss_ce": 0.009790889918804169, "loss_xval": 0.279296875, "num_input_tokens_seen": 182688632, "step": 1057 }, { "epoch": 0.4228617106314948, "grad_norm": 36.3098200607475, "learning_rate": 5e-06, "loss": 0.5062, "num_input_tokens_seen": 182861544, "step": 1058 }, { "epoch": 0.4228617106314948, "loss": 0.6533856391906738, "loss_ce": 0.0076947640627622604, "loss_xval": 0.64453125, "num_input_tokens_seen": 182861544, "step": 1058 }, { "epoch": 0.42326139088729015, "grad_norm": 172.13037659969413, "learning_rate": 5e-06, "loss": 0.593, "num_input_tokens_seen": 183034432, "step": 1059 }, { "epoch": 0.42326139088729015, "loss": 0.5093013048171997, "loss_ce": 0.006615748163312674, "loss_xval": 0.50390625, "num_input_tokens_seen": 183034432, "step": 1059 }, { "epoch": 0.42366107114308554, "grad_norm": 47.71558190089748, "learning_rate": 5e-06, "loss": 0.344, "num_input_tokens_seen": 183207152, "step": 1060 }, { "epoch": 0.42366107114308554, "loss": 0.24332204461097717, "loss_ce": 0.0058342646807432175, "loss_xval": 0.2373046875, "num_input_tokens_seen": 183207152, "step": 1060 }, { "epoch": 0.4240607513988809, "grad_norm": 225.7577370720187, "learning_rate": 5e-06, "loss": 0.8292, "num_input_tokens_seen": 183380024, "step": 1061 }, { "epoch": 0.4240607513988809, "loss": 1.1143964529037476, "loss_ce": 0.006730412133038044, "loss_xval": 1.109375, "num_input_tokens_seen": 183380024, "step": 1061 }, { "epoch": 0.4244604316546763, "grad_norm": 37.571367032139, "learning_rate": 5e-06, "loss": 0.5539, "num_input_tokens_seen": 183553232, "step": 1062 }, { "epoch": 0.4244604316546763, "loss": 0.7506046891212463, "loss_ce": 0.004358367994427681, "loss_xval": 0.74609375, "num_input_tokens_seen": 183553232, "step": 1062 }, { "epoch": 0.4248601119104716, "grad_norm": 147.5481344304394, "learning_rate": 5e-06, "loss": 0.6861, "num_input_tokens_seen": 183726032, "step": 1063 }, { "epoch": 0.4248601119104716, "loss": 0.6540185213088989, "loss_ce": 0.007289969827979803, "loss_xval": 0.6484375, "num_input_tokens_seen": 183726032, "step": 1063 }, { "epoch": 0.42525979216626697, "grad_norm": 82.0603792449001, "learning_rate": 5e-06, "loss": 0.5386, "num_input_tokens_seen": 183898888, "step": 1064 }, { "epoch": 0.42525979216626697, "loss": 0.5563596487045288, "loss_ce": 0.007897760719060898, "loss_xval": 0.546875, "num_input_tokens_seen": 183898888, "step": 1064 }, { "epoch": 0.42565947242206237, "grad_norm": 157.6689432477122, "learning_rate": 5e-06, "loss": 0.765, "num_input_tokens_seen": 184071656, "step": 1065 }, { "epoch": 0.42565947242206237, "loss": 0.8480945825576782, "loss_ce": 0.007274296134710312, "loss_xval": 0.83984375, "num_input_tokens_seen": 184071656, "step": 1065 }, { "epoch": 0.4260591526778577, "grad_norm": 73.92092672975159, "learning_rate": 5e-06, "loss": 0.6531, "num_input_tokens_seen": 184244448, "step": 1066 }, { "epoch": 0.4260591526778577, "loss": 0.43375566601753235, "loss_ce": 0.00882891844958067, "loss_xval": 0.42578125, "num_input_tokens_seen": 184244448, "step": 1066 }, { "epoch": 0.42645883293365305, "grad_norm": 137.83919435792907, "learning_rate": 5e-06, "loss": 0.7066, "num_input_tokens_seen": 184417704, "step": 1067 }, { "epoch": 0.42645883293365305, "loss": 0.5617185235023499, "loss_ce": 0.009838663972914219, "loss_xval": 0.55078125, "num_input_tokens_seen": 184417704, "step": 1067 }, { "epoch": 0.42685851318944845, "grad_norm": 12.10364180031393, "learning_rate": 5e-06, "loss": 0.5337, "num_input_tokens_seen": 184590944, "step": 1068 }, { "epoch": 0.42685851318944845, "loss": 0.6997219920158386, "loss_ce": 0.006026932038366795, "loss_xval": 0.6953125, "num_input_tokens_seen": 184590944, "step": 1068 }, { "epoch": 0.4272581934452438, "grad_norm": 97.7581622286511, "learning_rate": 5e-06, "loss": 0.4564, "num_input_tokens_seen": 184763904, "step": 1069 }, { "epoch": 0.4272581934452438, "loss": 0.5520721673965454, "loss_ce": 0.007485995534807444, "loss_xval": 0.54296875, "num_input_tokens_seen": 184763904, "step": 1069 }, { "epoch": 0.4276578737010392, "grad_norm": 44.525008948363784, "learning_rate": 5e-06, "loss": 0.5811, "num_input_tokens_seen": 184936952, "step": 1070 }, { "epoch": 0.4276578737010392, "loss": 0.7966837882995605, "loss_ce": 0.01567797176539898, "loss_xval": 0.78125, "num_input_tokens_seen": 184936952, "step": 1070 }, { "epoch": 0.42805755395683454, "grad_norm": 61.55085629886339, "learning_rate": 5e-06, "loss": 0.5968, "num_input_tokens_seen": 185109752, "step": 1071 }, { "epoch": 0.42805755395683454, "loss": 0.614800214767456, "loss_ce": 0.005913465283811092, "loss_xval": 0.609375, "num_input_tokens_seen": 185109752, "step": 1071 }, { "epoch": 0.4284572342126299, "grad_norm": 127.48039438818263, "learning_rate": 5e-06, "loss": 0.981, "num_input_tokens_seen": 185282904, "step": 1072 }, { "epoch": 0.4284572342126299, "loss": 1.1177836656570435, "loss_ce": 0.0068827904760837555, "loss_xval": 1.109375, "num_input_tokens_seen": 185282904, "step": 1072 }, { "epoch": 0.4288569144684253, "grad_norm": 101.69167477228902, "learning_rate": 5e-06, "loss": 0.5908, "num_input_tokens_seen": 185455960, "step": 1073 }, { "epoch": 0.4288569144684253, "loss": 0.5060060620307922, "loss_ce": 0.014306841418147087, "loss_xval": 0.4921875, "num_input_tokens_seen": 185455960, "step": 1073 }, { "epoch": 0.4292565947242206, "grad_norm": 79.52330036933225, "learning_rate": 5e-06, "loss": 0.7136, "num_input_tokens_seen": 185628528, "step": 1074 }, { "epoch": 0.4292565947242206, "loss": 0.9773727655410767, "loss_ce": 0.010301224887371063, "loss_xval": 0.96875, "num_input_tokens_seen": 185628528, "step": 1074 }, { "epoch": 0.42965627498001596, "grad_norm": 59.63539541971929, "learning_rate": 5e-06, "loss": 0.659, "num_input_tokens_seen": 185801848, "step": 1075 }, { "epoch": 0.42965627498001596, "loss": 1.030850887298584, "loss_ce": 0.008054335601627827, "loss_xval": 1.0234375, "num_input_tokens_seen": 185801848, "step": 1075 }, { "epoch": 0.43005595523581136, "grad_norm": 98.38438903473731, "learning_rate": 5e-06, "loss": 0.5859, "num_input_tokens_seen": 185974824, "step": 1076 }, { "epoch": 0.43005595523581136, "loss": 0.6300668716430664, "loss_ce": 0.009705590084195137, "loss_xval": 0.62109375, "num_input_tokens_seen": 185974824, "step": 1076 }, { "epoch": 0.4304556354916067, "grad_norm": 83.65142565563072, "learning_rate": 5e-06, "loss": 0.4725, "num_input_tokens_seen": 186147832, "step": 1077 }, { "epoch": 0.4304556354916067, "loss": 0.40116894245147705, "loss_ce": 0.005661151837557554, "loss_xval": 0.39453125, "num_input_tokens_seen": 186147832, "step": 1077 }, { "epoch": 0.4308553157474021, "grad_norm": 118.49626901189295, "learning_rate": 5e-06, "loss": 0.7388, "num_input_tokens_seen": 186321064, "step": 1078 }, { "epoch": 0.4308553157474021, "loss": 0.876869797706604, "loss_ce": 0.005531886592507362, "loss_xval": 0.87109375, "num_input_tokens_seen": 186321064, "step": 1078 }, { "epoch": 0.43125499600319744, "grad_norm": 109.09146812364261, "learning_rate": 5e-06, "loss": 0.5597, "num_input_tokens_seen": 186493784, "step": 1079 }, { "epoch": 0.43125499600319744, "loss": 0.3077165484428406, "loss_ce": 0.014015364460647106, "loss_xval": 0.29296875, "num_input_tokens_seen": 186493784, "step": 1079 }, { "epoch": 0.4316546762589928, "grad_norm": 153.6208097109895, "learning_rate": 5e-06, "loss": 0.659, "num_input_tokens_seen": 186666760, "step": 1080 }, { "epoch": 0.4316546762589928, "loss": 0.7934675216674805, "loss_ce": 0.0073347436264157295, "loss_xval": 0.78515625, "num_input_tokens_seen": 186666760, "step": 1080 }, { "epoch": 0.4320543565147882, "grad_norm": 104.55460590207808, "learning_rate": 5e-06, "loss": 0.533, "num_input_tokens_seen": 186839760, "step": 1081 }, { "epoch": 0.4320543565147882, "loss": 0.6861118078231812, "loss_ce": 0.011856443248689175, "loss_xval": 0.67578125, "num_input_tokens_seen": 186839760, "step": 1081 }, { "epoch": 0.4324540367705835, "grad_norm": 163.00494027996143, "learning_rate": 5e-06, "loss": 0.5373, "num_input_tokens_seen": 187013040, "step": 1082 }, { "epoch": 0.4324540367705835, "loss": 0.47731196880340576, "loss_ce": 0.005754363723099232, "loss_xval": 0.470703125, "num_input_tokens_seen": 187013040, "step": 1082 }, { "epoch": 0.43285371702637887, "grad_norm": 56.689099954633605, "learning_rate": 5e-06, "loss": 0.5203, "num_input_tokens_seen": 187186120, "step": 1083 }, { "epoch": 0.43285371702637887, "loss": 0.4869546592235565, "loss_ce": 0.004288674332201481, "loss_xval": 0.482421875, "num_input_tokens_seen": 187186120, "step": 1083 }, { "epoch": 0.43325339728217427, "grad_norm": 130.52361059914816, "learning_rate": 5e-06, "loss": 0.4672, "num_input_tokens_seen": 187359432, "step": 1084 }, { "epoch": 0.43325339728217427, "loss": 0.3323134183883667, "loss_ce": 0.005409114994108677, "loss_xval": 0.326171875, "num_input_tokens_seen": 187359432, "step": 1084 }, { "epoch": 0.4336530775379696, "grad_norm": 41.52331087326012, "learning_rate": 5e-06, "loss": 0.297, "num_input_tokens_seen": 187532856, "step": 1085 }, { "epoch": 0.4336530775379696, "loss": 0.14482995867729187, "loss_ce": 0.009652344509959221, "loss_xval": 0.134765625, "num_input_tokens_seen": 187532856, "step": 1085 }, { "epoch": 0.434052757793765, "grad_norm": 93.52508988364698, "learning_rate": 5e-06, "loss": 0.578, "num_input_tokens_seen": 187705944, "step": 1086 }, { "epoch": 0.434052757793765, "loss": 0.5481054186820984, "loss_ce": 0.00513664074242115, "loss_xval": 0.54296875, "num_input_tokens_seen": 187705944, "step": 1086 }, { "epoch": 0.43445243804956035, "grad_norm": 80.0157216132746, "learning_rate": 5e-06, "loss": 0.6773, "num_input_tokens_seen": 187878896, "step": 1087 }, { "epoch": 0.43445243804956035, "loss": 0.5329375267028809, "loss_ce": 0.009744150564074516, "loss_xval": 0.5234375, "num_input_tokens_seen": 187878896, "step": 1087 }, { "epoch": 0.4348521183053557, "grad_norm": 80.72326753655868, "learning_rate": 5e-06, "loss": 0.5067, "num_input_tokens_seen": 188051976, "step": 1088 }, { "epoch": 0.4348521183053557, "loss": 0.39130324125289917, "loss_ce": 0.011908696964383125, "loss_xval": 0.37890625, "num_input_tokens_seen": 188051976, "step": 1088 }, { "epoch": 0.4352517985611511, "grad_norm": 93.34175906908915, "learning_rate": 5e-06, "loss": 0.4928, "num_input_tokens_seen": 188224992, "step": 1089 }, { "epoch": 0.4352517985611511, "loss": 0.6273799538612366, "loss_ce": 0.007751064375042915, "loss_xval": 0.62109375, "num_input_tokens_seen": 188224992, "step": 1089 }, { "epoch": 0.43565147881694644, "grad_norm": 87.97360717139733, "learning_rate": 5e-06, "loss": 0.8327, "num_input_tokens_seen": 188397832, "step": 1090 }, { "epoch": 0.43565147881694644, "loss": 1.0488063097000122, "loss_ce": 0.006386911030858755, "loss_xval": 1.0390625, "num_input_tokens_seen": 188397832, "step": 1090 }, { "epoch": 0.43605115907274183, "grad_norm": 95.46249626601698, "learning_rate": 5e-06, "loss": 0.4634, "num_input_tokens_seen": 188570840, "step": 1091 }, { "epoch": 0.43605115907274183, "loss": 0.4953336715698242, "loss_ce": 0.010622961446642876, "loss_xval": 0.484375, "num_input_tokens_seen": 188570840, "step": 1091 }, { "epoch": 0.4364508393285372, "grad_norm": 78.63885250463996, "learning_rate": 5e-06, "loss": 0.938, "num_input_tokens_seen": 188740200, "step": 1092 }, { "epoch": 0.4364508393285372, "loss": 0.8642194271087646, "loss_ce": 0.007529974915087223, "loss_xval": 0.85546875, "num_input_tokens_seen": 188740200, "step": 1092 }, { "epoch": 0.4368505195843325, "grad_norm": 43.144393557301015, "learning_rate": 5e-06, "loss": 0.3254, "num_input_tokens_seen": 188913096, "step": 1093 }, { "epoch": 0.4368505195843325, "loss": 0.3359166979789734, "loss_ce": 0.007486535236239433, "loss_xval": 0.328125, "num_input_tokens_seen": 188913096, "step": 1093 }, { "epoch": 0.4372501998401279, "grad_norm": 40.822521318289155, "learning_rate": 5e-06, "loss": 0.6592, "num_input_tokens_seen": 189085696, "step": 1094 }, { "epoch": 0.4372501998401279, "loss": 0.490889310836792, "loss_ce": 0.007735013496130705, "loss_xval": 0.482421875, "num_input_tokens_seen": 189085696, "step": 1094 }, { "epoch": 0.43764988009592326, "grad_norm": 35.439525893578136, "learning_rate": 5e-06, "loss": 0.4484, "num_input_tokens_seen": 189258752, "step": 1095 }, { "epoch": 0.43764988009592326, "loss": 0.5019693374633789, "loss_ce": 0.00807284377515316, "loss_xval": 0.494140625, "num_input_tokens_seen": 189258752, "step": 1095 }, { "epoch": 0.4380495603517186, "grad_norm": 54.20083299822031, "learning_rate": 5e-06, "loss": 0.4249, "num_input_tokens_seen": 189431568, "step": 1096 }, { "epoch": 0.4380495603517186, "loss": 0.49263429641723633, "loss_ce": 0.006062053143978119, "loss_xval": 0.486328125, "num_input_tokens_seen": 189431568, "step": 1096 }, { "epoch": 0.438449240607514, "grad_norm": 41.375718060440676, "learning_rate": 5e-06, "loss": 0.4014, "num_input_tokens_seen": 189604344, "step": 1097 }, { "epoch": 0.438449240607514, "loss": 0.5682648420333862, "loss_ce": 0.00735173374414444, "loss_xval": 0.5625, "num_input_tokens_seen": 189604344, "step": 1097 }, { "epoch": 0.43884892086330934, "grad_norm": 68.92925320134385, "learning_rate": 5e-06, "loss": 0.6919, "num_input_tokens_seen": 189777200, "step": 1098 }, { "epoch": 0.43884892086330934, "loss": 0.9103479385375977, "loss_ce": 0.00611207727342844, "loss_xval": 0.90234375, "num_input_tokens_seen": 189777200, "step": 1098 }, { "epoch": 0.43924860111910474, "grad_norm": 87.815067117457, "learning_rate": 5e-06, "loss": 0.6971, "num_input_tokens_seen": 189950504, "step": 1099 }, { "epoch": 0.43924860111910474, "loss": 0.8111795783042908, "loss_ce": 0.009421739727258682, "loss_xval": 0.80078125, "num_input_tokens_seen": 189950504, "step": 1099 }, { "epoch": 0.4396482813749001, "grad_norm": 72.59872907713559, "learning_rate": 5e-06, "loss": 0.7176, "num_input_tokens_seen": 190123456, "step": 1100 }, { "epoch": 0.4396482813749001, "loss": 0.6115920543670654, "loss_ce": 0.004917819052934647, "loss_xval": 0.60546875, "num_input_tokens_seen": 190123456, "step": 1100 }, { "epoch": 0.44004796163069543, "grad_norm": 43.80495265395074, "learning_rate": 5e-06, "loss": 0.398, "num_input_tokens_seen": 190296440, "step": 1101 }, { "epoch": 0.44004796163069543, "loss": 0.44945085048675537, "loss_ce": 0.010089308023452759, "loss_xval": 0.439453125, "num_input_tokens_seen": 190296440, "step": 1101 }, { "epoch": 0.4404476418864908, "grad_norm": 43.888561938956094, "learning_rate": 5e-06, "loss": 0.5756, "num_input_tokens_seen": 190469448, "step": 1102 }, { "epoch": 0.4404476418864908, "loss": 0.34963488578796387, "loss_ce": 0.009516467340290546, "loss_xval": 0.33984375, "num_input_tokens_seen": 190469448, "step": 1102 }, { "epoch": 0.44084732214228617, "grad_norm": 53.76726245558171, "learning_rate": 5e-06, "loss": 0.5227, "num_input_tokens_seen": 190639032, "step": 1103 }, { "epoch": 0.44084732214228617, "loss": 0.42856094241142273, "loss_ce": 0.003573148977011442, "loss_xval": 0.42578125, "num_input_tokens_seen": 190639032, "step": 1103 }, { "epoch": 0.4412470023980815, "grad_norm": 52.770932080369946, "learning_rate": 5e-06, "loss": 0.639, "num_input_tokens_seen": 190811968, "step": 1104 }, { "epoch": 0.4412470023980815, "loss": 0.8563051819801331, "loss_ce": 0.00498683238402009, "loss_xval": 0.8515625, "num_input_tokens_seen": 190811968, "step": 1104 }, { "epoch": 0.4416466826538769, "grad_norm": 108.72621992851067, "learning_rate": 5e-06, "loss": 0.6654, "num_input_tokens_seen": 190984856, "step": 1105 }, { "epoch": 0.4416466826538769, "loss": 0.4479440748691559, "loss_ce": 0.012885487638413906, "loss_xval": 0.435546875, "num_input_tokens_seen": 190984856, "step": 1105 }, { "epoch": 0.44204636290967225, "grad_norm": 57.656324232108965, "learning_rate": 5e-06, "loss": 0.3461, "num_input_tokens_seen": 191157688, "step": 1106 }, { "epoch": 0.44204636290967225, "loss": 0.3032693564891815, "loss_ce": 0.004014000296592712, "loss_xval": 0.298828125, "num_input_tokens_seen": 191157688, "step": 1106 }, { "epoch": 0.44244604316546765, "grad_norm": 55.89449040965285, "learning_rate": 5e-06, "loss": 0.5353, "num_input_tokens_seen": 191330880, "step": 1107 }, { "epoch": 0.44244604316546765, "loss": 0.45665621757507324, "loss_ce": 0.010245123878121376, "loss_xval": 0.447265625, "num_input_tokens_seen": 191330880, "step": 1107 }, { "epoch": 0.442845723421263, "grad_norm": 94.95423178563114, "learning_rate": 5e-06, "loss": 0.3389, "num_input_tokens_seen": 191503792, "step": 1108 }, { "epoch": 0.442845723421263, "loss": 0.40393486618995667, "loss_ce": 0.009342581033706665, "loss_xval": 0.39453125, "num_input_tokens_seen": 191503792, "step": 1108 }, { "epoch": 0.44324540367705834, "grad_norm": 78.28345803795365, "learning_rate": 5e-06, "loss": 0.6797, "num_input_tokens_seen": 191676888, "step": 1109 }, { "epoch": 0.44324540367705834, "loss": 0.7096420526504517, "loss_ce": 0.00401464244350791, "loss_xval": 0.70703125, "num_input_tokens_seen": 191676888, "step": 1109 }, { "epoch": 0.44364508393285373, "grad_norm": 61.34569052749395, "learning_rate": 5e-06, "loss": 0.4274, "num_input_tokens_seen": 191850256, "step": 1110 }, { "epoch": 0.44364508393285373, "loss": 0.44284114241600037, "loss_ce": 0.004608696326613426, "loss_xval": 0.4375, "num_input_tokens_seen": 191850256, "step": 1110 }, { "epoch": 0.4440447641886491, "grad_norm": 81.82998770177993, "learning_rate": 5e-06, "loss": 0.586, "num_input_tokens_seen": 192023160, "step": 1111 }, { "epoch": 0.4440447641886491, "loss": 0.41691532731056213, "loss_ce": 0.0032190163619816303, "loss_xval": 0.4140625, "num_input_tokens_seen": 192023160, "step": 1111 }, { "epoch": 0.4444444444444444, "grad_norm": 56.85534103714712, "learning_rate": 5e-06, "loss": 0.84, "num_input_tokens_seen": 192196264, "step": 1112 }, { "epoch": 0.4444444444444444, "loss": 0.5856711268424988, "loss_ce": 0.0033957427367568016, "loss_xval": 0.58203125, "num_input_tokens_seen": 192196264, "step": 1112 }, { "epoch": 0.4448441247002398, "grad_norm": 53.319073892773986, "learning_rate": 5e-06, "loss": 0.5192, "num_input_tokens_seen": 192369400, "step": 1113 }, { "epoch": 0.4448441247002398, "loss": 0.6779003143310547, "loss_ce": 0.007734273560345173, "loss_xval": 0.671875, "num_input_tokens_seen": 192369400, "step": 1113 }, { "epoch": 0.44524380495603516, "grad_norm": 53.64924867403011, "learning_rate": 5e-06, "loss": 0.7078, "num_input_tokens_seen": 192542208, "step": 1114 }, { "epoch": 0.44524380495603516, "loss": 0.7112681865692139, "loss_ce": 0.004114857874810696, "loss_xval": 0.70703125, "num_input_tokens_seen": 192542208, "step": 1114 }, { "epoch": 0.44564348521183056, "grad_norm": 94.76319006567356, "learning_rate": 5e-06, "loss": 0.8391, "num_input_tokens_seen": 192714968, "step": 1115 }, { "epoch": 0.44564348521183056, "loss": 0.9513822793960571, "loss_ce": 0.006313872057944536, "loss_xval": 0.9453125, "num_input_tokens_seen": 192714968, "step": 1115 }, { "epoch": 0.4460431654676259, "grad_norm": 32.46405483729325, "learning_rate": 5e-06, "loss": 0.698, "num_input_tokens_seen": 192887824, "step": 1116 }, { "epoch": 0.4460431654676259, "loss": 0.3914491534233093, "loss_ce": 0.00399797223508358, "loss_xval": 0.38671875, "num_input_tokens_seen": 192887824, "step": 1116 }, { "epoch": 0.44644284572342124, "grad_norm": 86.48967997737158, "learning_rate": 5e-06, "loss": 0.649, "num_input_tokens_seen": 193061072, "step": 1117 }, { "epoch": 0.44644284572342124, "loss": 0.6223208904266357, "loss_ce": 0.003363374387845397, "loss_xval": 0.6171875, "num_input_tokens_seen": 193061072, "step": 1117 }, { "epoch": 0.44684252597921664, "grad_norm": 31.61585246517527, "learning_rate": 5e-06, "loss": 0.8002, "num_input_tokens_seen": 193234224, "step": 1118 }, { "epoch": 0.44684252597921664, "loss": 0.7989094257354736, "loss_ce": 0.004201183095574379, "loss_xval": 0.79296875, "num_input_tokens_seen": 193234224, "step": 1118 }, { "epoch": 0.447242206235012, "grad_norm": 74.71859039399737, "learning_rate": 5e-06, "loss": 0.6386, "num_input_tokens_seen": 193406944, "step": 1119 }, { "epoch": 0.447242206235012, "loss": 0.7952804565429688, "loss_ce": 0.004783664830029011, "loss_xval": 0.7890625, "num_input_tokens_seen": 193406944, "step": 1119 }, { "epoch": 0.44764188649080733, "grad_norm": 67.74121508591531, "learning_rate": 5e-06, "loss": 0.9679, "num_input_tokens_seen": 193579568, "step": 1120 }, { "epoch": 0.44764188649080733, "loss": 1.1605088710784912, "loss_ce": 0.004624995868653059, "loss_xval": 1.15625, "num_input_tokens_seen": 193579568, "step": 1120 }, { "epoch": 0.4480415667466027, "grad_norm": 159.3426492178321, "learning_rate": 5e-06, "loss": 0.6493, "num_input_tokens_seen": 193752760, "step": 1121 }, { "epoch": 0.4480415667466027, "loss": 0.7691453695297241, "loss_ce": 0.009623829275369644, "loss_xval": 0.7578125, "num_input_tokens_seen": 193752760, "step": 1121 }, { "epoch": 0.44844124700239807, "grad_norm": 19.412155259998478, "learning_rate": 5e-06, "loss": 0.4257, "num_input_tokens_seen": 193925760, "step": 1122 }, { "epoch": 0.44844124700239807, "loss": 0.35129088163375854, "loss_ce": 0.019381720572710037, "loss_xval": 0.33203125, "num_input_tokens_seen": 193925760, "step": 1122 }, { "epoch": 0.44884092725819347, "grad_norm": 132.37602488962582, "learning_rate": 5e-06, "loss": 0.6866, "num_input_tokens_seen": 194098728, "step": 1123 }, { "epoch": 0.44884092725819347, "loss": 0.681769847869873, "loss_ce": 0.011481714434921741, "loss_xval": 0.671875, "num_input_tokens_seen": 194098728, "step": 1123 }, { "epoch": 0.4492406075139888, "grad_norm": 56.93257373319191, "learning_rate": 5e-06, "loss": 0.4694, "num_input_tokens_seen": 194271896, "step": 1124 }, { "epoch": 0.4492406075139888, "loss": 0.3968222737312317, "loss_ce": 0.008516602218151093, "loss_xval": 0.388671875, "num_input_tokens_seen": 194271896, "step": 1124 }, { "epoch": 0.44964028776978415, "grad_norm": 141.95283709265078, "learning_rate": 5e-06, "loss": 0.4908, "num_input_tokens_seen": 194444824, "step": 1125 }, { "epoch": 0.44964028776978415, "loss": 0.46788841485977173, "loss_ce": 0.005364010110497475, "loss_xval": 0.462890625, "num_input_tokens_seen": 194444824, "step": 1125 }, { "epoch": 0.45003996802557955, "grad_norm": 74.83771643983036, "learning_rate": 5e-06, "loss": 0.6375, "num_input_tokens_seen": 194614360, "step": 1126 }, { "epoch": 0.45003996802557955, "loss": 0.42517510056495667, "loss_ce": 0.006016166415065527, "loss_xval": 0.419921875, "num_input_tokens_seen": 194614360, "step": 1126 }, { "epoch": 0.4504396482813749, "grad_norm": 89.4636056566743, "learning_rate": 5e-06, "loss": 0.7524, "num_input_tokens_seen": 194786968, "step": 1127 }, { "epoch": 0.4504396482813749, "loss": 0.8362482190132141, "loss_ce": 0.0059259673580527306, "loss_xval": 0.83203125, "num_input_tokens_seen": 194786968, "step": 1127 }, { "epoch": 0.45083932853717024, "grad_norm": 104.5467139927948, "learning_rate": 5e-06, "loss": 0.7947, "num_input_tokens_seen": 194959904, "step": 1128 }, { "epoch": 0.45083932853717024, "loss": 0.5858356952667236, "loss_ce": 0.0061237625777721405, "loss_xval": 0.578125, "num_input_tokens_seen": 194959904, "step": 1128 }, { "epoch": 0.45123900879296563, "grad_norm": 103.50952784722763, "learning_rate": 5e-06, "loss": 0.5609, "num_input_tokens_seen": 195133032, "step": 1129 }, { "epoch": 0.45123900879296563, "loss": 0.3132474422454834, "loss_ce": 0.005569221451878548, "loss_xval": 0.30859375, "num_input_tokens_seen": 195133032, "step": 1129 }, { "epoch": 0.451638689048761, "grad_norm": 85.79342122693706, "learning_rate": 5e-06, "loss": 0.636, "num_input_tokens_seen": 195305808, "step": 1130 }, { "epoch": 0.451638689048761, "loss": 0.5406002402305603, "loss_ce": 0.00495569733902812, "loss_xval": 0.53515625, "num_input_tokens_seen": 195305808, "step": 1130 }, { "epoch": 0.4520383693045564, "grad_norm": 57.37006766323614, "learning_rate": 5e-06, "loss": 0.516, "num_input_tokens_seen": 195478768, "step": 1131 }, { "epoch": 0.4520383693045564, "loss": 0.344220370054245, "loss_ce": 0.004925938788801432, "loss_xval": 0.33984375, "num_input_tokens_seen": 195478768, "step": 1131 }, { "epoch": 0.4524380495603517, "grad_norm": 120.1394915291317, "learning_rate": 5e-06, "loss": 0.459, "num_input_tokens_seen": 195651456, "step": 1132 }, { "epoch": 0.4524380495603517, "loss": 0.5401527881622314, "loss_ce": 0.0051185921765863895, "loss_xval": 0.53515625, "num_input_tokens_seen": 195651456, "step": 1132 }, { "epoch": 0.45283772981614706, "grad_norm": 54.69748517929681, "learning_rate": 5e-06, "loss": 0.7013, "num_input_tokens_seen": 195824560, "step": 1133 }, { "epoch": 0.45283772981614706, "loss": 0.8380387425422668, "loss_ce": 0.006373690906912088, "loss_xval": 0.83203125, "num_input_tokens_seen": 195824560, "step": 1133 }, { "epoch": 0.45323741007194246, "grad_norm": 95.7615162884942, "learning_rate": 5e-06, "loss": 0.4366, "num_input_tokens_seen": 195997400, "step": 1134 }, { "epoch": 0.45323741007194246, "loss": 0.31681621074676514, "loss_ce": 0.0037669152952730656, "loss_xval": 0.3125, "num_input_tokens_seen": 195997400, "step": 1134 }, { "epoch": 0.4536370903277378, "grad_norm": 51.51343251960469, "learning_rate": 5e-06, "loss": 0.9125, "num_input_tokens_seen": 196170648, "step": 1135 }, { "epoch": 0.4536370903277378, "loss": 0.5430760979652405, "loss_ce": 0.004868092946708202, "loss_xval": 0.5390625, "num_input_tokens_seen": 196170648, "step": 1135 }, { "epoch": 0.4540367705835332, "grad_norm": 41.699713092805354, "learning_rate": 5e-06, "loss": 0.8595, "num_input_tokens_seen": 196343824, "step": 1136 }, { "epoch": 0.4540367705835332, "loss": 0.8823078274726868, "loss_ce": 0.013533404096961021, "loss_xval": 0.8671875, "num_input_tokens_seen": 196343824, "step": 1136 }, { "epoch": 0.45443645083932854, "grad_norm": 47.49786214842191, "learning_rate": 5e-06, "loss": 0.7696, "num_input_tokens_seen": 196517272, "step": 1137 }, { "epoch": 0.45443645083932854, "loss": 0.6541973948478699, "loss_ce": 0.006553375627845526, "loss_xval": 0.6484375, "num_input_tokens_seen": 196517272, "step": 1137 }, { "epoch": 0.4548361310951239, "grad_norm": 38.86335658205585, "learning_rate": 5e-06, "loss": 0.5626, "num_input_tokens_seen": 196690024, "step": 1138 }, { "epoch": 0.4548361310951239, "loss": 0.49186328053474426, "loss_ce": 0.0104790348559618, "loss_xval": 0.48046875, "num_input_tokens_seen": 196690024, "step": 1138 }, { "epoch": 0.4552358113509193, "grad_norm": 74.08500005053307, "learning_rate": 5e-06, "loss": 0.6865, "num_input_tokens_seen": 196862856, "step": 1139 }, { "epoch": 0.4552358113509193, "loss": 0.9089970588684082, "loss_ce": 0.010742646642029285, "loss_xval": 0.8984375, "num_input_tokens_seen": 196862856, "step": 1139 }, { "epoch": 0.4556354916067146, "grad_norm": 62.43683214837666, "learning_rate": 5e-06, "loss": 0.9647, "num_input_tokens_seen": 197035824, "step": 1140 }, { "epoch": 0.4556354916067146, "loss": 1.02647864818573, "loss_ce": 0.0059097823686897755, "loss_xval": 1.0234375, "num_input_tokens_seen": 197035824, "step": 1140 }, { "epoch": 0.45603517186250997, "grad_norm": 44.531689617859236, "learning_rate": 5e-06, "loss": 0.7474, "num_input_tokens_seen": 197208824, "step": 1141 }, { "epoch": 0.45603517186250997, "loss": 0.6352089643478394, "loss_ce": 0.005020937416702509, "loss_xval": 0.62890625, "num_input_tokens_seen": 197208824, "step": 1141 }, { "epoch": 0.45643485211830537, "grad_norm": 33.00928243102856, "learning_rate": 5e-06, "loss": 0.5729, "num_input_tokens_seen": 197381696, "step": 1142 }, { "epoch": 0.45643485211830537, "loss": 0.5751632452011108, "loss_ce": 0.005094892345368862, "loss_xval": 0.5703125, "num_input_tokens_seen": 197381696, "step": 1142 }, { "epoch": 0.4568345323741007, "grad_norm": 110.0605177033546, "learning_rate": 5e-06, "loss": 0.8354, "num_input_tokens_seen": 197554728, "step": 1143 }, { "epoch": 0.4568345323741007, "loss": 0.8544121384620667, "loss_ce": 0.005535189062356949, "loss_xval": 0.84765625, "num_input_tokens_seen": 197554728, "step": 1143 }, { "epoch": 0.4572342126298961, "grad_norm": 66.44659371930612, "learning_rate": 5e-06, "loss": 0.6322, "num_input_tokens_seen": 197727824, "step": 1144 }, { "epoch": 0.4572342126298961, "loss": 0.6607143878936768, "loss_ce": 0.006173363886773586, "loss_xval": 0.65625, "num_input_tokens_seen": 197727824, "step": 1144 }, { "epoch": 0.45763389288569145, "grad_norm": 43.536496164695684, "learning_rate": 5e-06, "loss": 0.6087, "num_input_tokens_seen": 197900352, "step": 1145 }, { "epoch": 0.45763389288569145, "loss": 0.5236250162124634, "loss_ce": 0.0048262146301567554, "loss_xval": 0.51953125, "num_input_tokens_seen": 197900352, "step": 1145 }, { "epoch": 0.4580335731414868, "grad_norm": 99.45161663771454, "learning_rate": 5e-06, "loss": 0.7284, "num_input_tokens_seen": 198073304, "step": 1146 }, { "epoch": 0.4580335731414868, "loss": 0.7672001123428345, "loss_ce": 0.005206714384257793, "loss_xval": 0.76171875, "num_input_tokens_seen": 198073304, "step": 1146 }, { "epoch": 0.4584332533972822, "grad_norm": 81.21506953379286, "learning_rate": 5e-06, "loss": 0.6978, "num_input_tokens_seen": 198246440, "step": 1147 }, { "epoch": 0.4584332533972822, "loss": 0.574648380279541, "loss_ce": 0.012514561414718628, "loss_xval": 0.5625, "num_input_tokens_seen": 198246440, "step": 1147 }, { "epoch": 0.45883293365307753, "grad_norm": 79.41751309974842, "learning_rate": 5e-06, "loss": 1.0044, "num_input_tokens_seen": 198419656, "step": 1148 }, { "epoch": 0.45883293365307753, "loss": 0.8175091743469238, "loss_ce": 0.004032664000988007, "loss_xval": 0.8125, "num_input_tokens_seen": 198419656, "step": 1148 }, { "epoch": 0.4592326139088729, "grad_norm": 69.40833245914523, "learning_rate": 5e-06, "loss": 0.4482, "num_input_tokens_seen": 198592384, "step": 1149 }, { "epoch": 0.4592326139088729, "loss": 0.37215864658355713, "loss_ce": 0.00448285136371851, "loss_xval": 0.3671875, "num_input_tokens_seen": 198592384, "step": 1149 }, { "epoch": 0.4596322941646683, "grad_norm": 122.05219702328993, "learning_rate": 5e-06, "loss": 0.5041, "num_input_tokens_seen": 198765368, "step": 1150 }, { "epoch": 0.4596322941646683, "loss": 0.2772751450538635, "loss_ce": 0.010124286636710167, "loss_xval": 0.267578125, "num_input_tokens_seen": 198765368, "step": 1150 }, { "epoch": 0.4600319744204636, "grad_norm": 55.85383031494442, "learning_rate": 5e-06, "loss": 0.76, "num_input_tokens_seen": 198938392, "step": 1151 }, { "epoch": 0.4600319744204636, "loss": 0.8261213302612305, "loss_ce": 0.0045881494879722595, "loss_xval": 0.8203125, "num_input_tokens_seen": 198938392, "step": 1151 }, { "epoch": 0.460431654676259, "grad_norm": 53.35549520327359, "learning_rate": 5e-06, "loss": 0.489, "num_input_tokens_seen": 199111568, "step": 1152 }, { "epoch": 0.460431654676259, "loss": 0.6392042636871338, "loss_ce": 0.007368315011262894, "loss_xval": 0.6328125, "num_input_tokens_seen": 199111568, "step": 1152 }, { "epoch": 0.46083133493205436, "grad_norm": 41.969663825725945, "learning_rate": 5e-06, "loss": 0.5099, "num_input_tokens_seen": 199284384, "step": 1153 }, { "epoch": 0.46083133493205436, "loss": 0.5351383686065674, "loss_ce": 0.004376672208309174, "loss_xval": 0.53125, "num_input_tokens_seen": 199284384, "step": 1153 }, { "epoch": 0.4612310151878497, "grad_norm": 62.75425126705116, "learning_rate": 5e-06, "loss": 0.5918, "num_input_tokens_seen": 199457280, "step": 1154 }, { "epoch": 0.4612310151878497, "loss": 0.7125241756439209, "loss_ce": 0.006286341696977615, "loss_xval": 0.70703125, "num_input_tokens_seen": 199457280, "step": 1154 }, { "epoch": 0.4616306954436451, "grad_norm": 50.715666738129826, "learning_rate": 5e-06, "loss": 0.5002, "num_input_tokens_seen": 199630232, "step": 1155 }, { "epoch": 0.4616306954436451, "loss": 0.25332915782928467, "loss_ce": 0.0032681506127119064, "loss_xval": 0.25, "num_input_tokens_seen": 199630232, "step": 1155 }, { "epoch": 0.46203037569944044, "grad_norm": 29.166937844481254, "learning_rate": 5e-06, "loss": 0.4687, "num_input_tokens_seen": 199803208, "step": 1156 }, { "epoch": 0.46203037569944044, "loss": 0.3712252378463745, "loss_ce": 0.0047701504081487656, "loss_xval": 0.3671875, "num_input_tokens_seen": 199803208, "step": 1156 }, { "epoch": 0.4624300559552358, "grad_norm": 67.21113680973119, "learning_rate": 5e-06, "loss": 0.8676, "num_input_tokens_seen": 199976416, "step": 1157 }, { "epoch": 0.4624300559552358, "loss": 0.41837334632873535, "loss_ce": 0.003608925500884652, "loss_xval": 0.4140625, "num_input_tokens_seen": 199976416, "step": 1157 }, { "epoch": 0.4628297362110312, "grad_norm": 36.177291288625426, "learning_rate": 5e-06, "loss": 0.4889, "num_input_tokens_seen": 200149720, "step": 1158 }, { "epoch": 0.4628297362110312, "loss": 0.7105360627174377, "loss_ce": 0.005763140507042408, "loss_xval": 0.703125, "num_input_tokens_seen": 200149720, "step": 1158 }, { "epoch": 0.4632294164668265, "grad_norm": 144.37390819755106, "learning_rate": 5e-06, "loss": 0.3905, "num_input_tokens_seen": 200322696, "step": 1159 }, { "epoch": 0.4632294164668265, "loss": 0.48566287755966187, "loss_ce": 0.0023254724219441414, "loss_xval": 0.482421875, "num_input_tokens_seen": 200322696, "step": 1159 }, { "epoch": 0.4636290967226219, "grad_norm": 34.84183224871753, "learning_rate": 5e-06, "loss": 0.2845, "num_input_tokens_seen": 200496096, "step": 1160 }, { "epoch": 0.4636290967226219, "loss": 0.27525389194488525, "loss_ce": 0.0022894316352903843, "loss_xval": 0.2734375, "num_input_tokens_seen": 200496096, "step": 1160 }, { "epoch": 0.46402877697841727, "grad_norm": 52.0086165058688, "learning_rate": 5e-06, "loss": 0.6896, "num_input_tokens_seen": 200669288, "step": 1161 }, { "epoch": 0.46402877697841727, "loss": 1.063295602798462, "loss_ce": 0.009096423164010048, "loss_xval": 1.0546875, "num_input_tokens_seen": 200669288, "step": 1161 }, { "epoch": 0.4644284572342126, "grad_norm": 48.3637411249726, "learning_rate": 5e-06, "loss": 0.5238, "num_input_tokens_seen": 200842184, "step": 1162 }, { "epoch": 0.4644284572342126, "loss": 0.41702714562416077, "loss_ce": 0.00394120067358017, "loss_xval": 0.4140625, "num_input_tokens_seen": 200842184, "step": 1162 }, { "epoch": 0.464828137490008, "grad_norm": 95.71936689991736, "learning_rate": 5e-06, "loss": 0.5964, "num_input_tokens_seen": 201015320, "step": 1163 }, { "epoch": 0.464828137490008, "loss": 0.45856761932373047, "loss_ce": 0.002146715298295021, "loss_xval": 0.45703125, "num_input_tokens_seen": 201015320, "step": 1163 }, { "epoch": 0.46522781774580335, "grad_norm": 47.786200135317, "learning_rate": 5e-06, "loss": 0.8803, "num_input_tokens_seen": 201187928, "step": 1164 }, { "epoch": 0.46522781774580335, "loss": 1.0784153938293457, "loss_ce": 0.007370521314442158, "loss_xval": 1.0703125, "num_input_tokens_seen": 201187928, "step": 1164 }, { "epoch": 0.4656274980015987, "grad_norm": 70.62039457961404, "learning_rate": 5e-06, "loss": 0.9221, "num_input_tokens_seen": 201360888, "step": 1165 }, { "epoch": 0.4656274980015987, "loss": 0.8973113298416138, "loss_ce": 0.004855302162468433, "loss_xval": 0.890625, "num_input_tokens_seen": 201360888, "step": 1165 }, { "epoch": 0.4660271782573941, "grad_norm": 35.876824576405056, "learning_rate": 5e-06, "loss": 0.4456, "num_input_tokens_seen": 201533736, "step": 1166 }, { "epoch": 0.4660271782573941, "loss": 0.5968400835990906, "loss_ce": 0.006630140822380781, "loss_xval": 0.58984375, "num_input_tokens_seen": 201533736, "step": 1166 }, { "epoch": 0.46642685851318944, "grad_norm": 63.15386629040931, "learning_rate": 5e-06, "loss": 0.683, "num_input_tokens_seen": 201706496, "step": 1167 }, { "epoch": 0.46642685851318944, "loss": 0.5146918296813965, "loss_ce": 0.0032782740890979767, "loss_xval": 0.51171875, "num_input_tokens_seen": 201706496, "step": 1167 }, { "epoch": 0.46682653876898483, "grad_norm": 51.66674260877597, "learning_rate": 5e-06, "loss": 0.6915, "num_input_tokens_seen": 201879464, "step": 1168 }, { "epoch": 0.46682653876898483, "loss": 1.018492341041565, "loss_ce": 0.00811633188277483, "loss_xval": 1.0078125, "num_input_tokens_seen": 201879464, "step": 1168 }, { "epoch": 0.4672262190247802, "grad_norm": 49.273976329122995, "learning_rate": 5e-06, "loss": 0.376, "num_input_tokens_seen": 202052384, "step": 1169 }, { "epoch": 0.4672262190247802, "loss": 0.39832669496536255, "loss_ce": 0.003398712258785963, "loss_xval": 0.39453125, "num_input_tokens_seen": 202052384, "step": 1169 }, { "epoch": 0.4676258992805755, "grad_norm": 101.63398509107819, "learning_rate": 5e-06, "loss": 0.6936, "num_input_tokens_seen": 202225344, "step": 1170 }, { "epoch": 0.4676258992805755, "loss": 0.5724492073059082, "loss_ce": 0.004242459312081337, "loss_xval": 0.56640625, "num_input_tokens_seen": 202225344, "step": 1170 }, { "epoch": 0.4680255795363709, "grad_norm": 21.379650294342827, "learning_rate": 5e-06, "loss": 0.4938, "num_input_tokens_seen": 202398136, "step": 1171 }, { "epoch": 0.4680255795363709, "loss": 0.41817671060562134, "loss_ce": 0.0021305850241333246, "loss_xval": 0.416015625, "num_input_tokens_seen": 202398136, "step": 1171 }, { "epoch": 0.46842525979216626, "grad_norm": 70.14687736374864, "learning_rate": 5e-06, "loss": 0.695, "num_input_tokens_seen": 202571104, "step": 1172 }, { "epoch": 0.46842525979216626, "loss": 0.5816267728805542, "loss_ce": 0.005393843166530132, "loss_xval": 0.578125, "num_input_tokens_seen": 202571104, "step": 1172 }, { "epoch": 0.46882494004796166, "grad_norm": 24.91607803294252, "learning_rate": 5e-06, "loss": 0.6048, "num_input_tokens_seen": 202744160, "step": 1173 }, { "epoch": 0.46882494004796166, "loss": 0.5543652176856995, "loss_ce": 0.004911464173346758, "loss_xval": 0.55078125, "num_input_tokens_seen": 202744160, "step": 1173 }, { "epoch": 0.469224620303757, "grad_norm": 59.37963682222382, "learning_rate": 5e-06, "loss": 0.4961, "num_input_tokens_seen": 202916928, "step": 1174 }, { "epoch": 0.469224620303757, "loss": 0.5824704170227051, "loss_ce": 0.008221141993999481, "loss_xval": 0.57421875, "num_input_tokens_seen": 202916928, "step": 1174 }, { "epoch": 0.46962430055955234, "grad_norm": 61.38563783071624, "learning_rate": 5e-06, "loss": 0.9222, "num_input_tokens_seen": 203090040, "step": 1175 }, { "epoch": 0.46962430055955234, "loss": 0.6337473392486572, "loss_ce": 0.003986579366028309, "loss_xval": 0.62890625, "num_input_tokens_seen": 203090040, "step": 1175 }, { "epoch": 0.47002398081534774, "grad_norm": 30.520568694364325, "learning_rate": 5e-06, "loss": 0.5868, "num_input_tokens_seen": 203260272, "step": 1176 }, { "epoch": 0.47002398081534774, "loss": 0.7021505236625671, "loss_ce": 0.009065819904208183, "loss_xval": 0.69140625, "num_input_tokens_seen": 203260272, "step": 1176 }, { "epoch": 0.4704236610711431, "grad_norm": 114.80561125694719, "learning_rate": 5e-06, "loss": 0.6283, "num_input_tokens_seen": 203433192, "step": 1177 }, { "epoch": 0.4704236610711431, "loss": 0.46350085735321045, "loss_ce": 0.004699579905718565, "loss_xval": 0.458984375, "num_input_tokens_seen": 203433192, "step": 1177 }, { "epoch": 0.4708233413269384, "grad_norm": 90.34627311896233, "learning_rate": 5e-06, "loss": 0.5641, "num_input_tokens_seen": 203606144, "step": 1178 }, { "epoch": 0.4708233413269384, "loss": 0.4923925995826721, "loss_ce": 0.007163085043430328, "loss_xval": 0.484375, "num_input_tokens_seen": 203606144, "step": 1178 }, { "epoch": 0.4712230215827338, "grad_norm": 71.19184014400291, "learning_rate": 5e-06, "loss": 0.5977, "num_input_tokens_seen": 203779104, "step": 1179 }, { "epoch": 0.4712230215827338, "loss": 0.6201607584953308, "loss_ce": 0.0019356525735929608, "loss_xval": 0.6171875, "num_input_tokens_seen": 203779104, "step": 1179 }, { "epoch": 0.47162270183852917, "grad_norm": 46.09540182862563, "learning_rate": 5e-06, "loss": 0.7048, "num_input_tokens_seen": 203951968, "step": 1180 }, { "epoch": 0.47162270183852917, "loss": 0.6933472752571106, "loss_ce": 0.003711057361215353, "loss_xval": 0.69140625, "num_input_tokens_seen": 203951968, "step": 1180 }, { "epoch": 0.47202238209432457, "grad_norm": 85.31817735510187, "learning_rate": 5e-06, "loss": 0.4103, "num_input_tokens_seen": 204124320, "step": 1181 }, { "epoch": 0.47202238209432457, "loss": 0.45448797941207886, "loss_ce": 0.0026142210699617863, "loss_xval": 0.451171875, "num_input_tokens_seen": 204124320, "step": 1181 }, { "epoch": 0.4724220623501199, "grad_norm": 62.585221065215585, "learning_rate": 5e-06, "loss": 0.5123, "num_input_tokens_seen": 204296704, "step": 1182 }, { "epoch": 0.4724220623501199, "loss": 0.753317654132843, "loss_ce": 0.004050097428262234, "loss_xval": 0.75, "num_input_tokens_seen": 204296704, "step": 1182 }, { "epoch": 0.47282174260591525, "grad_norm": 90.40966754196259, "learning_rate": 5e-06, "loss": 0.6652, "num_input_tokens_seen": 204469256, "step": 1183 }, { "epoch": 0.47282174260591525, "loss": 0.42565417289733887, "loss_ce": 0.003901238553225994, "loss_xval": 0.421875, "num_input_tokens_seen": 204469256, "step": 1183 }, { "epoch": 0.47322142286171065, "grad_norm": 54.39935747092517, "learning_rate": 5e-06, "loss": 0.7614, "num_input_tokens_seen": 204642296, "step": 1184 }, { "epoch": 0.47322142286171065, "loss": 0.5638756155967712, "loss_ce": 0.005281836725771427, "loss_xval": 0.55859375, "num_input_tokens_seen": 204642296, "step": 1184 }, { "epoch": 0.473621103117506, "grad_norm": 69.45566268019499, "learning_rate": 5e-06, "loss": 0.7909, "num_input_tokens_seen": 204815504, "step": 1185 }, { "epoch": 0.473621103117506, "loss": 0.9263577461242676, "loss_ce": 0.0036892304196953773, "loss_xval": 0.921875, "num_input_tokens_seen": 204815504, "step": 1185 }, { "epoch": 0.47402078337330134, "grad_norm": 47.95557312190612, "learning_rate": 5e-06, "loss": 0.5052, "num_input_tokens_seen": 204988160, "step": 1186 }, { "epoch": 0.47402078337330134, "loss": 0.42373624444007874, "loss_ce": 0.004943536594510078, "loss_xval": 0.41796875, "num_input_tokens_seen": 204988160, "step": 1186 }, { "epoch": 0.47442046362909673, "grad_norm": 80.91810811660899, "learning_rate": 5e-06, "loss": 0.4477, "num_input_tokens_seen": 205160608, "step": 1187 }, { "epoch": 0.47442046362909673, "loss": 0.27876970171928406, "loss_ce": 0.0027076760306954384, "loss_xval": 0.275390625, "num_input_tokens_seen": 205160608, "step": 1187 }, { "epoch": 0.4748201438848921, "grad_norm": 89.19904810541342, "learning_rate": 5e-06, "loss": 0.8543, "num_input_tokens_seen": 205333520, "step": 1188 }, { "epoch": 0.4748201438848921, "loss": 1.167051076889038, "loss_ce": 0.00945834070444107, "loss_xval": 1.15625, "num_input_tokens_seen": 205333520, "step": 1188 }, { "epoch": 0.4752198241406875, "grad_norm": 129.51637700147677, "learning_rate": 5e-06, "loss": 0.6642, "num_input_tokens_seen": 205505920, "step": 1189 }, { "epoch": 0.4752198241406875, "loss": 0.6400755643844604, "loss_ce": 0.003539936849847436, "loss_xval": 0.63671875, "num_input_tokens_seen": 205505920, "step": 1189 }, { "epoch": 0.4756195043964828, "grad_norm": 147.50728598390194, "learning_rate": 5e-06, "loss": 0.7091, "num_input_tokens_seen": 205679128, "step": 1190 }, { "epoch": 0.4756195043964828, "loss": 0.9833164215087891, "loss_ce": 0.005899389274418354, "loss_xval": 0.9765625, "num_input_tokens_seen": 205679128, "step": 1190 }, { "epoch": 0.47601918465227816, "grad_norm": 90.94767641525118, "learning_rate": 5e-06, "loss": 0.7318, "num_input_tokens_seen": 205851816, "step": 1191 }, { "epoch": 0.47601918465227816, "loss": 0.7062619924545288, "loss_ce": 0.006860113237053156, "loss_xval": 0.69921875, "num_input_tokens_seen": 205851816, "step": 1191 }, { "epoch": 0.47641886490807356, "grad_norm": 149.49872792375933, "learning_rate": 5e-06, "loss": 0.5292, "num_input_tokens_seen": 206024792, "step": 1192 }, { "epoch": 0.47641886490807356, "loss": 0.5177109837532043, "loss_ce": 0.005381924565881491, "loss_xval": 0.51171875, "num_input_tokens_seen": 206024792, "step": 1192 }, { "epoch": 0.4768185451638689, "grad_norm": 36.85115396826063, "learning_rate": 5e-06, "loss": 0.4891, "num_input_tokens_seen": 206197976, "step": 1193 }, { "epoch": 0.4768185451638689, "loss": 0.6350235939025879, "loss_ce": 0.006605636328458786, "loss_xval": 0.62890625, "num_input_tokens_seen": 206197976, "step": 1193 }, { "epoch": 0.47721822541966424, "grad_norm": 45.87647325830105, "learning_rate": 5e-06, "loss": 0.3795, "num_input_tokens_seen": 206370960, "step": 1194 }, { "epoch": 0.47721822541966424, "loss": 0.31491148471832275, "loss_ce": 0.004608749412000179, "loss_xval": 0.310546875, "num_input_tokens_seen": 206370960, "step": 1194 }, { "epoch": 0.47761790567545964, "grad_norm": 112.11711434198487, "learning_rate": 5e-06, "loss": 0.7471, "num_input_tokens_seen": 206544200, "step": 1195 }, { "epoch": 0.47761790567545964, "loss": 0.9842346906661987, "loss_ce": 0.004620386753231287, "loss_xval": 0.98046875, "num_input_tokens_seen": 206544200, "step": 1195 }, { "epoch": 0.478017585931255, "grad_norm": 65.02709178533176, "learning_rate": 5e-06, "loss": 0.7812, "num_input_tokens_seen": 206716880, "step": 1196 }, { "epoch": 0.478017585931255, "loss": 1.0724844932556152, "loss_ce": 0.012914232909679413, "loss_xval": 1.0625, "num_input_tokens_seen": 206716880, "step": 1196 }, { "epoch": 0.4784172661870504, "grad_norm": 64.89552423131474, "learning_rate": 5e-06, "loss": 0.6697, "num_input_tokens_seen": 206889208, "step": 1197 }, { "epoch": 0.4784172661870504, "loss": 0.5000320076942444, "loss_ce": 0.006379666738212109, "loss_xval": 0.494140625, "num_input_tokens_seen": 206889208, "step": 1197 }, { "epoch": 0.4788169464428457, "grad_norm": 92.45595988265173, "learning_rate": 5e-06, "loss": 0.4587, "num_input_tokens_seen": 207062608, "step": 1198 }, { "epoch": 0.4788169464428457, "loss": 0.6541969776153564, "loss_ce": 0.0036232522688806057, "loss_xval": 0.65234375, "num_input_tokens_seen": 207062608, "step": 1198 }, { "epoch": 0.47921662669864107, "grad_norm": 82.54766047585346, "learning_rate": 5e-06, "loss": 0.6257, "num_input_tokens_seen": 207235096, "step": 1199 }, { "epoch": 0.47921662669864107, "loss": 0.689261257648468, "loss_ce": 0.007254431024193764, "loss_xval": 0.68359375, "num_input_tokens_seen": 207235096, "step": 1199 }, { "epoch": 0.47961630695443647, "grad_norm": 57.082823434112576, "learning_rate": 5e-06, "loss": 0.5072, "num_input_tokens_seen": 207408280, "step": 1200 }, { "epoch": 0.47961630695443647, "loss": 0.5904377102851868, "loss_ce": 0.003889882005751133, "loss_xval": 0.5859375, "num_input_tokens_seen": 207408280, "step": 1200 }, { "epoch": 0.4800159872102318, "grad_norm": 37.253625537767235, "learning_rate": 5e-06, "loss": 0.7844, "num_input_tokens_seen": 207581328, "step": 1201 }, { "epoch": 0.4800159872102318, "loss": 0.5707876682281494, "loss_ce": 0.0070670172572135925, "loss_xval": 0.5625, "num_input_tokens_seen": 207581328, "step": 1201 }, { "epoch": 0.48041566746602715, "grad_norm": 126.39021086125909, "learning_rate": 5e-06, "loss": 0.47, "num_input_tokens_seen": 207754272, "step": 1202 }, { "epoch": 0.48041566746602715, "loss": 0.369029700756073, "loss_ce": 0.005626373924314976, "loss_xval": 0.36328125, "num_input_tokens_seen": 207754272, "step": 1202 }, { "epoch": 0.48081534772182255, "grad_norm": 27.399518912536287, "learning_rate": 5e-06, "loss": 0.4326, "num_input_tokens_seen": 207926864, "step": 1203 }, { "epoch": 0.48081534772182255, "loss": 0.48411285877227783, "loss_ce": 0.0038882247172296047, "loss_xval": 0.48046875, "num_input_tokens_seen": 207926864, "step": 1203 }, { "epoch": 0.4812150279776179, "grad_norm": 133.10942772969344, "learning_rate": 5e-06, "loss": 0.8218, "num_input_tokens_seen": 208099896, "step": 1204 }, { "epoch": 0.4812150279776179, "loss": 0.7755193114280701, "loss_ce": 0.012824056670069695, "loss_xval": 0.76171875, "num_input_tokens_seen": 208099896, "step": 1204 }, { "epoch": 0.4816147082334133, "grad_norm": 39.05446343546031, "learning_rate": 5e-06, "loss": 0.5318, "num_input_tokens_seen": 208272824, "step": 1205 }, { "epoch": 0.4816147082334133, "loss": 0.4078354239463806, "loss_ce": 0.0037826700136065483, "loss_xval": 0.404296875, "num_input_tokens_seen": 208272824, "step": 1205 }, { "epoch": 0.48201438848920863, "grad_norm": 141.26378845226603, "learning_rate": 5e-06, "loss": 0.9037, "num_input_tokens_seen": 208445320, "step": 1206 }, { "epoch": 0.48201438848920863, "loss": 0.4204305112361908, "loss_ce": 0.007832853123545647, "loss_xval": 0.412109375, "num_input_tokens_seen": 208445320, "step": 1206 }, { "epoch": 0.482414068745004, "grad_norm": 65.24571131237124, "learning_rate": 5e-06, "loss": 0.781, "num_input_tokens_seen": 208618264, "step": 1207 }, { "epoch": 0.482414068745004, "loss": 1.0000226497650146, "loss_ce": 0.005759958643466234, "loss_xval": 0.99609375, "num_input_tokens_seen": 208618264, "step": 1207 }, { "epoch": 0.4828137490007994, "grad_norm": 93.46614098455318, "learning_rate": 5e-06, "loss": 0.6181, "num_input_tokens_seen": 208791320, "step": 1208 }, { "epoch": 0.4828137490007994, "loss": 0.5935106873512268, "loss_ce": 0.007390075363218784, "loss_xval": 0.5859375, "num_input_tokens_seen": 208791320, "step": 1208 }, { "epoch": 0.4832134292565947, "grad_norm": 197.0479425394529, "learning_rate": 5e-06, "loss": 0.7052, "num_input_tokens_seen": 208964360, "step": 1209 }, { "epoch": 0.4832134292565947, "loss": 0.6921088695526123, "loss_ce": 0.00509719830006361, "loss_xval": 0.6875, "num_input_tokens_seen": 208964360, "step": 1209 }, { "epoch": 0.48361310951239006, "grad_norm": 128.35610143451706, "learning_rate": 5e-06, "loss": 0.6615, "num_input_tokens_seen": 209137008, "step": 1210 }, { "epoch": 0.48361310951239006, "loss": 0.6662713289260864, "loss_ce": 0.0055047329515218735, "loss_xval": 0.66015625, "num_input_tokens_seen": 209137008, "step": 1210 }, { "epoch": 0.48401278976818546, "grad_norm": 97.43877540168052, "learning_rate": 5e-06, "loss": 0.4814, "num_input_tokens_seen": 209309968, "step": 1211 }, { "epoch": 0.48401278976818546, "loss": 0.5262230634689331, "loss_ce": 0.005654177628457546, "loss_xval": 0.51953125, "num_input_tokens_seen": 209309968, "step": 1211 }, { "epoch": 0.4844124700239808, "grad_norm": 72.44506988992362, "learning_rate": 5e-06, "loss": 0.5673, "num_input_tokens_seen": 209483056, "step": 1212 }, { "epoch": 0.4844124700239808, "loss": 0.6535665988922119, "loss_ce": 0.005800464190542698, "loss_xval": 0.6484375, "num_input_tokens_seen": 209483056, "step": 1212 }, { "epoch": 0.4848121502797762, "grad_norm": 82.26685181094909, "learning_rate": 5e-06, "loss": 0.2986, "num_input_tokens_seen": 209655808, "step": 1213 }, { "epoch": 0.4848121502797762, "loss": 0.24768781661987305, "loss_ce": 0.005134111270308495, "loss_xval": 0.2421875, "num_input_tokens_seen": 209655808, "step": 1213 }, { "epoch": 0.48521183053557154, "grad_norm": 2883.265707805072, "learning_rate": 5e-06, "loss": 3.4834, "num_input_tokens_seen": 209828696, "step": 1214 }, { "epoch": 0.48521183053557154, "loss": 6.084342002868652, "loss_ce": 0.01479253824800253, "loss_xval": 6.0625, "num_input_tokens_seen": 209828696, "step": 1214 }, { "epoch": 0.4856115107913669, "grad_norm": 88.50388639064461, "learning_rate": 5e-06, "loss": 0.7424, "num_input_tokens_seen": 210001296, "step": 1215 }, { "epoch": 0.4856115107913669, "loss": 1.1838319301605225, "loss_ce": 0.00505995936691761, "loss_xval": 1.1796875, "num_input_tokens_seen": 210001296, "step": 1215 }, { "epoch": 0.4860111910471623, "grad_norm": 103.36056926681852, "learning_rate": 5e-06, "loss": 0.9452, "num_input_tokens_seen": 210174048, "step": 1216 }, { "epoch": 0.4860111910471623, "loss": 0.7499319314956665, "loss_ce": 0.10924588143825531, "loss_xval": 0.640625, "num_input_tokens_seen": 210174048, "step": 1216 }, { "epoch": 0.4864108713029576, "grad_norm": 55.89331329285388, "learning_rate": 5e-06, "loss": 0.8338, "num_input_tokens_seen": 210347120, "step": 1217 }, { "epoch": 0.4864108713029576, "loss": 0.9395196437835693, "loss_ce": 0.15698786079883575, "loss_xval": 0.78125, "num_input_tokens_seen": 210347120, "step": 1217 }, { "epoch": 0.486810551558753, "grad_norm": 67.56062531813605, "learning_rate": 5e-06, "loss": 0.8645, "num_input_tokens_seen": 210519800, "step": 1218 }, { "epoch": 0.486810551558753, "loss": 0.8125213980674744, "loss_ce": 0.18675847351551056, "loss_xval": 0.625, "num_input_tokens_seen": 210519800, "step": 1218 }, { "epoch": 0.48721023181454837, "grad_norm": 71.24757754808115, "learning_rate": 5e-06, "loss": 0.8123, "num_input_tokens_seen": 210692952, "step": 1219 }, { "epoch": 0.48721023181454837, "loss": 0.9922150373458862, "loss_ce": 0.23125924170017242, "loss_xval": 0.76171875, "num_input_tokens_seen": 210692952, "step": 1219 }, { "epoch": 0.4876099120703437, "grad_norm": 114.99129622606719, "learning_rate": 5e-06, "loss": 0.8695, "num_input_tokens_seen": 210862584, "step": 1220 }, { "epoch": 0.4876099120703437, "loss": 0.7880289554595947, "loss_ce": 0.1557047963142395, "loss_xval": 0.6328125, "num_input_tokens_seen": 210862584, "step": 1220 }, { "epoch": 0.4880095923261391, "grad_norm": 37.624977084364716, "learning_rate": 5e-06, "loss": 0.8068, "num_input_tokens_seen": 211035584, "step": 1221 }, { "epoch": 0.4880095923261391, "loss": 0.8313114047050476, "loss_ce": 0.13703647255897522, "loss_xval": 0.6953125, "num_input_tokens_seen": 211035584, "step": 1221 }, { "epoch": 0.48840927258193445, "grad_norm": 42.83229604580505, "learning_rate": 5e-06, "loss": 0.6414, "num_input_tokens_seen": 211208152, "step": 1222 }, { "epoch": 0.48840927258193445, "loss": 0.6869634985923767, "loss_ce": 0.12171684950590134, "loss_xval": 0.56640625, "num_input_tokens_seen": 211208152, "step": 1222 }, { "epoch": 0.4888089528377298, "grad_norm": 37.91970336856216, "learning_rate": 5e-06, "loss": 0.8735, "num_input_tokens_seen": 211380896, "step": 1223 }, { "epoch": 0.4888089528377298, "loss": 0.47167566418647766, "loss_ce": 0.15661218762397766, "loss_xval": 0.314453125, "num_input_tokens_seen": 211380896, "step": 1223 }, { "epoch": 0.4892086330935252, "grad_norm": 38.52062556987781, "learning_rate": 5e-06, "loss": 0.8148, "num_input_tokens_seen": 211553704, "step": 1224 }, { "epoch": 0.4892086330935252, "loss": 0.5824769735336304, "loss_ce": 0.11915907263755798, "loss_xval": 0.462890625, "num_input_tokens_seen": 211553704, "step": 1224 }, { "epoch": 0.48960831334932053, "grad_norm": 81.00693493140086, "learning_rate": 5e-06, "loss": 0.7234, "num_input_tokens_seen": 211726768, "step": 1225 }, { "epoch": 0.48960831334932053, "loss": 0.9797881841659546, "loss_ce": 0.1069854348897934, "loss_xval": 0.87109375, "num_input_tokens_seen": 211726768, "step": 1225 }, { "epoch": 0.49000799360511593, "grad_norm": 37.58450026948904, "learning_rate": 5e-06, "loss": 0.6889, "num_input_tokens_seen": 211899872, "step": 1226 }, { "epoch": 0.49000799360511593, "loss": 0.9251545667648315, "loss_ce": 0.10462842881679535, "loss_xval": 0.8203125, "num_input_tokens_seen": 211899872, "step": 1226 }, { "epoch": 0.4904076738609113, "grad_norm": 42.74553283537623, "learning_rate": 5e-06, "loss": 0.917, "num_input_tokens_seen": 212072968, "step": 1227 }, { "epoch": 0.4904076738609113, "loss": 1.1364541053771973, "loss_ce": 0.09052520245313644, "loss_xval": 1.046875, "num_input_tokens_seen": 212072968, "step": 1227 }, { "epoch": 0.4908073541167066, "grad_norm": 39.42896536840982, "learning_rate": 5e-06, "loss": 0.4878, "num_input_tokens_seen": 212246136, "step": 1228 }, { "epoch": 0.4908073541167066, "loss": 0.45796746015548706, "loss_ce": 0.05721063166856766, "loss_xval": 0.400390625, "num_input_tokens_seen": 212246136, "step": 1228 }, { "epoch": 0.491207034372502, "grad_norm": 49.5355844201104, "learning_rate": 5e-06, "loss": 1.0534, "num_input_tokens_seen": 212419256, "step": 1229 }, { "epoch": 0.491207034372502, "loss": 1.6057854890823364, "loss_ce": 0.05982610583305359, "loss_xval": 1.546875, "num_input_tokens_seen": 212419256, "step": 1229 }, { "epoch": 0.49160671462829736, "grad_norm": 53.85482243731108, "learning_rate": 5e-06, "loss": 0.4841, "num_input_tokens_seen": 212592272, "step": 1230 }, { "epoch": 0.49160671462829736, "loss": 0.3914153575897217, "loss_ce": 0.05328058823943138, "loss_xval": 0.337890625, "num_input_tokens_seen": 212592272, "step": 1230 }, { "epoch": 0.4920063948840927, "grad_norm": 29.615996478413024, "learning_rate": 5e-06, "loss": 0.6303, "num_input_tokens_seen": 212765424, "step": 1231 }, { "epoch": 0.4920063948840927, "loss": 0.8910754919052124, "loss_ce": 0.04480774700641632, "loss_xval": 0.84765625, "num_input_tokens_seen": 212765424, "step": 1231 }, { "epoch": 0.4924060751398881, "grad_norm": 22.122138227281248, "learning_rate": 5e-06, "loss": 0.6796, "num_input_tokens_seen": 212938128, "step": 1232 }, { "epoch": 0.4924060751398881, "loss": 0.9208757281303406, "loss_ce": 0.03565235063433647, "loss_xval": 0.88671875, "num_input_tokens_seen": 212938128, "step": 1232 }, { "epoch": 0.49280575539568344, "grad_norm": 37.455588321599855, "learning_rate": 5e-06, "loss": 0.5403, "num_input_tokens_seen": 213111032, "step": 1233 }, { "epoch": 0.49280575539568344, "loss": 0.3303181231021881, "loss_ce": 0.027003923431038857, "loss_xval": 0.302734375, "num_input_tokens_seen": 213111032, "step": 1233 }, { "epoch": 0.49320543565147884, "grad_norm": 26.199194983743542, "learning_rate": 5e-06, "loss": 0.5541, "num_input_tokens_seen": 213284168, "step": 1234 }, { "epoch": 0.49320543565147884, "loss": 0.5545949935913086, "loss_ce": 0.04916280135512352, "loss_xval": 0.50390625, "num_input_tokens_seen": 213284168, "step": 1234 }, { "epoch": 0.4936051159072742, "grad_norm": 48.729256475162536, "learning_rate": 5e-06, "loss": 0.6151, "num_input_tokens_seen": 213456952, "step": 1235 }, { "epoch": 0.4936051159072742, "loss": 0.6374036073684692, "loss_ce": 0.032087456434965134, "loss_xval": 0.60546875, "num_input_tokens_seen": 213456952, "step": 1235 }, { "epoch": 0.4940047961630695, "grad_norm": 43.01829750124253, "learning_rate": 5e-06, "loss": 0.4977, "num_input_tokens_seen": 213630232, "step": 1236 }, { "epoch": 0.4940047961630695, "loss": 0.4653409421443939, "loss_ce": 0.024789176881313324, "loss_xval": 0.44140625, "num_input_tokens_seen": 213630232, "step": 1236 }, { "epoch": 0.4944044764188649, "grad_norm": 32.54880413499179, "learning_rate": 5e-06, "loss": 0.3181, "num_input_tokens_seen": 213803368, "step": 1237 }, { "epoch": 0.4944044764188649, "loss": 0.4874266982078552, "loss_ce": 0.01226796768605709, "loss_xval": 0.474609375, "num_input_tokens_seen": 213803368, "step": 1237 }, { "epoch": 0.49480415667466027, "grad_norm": 45.33323015772085, "learning_rate": 5e-06, "loss": 0.793, "num_input_tokens_seen": 213976184, "step": 1238 }, { "epoch": 0.49480415667466027, "loss": 0.8546011447906494, "loss_ce": 0.0273917093873024, "loss_xval": 0.828125, "num_input_tokens_seen": 213976184, "step": 1238 }, { "epoch": 0.4952038369304556, "grad_norm": 58.6602616771861, "learning_rate": 5e-06, "loss": 0.6672, "num_input_tokens_seen": 214149208, "step": 1239 }, { "epoch": 0.4952038369304556, "loss": 0.7796196341514587, "loss_ce": 0.021196816116571426, "loss_xval": 0.7578125, "num_input_tokens_seen": 214149208, "step": 1239 }, { "epoch": 0.495603517186251, "grad_norm": 41.12794193465622, "learning_rate": 5e-06, "loss": 0.4236, "num_input_tokens_seen": 214322368, "step": 1240 }, { "epoch": 0.495603517186251, "loss": 0.352802038192749, "loss_ce": 0.019183889031410217, "loss_xval": 0.333984375, "num_input_tokens_seen": 214322368, "step": 1240 }, { "epoch": 0.49600319744204635, "grad_norm": 57.576106531202115, "learning_rate": 5e-06, "loss": 0.5261, "num_input_tokens_seen": 214494992, "step": 1241 }, { "epoch": 0.49600319744204635, "loss": 0.36257174611091614, "loss_ce": 0.01869969069957733, "loss_xval": 0.34375, "num_input_tokens_seen": 214494992, "step": 1241 }, { "epoch": 0.49640287769784175, "grad_norm": 44.020812396343615, "learning_rate": 5e-06, "loss": 0.5488, "num_input_tokens_seen": 214668120, "step": 1242 }, { "epoch": 0.49640287769784175, "loss": 0.610977828502655, "loss_ce": 0.02009648270905018, "loss_xval": 0.58984375, "num_input_tokens_seen": 214668120, "step": 1242 }, { "epoch": 0.4968025579536371, "grad_norm": 61.870825774278416, "learning_rate": 5e-06, "loss": 0.5893, "num_input_tokens_seen": 214841232, "step": 1243 }, { "epoch": 0.4968025579536371, "loss": 0.7896366715431213, "loss_ce": 0.028238333761692047, "loss_xval": 0.76171875, "num_input_tokens_seen": 214841232, "step": 1243 }, { "epoch": 0.49720223820943243, "grad_norm": 41.17697077521213, "learning_rate": 5e-06, "loss": 0.5542, "num_input_tokens_seen": 215014424, "step": 1244 }, { "epoch": 0.49720223820943243, "loss": 0.680115818977356, "loss_ce": 0.009522556327283382, "loss_xval": 0.671875, "num_input_tokens_seen": 215014424, "step": 1244 }, { "epoch": 0.49760191846522783, "grad_norm": 76.82017942798518, "learning_rate": 5e-06, "loss": 0.6606, "num_input_tokens_seen": 215187240, "step": 1245 }, { "epoch": 0.49760191846522783, "loss": 0.577049732208252, "loss_ce": 0.02297259122133255, "loss_xval": 0.5546875, "num_input_tokens_seen": 215187240, "step": 1245 }, { "epoch": 0.4980015987210232, "grad_norm": 49.35990869457503, "learning_rate": 5e-06, "loss": 0.4436, "num_input_tokens_seen": 215356856, "step": 1246 }, { "epoch": 0.4980015987210232, "loss": 0.52464359998703, "loss_ce": 0.01289433240890503, "loss_xval": 0.51171875, "num_input_tokens_seen": 215356856, "step": 1246 }, { "epoch": 0.4984012789768185, "grad_norm": 30.431804132343178, "learning_rate": 5e-06, "loss": 0.4152, "num_input_tokens_seen": 215529512, "step": 1247 }, { "epoch": 0.4984012789768185, "loss": 0.39520263671875, "loss_ce": 0.004180910065770149, "loss_xval": 0.390625, "num_input_tokens_seen": 215529512, "step": 1247 }, { "epoch": 0.4988009592326139, "grad_norm": 79.43813542549815, "learning_rate": 5e-06, "loss": 0.7606, "num_input_tokens_seen": 215702560, "step": 1248 }, { "epoch": 0.4988009592326139, "loss": 0.8635843396186829, "loss_ce": 0.006284565664827824, "loss_xval": 0.85546875, "num_input_tokens_seen": 215702560, "step": 1248 }, { "epoch": 0.49920063948840926, "grad_norm": 46.9042618380455, "learning_rate": 5e-06, "loss": 0.5143, "num_input_tokens_seen": 215875760, "step": 1249 }, { "epoch": 0.49920063948840926, "loss": 0.5450088381767273, "loss_ce": 0.0034438944421708584, "loss_xval": 0.54296875, "num_input_tokens_seen": 215875760, "step": 1249 }, { "epoch": 0.49960031974420466, "grad_norm": 44.95576452572528, "learning_rate": 5e-06, "loss": 0.8945, "num_input_tokens_seen": 216048592, "step": 1250 }, { "epoch": 0.49960031974420466, "eval_websight_new_IoU": 0.46755318343639374, "eval_websight_new_MAE_all": 0.01638866774737835, "eval_websight_new_MAE_h": 0.004519310197792947, "eval_websight_new_MAE_w": 0.030832246877253056, "eval_websight_new_MAE_x": 0.014988915994763374, "eval_websight_new_MAE_y": 0.015214197337627411, "eval_websight_new_NUM_probability": 0.9706818461418152, "eval_websight_new_inside_bbox": 0.7760416567325592, "eval_websight_new_loss": 0.11890730261802673, "eval_websight_new_loss_ce": 0.0031176727497950196, "eval_websight_new_loss_xval": 0.097412109375, "eval_websight_new_runtime": 56.3167, "eval_websight_new_samples_per_second": 0.888, "eval_websight_new_steps_per_second": 0.036, "num_input_tokens_seen": 216048592, "step": 1250 }, { "epoch": 0.49960031974420466, "eval_seeclick_IoU": 0.2279941290616989, "eval_seeclick_MAE_all": 0.07300104945898056, "eval_seeclick_MAE_h": 0.023876951076090336, "eval_seeclick_MAE_w": 0.0947648361325264, "eval_seeclick_MAE_x": 0.10035844147205353, "eval_seeclick_MAE_y": 0.0730039793998003, "eval_seeclick_NUM_probability": 0.9678144454956055, "eval_seeclick_inside_bbox": 0.4288194477558136, "eval_seeclick_loss": 1.8150830268859863, "eval_seeclick_loss_ce": 0.013575777411460876, "eval_seeclick_loss_xval": 1.76678466796875, "eval_seeclick_runtime": 81.5951, "eval_seeclick_samples_per_second": 0.613, "eval_seeclick_steps_per_second": 0.025, "num_input_tokens_seen": 216048592, "step": 1250 }, { "epoch": 0.49960031974420466, "eval_icons_IoU": 0.1491006501019001, "eval_icons_MAE_all": 0.023428103420883417, "eval_icons_MAE_h": 0.009299044031649828, "eval_icons_MAE_w": 0.006808809470385313, "eval_icons_MAE_x": 0.0501435212790966, "eval_icons_MAE_y": 0.0274610361084342, "eval_icons_NUM_probability": 0.9710031449794769, "eval_icons_inside_bbox": 0.2708333358168602, "eval_icons_loss": 0.2193080484867096, "eval_icons_loss_ce": 0.0031963232904672623, "eval_icons_loss_xval": 0.18771743774414062, "eval_icons_runtime": 86.547, "eval_icons_samples_per_second": 0.578, "eval_icons_steps_per_second": 0.023, "num_input_tokens_seen": 216048592, "step": 1250 }, { "epoch": 0.49960031974420466, "loss": 0.33800265192985535, "loss_ce": 0.003209566930308938, "loss_xval": 0.333984375, "num_input_tokens_seen": 216048592, "step": 1250 }, { "epoch": 0.5, "grad_norm": 56.2359235987798, "learning_rate": 5e-06, "loss": 0.9024, "num_input_tokens_seen": 216221720, "step": 1251 }, { "epoch": 0.5, "loss": 0.8478096723556519, "loss_ce": 0.007721788249909878, "loss_xval": 0.83984375, "num_input_tokens_seen": 216221720, "step": 1251 }, { "epoch": 0.5003996802557954, "grad_norm": 25.41910553147172, "learning_rate": 5e-06, "loss": 0.5755, "num_input_tokens_seen": 216394976, "step": 1252 }, { "epoch": 0.5003996802557954, "loss": 0.5523375272750854, "loss_ce": 0.006713734474033117, "loss_xval": 0.546875, "num_input_tokens_seen": 216394976, "step": 1252 }, { "epoch": 0.5007993605115907, "grad_norm": 32.41052618484067, "learning_rate": 5e-06, "loss": 0.7387, "num_input_tokens_seen": 216567976, "step": 1253 }, { "epoch": 0.5007993605115907, "loss": 0.8942447900772095, "loss_ce": 0.009235035628080368, "loss_xval": 0.88671875, "num_input_tokens_seen": 216567976, "step": 1253 }, { "epoch": 0.5011990407673861, "grad_norm": 47.47149033729788, "learning_rate": 5e-06, "loss": 0.5451, "num_input_tokens_seen": 216740976, "step": 1254 }, { "epoch": 0.5011990407673861, "loss": 0.34523525834083557, "loss_ce": 0.006917405407875776, "loss_xval": 0.337890625, "num_input_tokens_seen": 216740976, "step": 1254 }, { "epoch": 0.5015987210231815, "grad_norm": 33.76916038849005, "learning_rate": 5e-06, "loss": 0.462, "num_input_tokens_seen": 216913768, "step": 1255 }, { "epoch": 0.5015987210231815, "loss": 0.37124156951904297, "loss_ce": 0.002772345207631588, "loss_xval": 0.369140625, "num_input_tokens_seen": 216913768, "step": 1255 }, { "epoch": 0.5019984012789768, "grad_norm": 38.361290403427155, "learning_rate": 5e-06, "loss": 0.4189, "num_input_tokens_seen": 217086576, "step": 1256 }, { "epoch": 0.5019984012789768, "loss": 0.36219191551208496, "loss_ce": 0.00684523768723011, "loss_xval": 0.35546875, "num_input_tokens_seen": 217086576, "step": 1256 }, { "epoch": 0.5023980815347722, "grad_norm": 39.2253317331318, "learning_rate": 5e-06, "loss": 0.4246, "num_input_tokens_seen": 217259736, "step": 1257 }, { "epoch": 0.5023980815347722, "loss": 0.5401076078414917, "loss_ce": 0.010536082088947296, "loss_xval": 0.53125, "num_input_tokens_seen": 217259736, "step": 1257 }, { "epoch": 0.5027977617905676, "grad_norm": 32.08576453335854, "learning_rate": 5e-06, "loss": 0.6911, "num_input_tokens_seen": 217432720, "step": 1258 }, { "epoch": 0.5027977617905676, "loss": 0.46980804204940796, "loss_ce": 0.004964273888617754, "loss_xval": 0.46484375, "num_input_tokens_seen": 217432720, "step": 1258 }, { "epoch": 0.503197442046363, "grad_norm": 52.40906144418351, "learning_rate": 5e-06, "loss": 0.4735, "num_input_tokens_seen": 217605680, "step": 1259 }, { "epoch": 0.503197442046363, "loss": 0.3017037510871887, "loss_ce": 0.009772591292858124, "loss_xval": 0.291015625, "num_input_tokens_seen": 217605680, "step": 1259 }, { "epoch": 0.5035971223021583, "grad_norm": 27.32132146628816, "learning_rate": 5e-06, "loss": 0.5435, "num_input_tokens_seen": 217778576, "step": 1260 }, { "epoch": 0.5035971223021583, "loss": 0.30970054864883423, "loss_ce": 0.009041349403560162, "loss_xval": 0.30078125, "num_input_tokens_seen": 217778576, "step": 1260 }, { "epoch": 0.5039968025579536, "grad_norm": 56.69177348549417, "learning_rate": 5e-06, "loss": 0.4164, "num_input_tokens_seen": 217951296, "step": 1261 }, { "epoch": 0.5039968025579536, "loss": 0.32917362451553345, "loss_ce": 0.0054126461036503315, "loss_xval": 0.32421875, "num_input_tokens_seen": 217951296, "step": 1261 }, { "epoch": 0.504396482813749, "grad_norm": 33.190618492693645, "learning_rate": 5e-06, "loss": 0.5033, "num_input_tokens_seen": 218124328, "step": 1262 }, { "epoch": 0.504396482813749, "loss": 0.45435625314712524, "loss_ce": 0.0076399631798267365, "loss_xval": 0.447265625, "num_input_tokens_seen": 218124328, "step": 1262 }, { "epoch": 0.5047961630695443, "grad_norm": 108.08898692175731, "learning_rate": 5e-06, "loss": 0.4897, "num_input_tokens_seen": 218296984, "step": 1263 }, { "epoch": 0.5047961630695443, "loss": 0.45390087366104126, "loss_ce": 0.00871045421808958, "loss_xval": 0.4453125, "num_input_tokens_seen": 218296984, "step": 1263 }, { "epoch": 0.5051958433253397, "grad_norm": 21.718718812429533, "learning_rate": 5e-06, "loss": 0.2889, "num_input_tokens_seen": 218469976, "step": 1264 }, { "epoch": 0.5051958433253397, "loss": 0.2596575617790222, "loss_ce": 0.009230328723788261, "loss_xval": 0.25, "num_input_tokens_seen": 218469976, "step": 1264 }, { "epoch": 0.5055955235811351, "grad_norm": 37.47124945247608, "learning_rate": 5e-06, "loss": 0.6292, "num_input_tokens_seen": 218639192, "step": 1265 }, { "epoch": 0.5055955235811351, "loss": 0.6127042770385742, "loss_ce": 0.003237716155126691, "loss_xval": 0.609375, "num_input_tokens_seen": 218639192, "step": 1265 }, { "epoch": 0.5059952038369304, "grad_norm": 61.16539980854701, "learning_rate": 5e-06, "loss": 0.7032, "num_input_tokens_seen": 218812232, "step": 1266 }, { "epoch": 0.5059952038369304, "loss": 0.6988984942436218, "loss_ce": 0.00425736466422677, "loss_xval": 0.6953125, "num_input_tokens_seen": 218812232, "step": 1266 }, { "epoch": 0.5063948840927258, "grad_norm": 78.5806516668246, "learning_rate": 5e-06, "loss": 0.3494, "num_input_tokens_seen": 218985632, "step": 1267 }, { "epoch": 0.5063948840927258, "loss": 0.35586732625961304, "loss_ce": 0.007844888605177402, "loss_xval": 0.34765625, "num_input_tokens_seen": 218985632, "step": 1267 }, { "epoch": 0.5067945643485212, "grad_norm": 59.790638905805274, "learning_rate": 5e-06, "loss": 0.6626, "num_input_tokens_seen": 219159072, "step": 1268 }, { "epoch": 0.5067945643485212, "loss": 0.6986931562423706, "loss_ce": 0.0025871843099594116, "loss_xval": 0.6953125, "num_input_tokens_seen": 219159072, "step": 1268 }, { "epoch": 0.5071942446043165, "grad_norm": 62.29636011421407, "learning_rate": 5e-06, "loss": 0.487, "num_input_tokens_seen": 219332328, "step": 1269 }, { "epoch": 0.5071942446043165, "loss": 0.49609100818634033, "loss_ce": 0.0033542001619935036, "loss_xval": 0.4921875, "num_input_tokens_seen": 219332328, "step": 1269 }, { "epoch": 0.5075939248601119, "grad_norm": 69.1067299073295, "learning_rate": 5e-06, "loss": 0.4005, "num_input_tokens_seen": 219505336, "step": 1270 }, { "epoch": 0.5075939248601119, "loss": 0.40173617005348206, "loss_ce": 0.006960791535675526, "loss_xval": 0.39453125, "num_input_tokens_seen": 219505336, "step": 1270 }, { "epoch": 0.5079936051159073, "grad_norm": 46.18987949039075, "learning_rate": 5e-06, "loss": 0.6363, "num_input_tokens_seen": 219678016, "step": 1271 }, { "epoch": 0.5079936051159073, "loss": 0.8009523749351501, "loss_ce": 0.003787450725212693, "loss_xval": 0.796875, "num_input_tokens_seen": 219678016, "step": 1271 }, { "epoch": 0.5083932853717026, "grad_norm": 80.61756373894711, "learning_rate": 5e-06, "loss": 0.5925, "num_input_tokens_seen": 219851080, "step": 1272 }, { "epoch": 0.5083932853717026, "loss": 0.4208008050918579, "loss_ce": 0.00719609297811985, "loss_xval": 0.4140625, "num_input_tokens_seen": 219851080, "step": 1272 }, { "epoch": 0.508792965627498, "grad_norm": 24.109669016327093, "learning_rate": 5e-06, "loss": 0.3739, "num_input_tokens_seen": 220024640, "step": 1273 }, { "epoch": 0.508792965627498, "loss": 0.387967050075531, "loss_ce": 0.005032491870224476, "loss_xval": 0.3828125, "num_input_tokens_seen": 220024640, "step": 1273 }, { "epoch": 0.5091926458832934, "grad_norm": 45.963819514467495, "learning_rate": 5e-06, "loss": 0.4557, "num_input_tokens_seen": 220197848, "step": 1274 }, { "epoch": 0.5091926458832934, "loss": 0.29403769969940186, "loss_ce": 0.008789882063865662, "loss_xval": 0.28515625, "num_input_tokens_seen": 220197848, "step": 1274 }, { "epoch": 0.5095923261390888, "grad_norm": 54.94754414475741, "learning_rate": 5e-06, "loss": 0.6258, "num_input_tokens_seen": 220370784, "step": 1275 }, { "epoch": 0.5095923261390888, "loss": 0.6867777109146118, "loss_ce": 0.009470607154071331, "loss_xval": 0.67578125, "num_input_tokens_seen": 220370784, "step": 1275 }, { "epoch": 0.5099920063948841, "grad_norm": 37.56208223998764, "learning_rate": 5e-06, "loss": 0.7402, "num_input_tokens_seen": 220543944, "step": 1276 }, { "epoch": 0.5099920063948841, "loss": 0.28449493646621704, "loss_ce": 0.0019937213510274887, "loss_xval": 0.283203125, "num_input_tokens_seen": 220543944, "step": 1276 }, { "epoch": 0.5103916866506795, "grad_norm": 86.38962127574528, "learning_rate": 5e-06, "loss": 0.2525, "num_input_tokens_seen": 220716584, "step": 1277 }, { "epoch": 0.5103916866506795, "loss": 0.36348748207092285, "loss_ce": 0.004051441326737404, "loss_xval": 0.359375, "num_input_tokens_seen": 220716584, "step": 1277 }, { "epoch": 0.5107913669064749, "grad_norm": 41.557195679205826, "learning_rate": 5e-06, "loss": 0.4988, "num_input_tokens_seen": 220889464, "step": 1278 }, { "epoch": 0.5107913669064749, "loss": 0.4272310435771942, "loss_ce": 0.0021822056733071804, "loss_xval": 0.42578125, "num_input_tokens_seen": 220889464, "step": 1278 }, { "epoch": 0.5111910471622702, "grad_norm": 72.04043333354636, "learning_rate": 5e-06, "loss": 0.5884, "num_input_tokens_seen": 221061952, "step": 1279 }, { "epoch": 0.5111910471622702, "loss": 0.7456511855125427, "loss_ce": 0.004440242424607277, "loss_xval": 0.7421875, "num_input_tokens_seen": 221061952, "step": 1279 }, { "epoch": 0.5115907274180655, "grad_norm": 54.40269586756929, "learning_rate": 5e-06, "loss": 0.5129, "num_input_tokens_seen": 221234880, "step": 1280 }, { "epoch": 0.5115907274180655, "loss": 0.6264010071754456, "loss_ce": 0.0085421372205019, "loss_xval": 0.6171875, "num_input_tokens_seen": 221234880, "step": 1280 }, { "epoch": 0.511990407673861, "grad_norm": 60.383568725505285, "learning_rate": 5e-06, "loss": 0.9545, "num_input_tokens_seen": 221408088, "step": 1281 }, { "epoch": 0.511990407673861, "loss": 0.9138003587722778, "loss_ce": 0.009747644886374474, "loss_xval": 0.90234375, "num_input_tokens_seen": 221408088, "step": 1281 }, { "epoch": 0.5123900879296562, "grad_norm": 24.245343297674463, "learning_rate": 5e-06, "loss": 0.4102, "num_input_tokens_seen": 221581192, "step": 1282 }, { "epoch": 0.5123900879296562, "loss": 0.2816739082336426, "loss_ce": 0.0051236217841506, "loss_xval": 0.27734375, "num_input_tokens_seen": 221581192, "step": 1282 }, { "epoch": 0.5127897681854516, "grad_norm": 36.034373770787454, "learning_rate": 5e-06, "loss": 0.4183, "num_input_tokens_seen": 221753984, "step": 1283 }, { "epoch": 0.5127897681854516, "loss": 0.28008684515953064, "loss_ce": 0.006466236896812916, "loss_xval": 0.2734375, "num_input_tokens_seen": 221753984, "step": 1283 }, { "epoch": 0.513189448441247, "grad_norm": 33.97082506216034, "learning_rate": 5e-06, "loss": 0.7656, "num_input_tokens_seen": 221927128, "step": 1284 }, { "epoch": 0.513189448441247, "loss": 0.9152973890304565, "loss_ce": 0.005415809340775013, "loss_xval": 0.91015625, "num_input_tokens_seen": 221927128, "step": 1284 }, { "epoch": 0.5135891286970423, "grad_norm": 38.15398027342218, "learning_rate": 5e-06, "loss": 0.6912, "num_input_tokens_seen": 222100064, "step": 1285 }, { "epoch": 0.5135891286970423, "loss": 0.43050551414489746, "loss_ce": 0.0032594138756394386, "loss_xval": 0.427734375, "num_input_tokens_seen": 222100064, "step": 1285 }, { "epoch": 0.5139888089528377, "grad_norm": 17.77718576528173, "learning_rate": 5e-06, "loss": 0.4619, "num_input_tokens_seen": 222273144, "step": 1286 }, { "epoch": 0.5139888089528377, "loss": 0.502855122089386, "loss_ce": 0.007493783254176378, "loss_xval": 0.49609375, "num_input_tokens_seen": 222273144, "step": 1286 }, { "epoch": 0.5143884892086331, "grad_norm": 49.07971279565783, "learning_rate": 5e-06, "loss": 0.5318, "num_input_tokens_seen": 222446248, "step": 1287 }, { "epoch": 0.5143884892086331, "loss": 0.3896329402923584, "loss_ce": 0.01567053608596325, "loss_xval": 0.373046875, "num_input_tokens_seen": 222446248, "step": 1287 }, { "epoch": 0.5147881694644284, "grad_norm": 21.411674481901443, "learning_rate": 5e-06, "loss": 0.4787, "num_input_tokens_seen": 222619208, "step": 1288 }, { "epoch": 0.5147881694644284, "loss": 0.6500849723815918, "loss_ce": 0.01007033046334982, "loss_xval": 0.640625, "num_input_tokens_seen": 222619208, "step": 1288 }, { "epoch": 0.5151878497202238, "grad_norm": 69.27709990958668, "learning_rate": 5e-06, "loss": 0.6783, "num_input_tokens_seen": 222792208, "step": 1289 }, { "epoch": 0.5151878497202238, "loss": 0.657429575920105, "loss_ce": 0.0020950797479599714, "loss_xval": 0.65625, "num_input_tokens_seen": 222792208, "step": 1289 }, { "epoch": 0.5155875299760192, "grad_norm": 35.61428154048361, "learning_rate": 5e-06, "loss": 0.4603, "num_input_tokens_seen": 222965128, "step": 1290 }, { "epoch": 0.5155875299760192, "loss": 0.27936655282974243, "loss_ce": 0.0025415923446416855, "loss_xval": 0.27734375, "num_input_tokens_seen": 222965128, "step": 1290 }, { "epoch": 0.5159872102318146, "grad_norm": 74.58818630856017, "learning_rate": 5e-06, "loss": 0.5112, "num_input_tokens_seen": 223137904, "step": 1291 }, { "epoch": 0.5159872102318146, "loss": 0.5828518867492676, "loss_ce": 0.003628222271800041, "loss_xval": 0.578125, "num_input_tokens_seen": 223137904, "step": 1291 }, { "epoch": 0.5163868904876099, "grad_norm": 80.74060896190802, "learning_rate": 5e-06, "loss": 0.4543, "num_input_tokens_seen": 223310688, "step": 1292 }, { "epoch": 0.5163868904876099, "loss": 0.4423733055591583, "loss_ce": 0.004201945383101702, "loss_xval": 0.4375, "num_input_tokens_seen": 223310688, "step": 1292 }, { "epoch": 0.5167865707434053, "grad_norm": 84.13859190094642, "learning_rate": 5e-06, "loss": 0.3072, "num_input_tokens_seen": 223483768, "step": 1293 }, { "epoch": 0.5167865707434053, "loss": 0.27045226097106934, "loss_ce": 0.007879025302827358, "loss_xval": 0.26171875, "num_input_tokens_seen": 223483768, "step": 1293 }, { "epoch": 0.5171862509992007, "grad_norm": 67.13775547647433, "learning_rate": 5e-06, "loss": 0.4929, "num_input_tokens_seen": 223656944, "step": 1294 }, { "epoch": 0.5171862509992007, "loss": 0.6396459341049194, "loss_ce": 0.004178367555141449, "loss_xval": 0.63671875, "num_input_tokens_seen": 223656944, "step": 1294 }, { "epoch": 0.517585931254996, "grad_norm": 122.1235623959299, "learning_rate": 5e-06, "loss": 0.6337, "num_input_tokens_seen": 223829944, "step": 1295 }, { "epoch": 0.517585931254996, "loss": 0.5107072591781616, "loss_ce": 0.006068557035177946, "loss_xval": 0.50390625, "num_input_tokens_seen": 223829944, "step": 1295 }, { "epoch": 0.5179856115107914, "grad_norm": 23.06635214962515, "learning_rate": 5e-06, "loss": 0.451, "num_input_tokens_seen": 224002944, "step": 1296 }, { "epoch": 0.5179856115107914, "loss": 0.3327631950378418, "loss_ce": 0.0022272877395153046, "loss_xval": 0.330078125, "num_input_tokens_seen": 224002944, "step": 1296 }, { "epoch": 0.5183852917665868, "grad_norm": 72.82118466464544, "learning_rate": 5e-06, "loss": 0.3968, "num_input_tokens_seen": 224176144, "step": 1297 }, { "epoch": 0.5183852917665868, "loss": 0.18834558129310608, "loss_ce": 0.003409042488783598, "loss_xval": 0.1845703125, "num_input_tokens_seen": 224176144, "step": 1297 }, { "epoch": 0.518784972022382, "grad_norm": 22.046556325441585, "learning_rate": 5e-06, "loss": 0.6103, "num_input_tokens_seen": 224349112, "step": 1298 }, { "epoch": 0.518784972022382, "loss": 0.6086263656616211, "loss_ce": 0.003767983755096793, "loss_xval": 0.60546875, "num_input_tokens_seen": 224349112, "step": 1298 }, { "epoch": 0.5191846522781774, "grad_norm": 46.80144712333731, "learning_rate": 5e-06, "loss": 0.5122, "num_input_tokens_seen": 224522008, "step": 1299 }, { "epoch": 0.5191846522781774, "loss": 0.6673205494880676, "loss_ce": 0.00789671204984188, "loss_xval": 0.66015625, "num_input_tokens_seen": 224522008, "step": 1299 }, { "epoch": 0.5195843325339728, "grad_norm": 58.87556869060338, "learning_rate": 5e-06, "loss": 0.6155, "num_input_tokens_seen": 224694608, "step": 1300 }, { "epoch": 0.5195843325339728, "loss": 0.45236122608184814, "loss_ce": 0.004302144981920719, "loss_xval": 0.447265625, "num_input_tokens_seen": 224694608, "step": 1300 }, { "epoch": 0.5199840127897681, "grad_norm": 59.57025192167172, "learning_rate": 5e-06, "loss": 0.6062, "num_input_tokens_seen": 224867824, "step": 1301 }, { "epoch": 0.5199840127897681, "loss": 0.5515105128288269, "loss_ce": 0.007199006155133247, "loss_xval": 0.54296875, "num_input_tokens_seen": 224867824, "step": 1301 }, { "epoch": 0.5203836930455635, "grad_norm": 46.648119620488075, "learning_rate": 5e-06, "loss": 0.5472, "num_input_tokens_seen": 225040896, "step": 1302 }, { "epoch": 0.5203836930455635, "loss": 0.3536621034145355, "loss_ce": 0.0044799624010920525, "loss_xval": 0.349609375, "num_input_tokens_seen": 225040896, "step": 1302 }, { "epoch": 0.5207833733013589, "grad_norm": 21.096710126232633, "learning_rate": 5e-06, "loss": 0.3661, "num_input_tokens_seen": 225213776, "step": 1303 }, { "epoch": 0.5207833733013589, "loss": 0.47925370931625366, "loss_ce": 0.004583288915455341, "loss_xval": 0.474609375, "num_input_tokens_seen": 225213776, "step": 1303 }, { "epoch": 0.5211830535571543, "grad_norm": 45.00401125931439, "learning_rate": 5e-06, "loss": 0.4723, "num_input_tokens_seen": 225386776, "step": 1304 }, { "epoch": 0.5211830535571543, "loss": 0.3989310562610626, "loss_ce": 0.005376371555030346, "loss_xval": 0.39453125, "num_input_tokens_seen": 225386776, "step": 1304 }, { "epoch": 0.5215827338129496, "grad_norm": 47.83089787844274, "learning_rate": 5e-06, "loss": 0.4133, "num_input_tokens_seen": 225559840, "step": 1305 }, { "epoch": 0.5215827338129496, "loss": 0.2589433789253235, "loss_ce": 0.0025346819311380386, "loss_xval": 0.255859375, "num_input_tokens_seen": 225559840, "step": 1305 }, { "epoch": 0.521982414068745, "grad_norm": 76.45941526870152, "learning_rate": 5e-06, "loss": 0.4183, "num_input_tokens_seen": 225732832, "step": 1306 }, { "epoch": 0.521982414068745, "loss": 0.35908687114715576, "loss_ce": 0.005754332058131695, "loss_xval": 0.353515625, "num_input_tokens_seen": 225732832, "step": 1306 }, { "epoch": 0.5223820943245404, "grad_norm": 28.346923450799792, "learning_rate": 5e-06, "loss": 0.2043, "num_input_tokens_seen": 225905584, "step": 1307 }, { "epoch": 0.5223820943245404, "loss": 0.30641642212867737, "loss_ce": 0.004780675284564495, "loss_xval": 0.30078125, "num_input_tokens_seen": 225905584, "step": 1307 }, { "epoch": 0.5227817745803357, "grad_norm": 52.38544353874319, "learning_rate": 5e-06, "loss": 0.3302, "num_input_tokens_seen": 226075200, "step": 1308 }, { "epoch": 0.5227817745803357, "loss": 0.39421164989471436, "loss_ce": 0.005997546017169952, "loss_xval": 0.388671875, "num_input_tokens_seen": 226075200, "step": 1308 }, { "epoch": 0.5231814548361311, "grad_norm": 30.047970923730745, "learning_rate": 5e-06, "loss": 0.362, "num_input_tokens_seen": 226248032, "step": 1309 }, { "epoch": 0.5231814548361311, "loss": 0.17206686735153198, "loss_ce": 0.002251807600259781, "loss_xval": 0.169921875, "num_input_tokens_seen": 226248032, "step": 1309 }, { "epoch": 0.5235811350919265, "grad_norm": 21.06650577505968, "learning_rate": 5e-06, "loss": 0.3305, "num_input_tokens_seen": 226420696, "step": 1310 }, { "epoch": 0.5235811350919265, "loss": 0.42764365673065186, "loss_ce": 0.01628197729587555, "loss_xval": 0.412109375, "num_input_tokens_seen": 226420696, "step": 1310 }, { "epoch": 0.5239808153477218, "grad_norm": 54.162401748623466, "learning_rate": 5e-06, "loss": 0.3938, "num_input_tokens_seen": 226593672, "step": 1311 }, { "epoch": 0.5239808153477218, "loss": 0.16174665093421936, "loss_ce": 0.0044895680621266365, "loss_xval": 0.1572265625, "num_input_tokens_seen": 226593672, "step": 1311 }, { "epoch": 0.5243804956035172, "grad_norm": 25.92578935552127, "learning_rate": 5e-06, "loss": 0.7577, "num_input_tokens_seen": 226766864, "step": 1312 }, { "epoch": 0.5243804956035172, "loss": 0.7232966423034668, "loss_ce": 0.0022577994968742132, "loss_xval": 0.72265625, "num_input_tokens_seen": 226766864, "step": 1312 }, { "epoch": 0.5247801758593126, "grad_norm": 30.005024485668603, "learning_rate": 5e-06, "loss": 0.497, "num_input_tokens_seen": 226939656, "step": 1313 }, { "epoch": 0.5247801758593126, "loss": 0.4305632710456848, "loss_ce": 0.0025236960500478745, "loss_xval": 0.427734375, "num_input_tokens_seen": 226939656, "step": 1313 }, { "epoch": 0.5251798561151079, "grad_norm": 42.69744397870593, "learning_rate": 5e-06, "loss": 0.4521, "num_input_tokens_seen": 227112800, "step": 1314 }, { "epoch": 0.5251798561151079, "loss": 0.30210599303245544, "loss_ce": 0.0036135392729192972, "loss_xval": 0.298828125, "num_input_tokens_seen": 227112800, "step": 1314 }, { "epoch": 0.5255795363709033, "grad_norm": 15.197513540024648, "learning_rate": 5e-06, "loss": 0.3364, "num_input_tokens_seen": 227285992, "step": 1315 }, { "epoch": 0.5255795363709033, "loss": 0.37036991119384766, "loss_ce": 0.005036352667957544, "loss_xval": 0.365234375, "num_input_tokens_seen": 227285992, "step": 1315 }, { "epoch": 0.5259792166266987, "grad_norm": 59.42411346211391, "learning_rate": 5e-06, "loss": 0.5199, "num_input_tokens_seen": 227458960, "step": 1316 }, { "epoch": 0.5259792166266987, "loss": 0.28569337725639343, "loss_ce": 0.004321303218603134, "loss_xval": 0.28125, "num_input_tokens_seen": 227458960, "step": 1316 }, { "epoch": 0.526378896882494, "grad_norm": 69.21561612485712, "learning_rate": 5e-06, "loss": 0.491, "num_input_tokens_seen": 227632200, "step": 1317 }, { "epoch": 0.526378896882494, "loss": 0.2640664279460907, "loss_ce": 0.003949846141040325, "loss_xval": 0.259765625, "num_input_tokens_seen": 227632200, "step": 1317 }, { "epoch": 0.5267785771382894, "grad_norm": 13.132135435903631, "learning_rate": 5e-06, "loss": 0.2884, "num_input_tokens_seen": 227805296, "step": 1318 }, { "epoch": 0.5267785771382894, "loss": 0.28863468766212463, "loss_ce": 0.006469154264777899, "loss_xval": 0.28125, "num_input_tokens_seen": 227805296, "step": 1318 }, { "epoch": 0.5271782573940847, "grad_norm": 57.694864530698325, "learning_rate": 5e-06, "loss": 0.5443, "num_input_tokens_seen": 227978120, "step": 1319 }, { "epoch": 0.5271782573940847, "loss": 0.5344159007072449, "loss_ce": 0.006644911132752895, "loss_xval": 0.52734375, "num_input_tokens_seen": 227978120, "step": 1319 }, { "epoch": 0.5275779376498801, "grad_norm": 37.97773385737496, "learning_rate": 5e-06, "loss": 0.353, "num_input_tokens_seen": 228151056, "step": 1320 }, { "epoch": 0.5275779376498801, "loss": 0.3334610164165497, "loss_ce": 0.006907662842422724, "loss_xval": 0.326171875, "num_input_tokens_seen": 228151056, "step": 1320 }, { "epoch": 0.5279776179056754, "grad_norm": 23.410095734672666, "learning_rate": 5e-06, "loss": 0.6552, "num_input_tokens_seen": 228323968, "step": 1321 }, { "epoch": 0.5279776179056754, "loss": 0.579754114151001, "loss_ce": 0.005108083598315716, "loss_xval": 0.57421875, "num_input_tokens_seen": 228323968, "step": 1321 }, { "epoch": 0.5283772981614708, "grad_norm": 45.88258781654939, "learning_rate": 5e-06, "loss": 0.7877, "num_input_tokens_seen": 228497032, "step": 1322 }, { "epoch": 0.5283772981614708, "loss": 0.7026103734970093, "loss_ce": 0.005100608803331852, "loss_xval": 0.69921875, "num_input_tokens_seen": 228497032, "step": 1322 }, { "epoch": 0.5287769784172662, "grad_norm": 20.4423042983933, "learning_rate": 5e-06, "loss": 0.4286, "num_input_tokens_seen": 228670088, "step": 1323 }, { "epoch": 0.5287769784172662, "loss": 0.3816264271736145, "loss_ce": 0.014408385381102562, "loss_xval": 0.3671875, "num_input_tokens_seen": 228670088, "step": 1323 }, { "epoch": 0.5291766586730615, "grad_norm": 51.63250028305346, "learning_rate": 5e-06, "loss": 0.2495, "num_input_tokens_seen": 228843096, "step": 1324 }, { "epoch": 0.5291766586730615, "loss": 0.3274524211883545, "loss_ce": 0.005583534948527813, "loss_xval": 0.322265625, "num_input_tokens_seen": 228843096, "step": 1324 }, { "epoch": 0.5295763389288569, "grad_norm": 24.608922162922916, "learning_rate": 5e-06, "loss": 0.3274, "num_input_tokens_seen": 229016144, "step": 1325 }, { "epoch": 0.5295763389288569, "loss": 0.486013799905777, "loss_ce": 0.0058578504249453545, "loss_xval": 0.48046875, "num_input_tokens_seen": 229016144, "step": 1325 }, { "epoch": 0.5299760191846523, "grad_norm": 61.57937595911081, "learning_rate": 5e-06, "loss": 0.312, "num_input_tokens_seen": 229188832, "step": 1326 }, { "epoch": 0.5299760191846523, "loss": 0.3464367985725403, "loss_ce": 0.007767999544739723, "loss_xval": 0.337890625, "num_input_tokens_seen": 229188832, "step": 1326 }, { "epoch": 0.5303756994404476, "grad_norm": 44.936253935063796, "learning_rate": 5e-06, "loss": 0.8079, "num_input_tokens_seen": 229361504, "step": 1327 }, { "epoch": 0.5303756994404476, "loss": 1.1617729663848877, "loss_ce": 0.0061028143391013145, "loss_xval": 1.15625, "num_input_tokens_seen": 229361504, "step": 1327 }, { "epoch": 0.530775379696243, "grad_norm": 57.45812813973755, "learning_rate": 5e-06, "loss": 0.447, "num_input_tokens_seen": 229534336, "step": 1328 }, { "epoch": 0.530775379696243, "loss": 0.42143499851226807, "loss_ce": 0.005541440099477768, "loss_xval": 0.416015625, "num_input_tokens_seen": 229534336, "step": 1328 }, { "epoch": 0.5311750599520384, "grad_norm": 95.69310855679585, "learning_rate": 5e-06, "loss": 0.6078, "num_input_tokens_seen": 229707488, "step": 1329 }, { "epoch": 0.5311750599520384, "loss": 0.42394664883613586, "loss_ce": 0.0022547650150954723, "loss_xval": 0.421875, "num_input_tokens_seen": 229707488, "step": 1329 }, { "epoch": 0.5315747402078337, "grad_norm": 55.23809125724533, "learning_rate": 5e-06, "loss": 0.9022, "num_input_tokens_seen": 229880592, "step": 1330 }, { "epoch": 0.5315747402078337, "loss": 1.1072825193405151, "loss_ce": 0.010541743598878384, "loss_xval": 1.09375, "num_input_tokens_seen": 229880592, "step": 1330 }, { "epoch": 0.5319744204636291, "grad_norm": 107.65601955216079, "learning_rate": 5e-06, "loss": 0.5116, "num_input_tokens_seen": 230053800, "step": 1331 }, { "epoch": 0.5319744204636291, "loss": 0.4177815020084381, "loss_ce": 0.0035969249438494444, "loss_xval": 0.4140625, "num_input_tokens_seen": 230053800, "step": 1331 }, { "epoch": 0.5323741007194245, "grad_norm": 47.508790422209, "learning_rate": 5e-06, "loss": 0.4268, "num_input_tokens_seen": 230226896, "step": 1332 }, { "epoch": 0.5323741007194245, "loss": 0.4874696731567383, "loss_ce": 0.003766059409826994, "loss_xval": 0.484375, "num_input_tokens_seen": 230226896, "step": 1332 }, { "epoch": 0.5327737809752199, "grad_norm": 44.833401746884746, "learning_rate": 5e-06, "loss": 0.2839, "num_input_tokens_seen": 230399952, "step": 1333 }, { "epoch": 0.5327737809752199, "loss": 0.21195606887340546, "loss_ce": 0.003459974192082882, "loss_xval": 0.208984375, "num_input_tokens_seen": 230399952, "step": 1333 }, { "epoch": 0.5331734612310152, "grad_norm": 22.979639420354616, "learning_rate": 5e-06, "loss": 0.5363, "num_input_tokens_seen": 230573048, "step": 1334 }, { "epoch": 0.5331734612310152, "loss": 0.5578432083129883, "loss_ce": 0.015240712091326714, "loss_xval": 0.54296875, "num_input_tokens_seen": 230573048, "step": 1334 }, { "epoch": 0.5335731414868106, "grad_norm": 38.390194365787885, "learning_rate": 5e-06, "loss": 0.4866, "num_input_tokens_seen": 230746344, "step": 1335 }, { "epoch": 0.5335731414868106, "loss": 0.5222955942153931, "loss_ce": 0.003130522556602955, "loss_xval": 0.51953125, "num_input_tokens_seen": 230746344, "step": 1335 }, { "epoch": 0.533972821742606, "grad_norm": 62.630756568579976, "learning_rate": 5e-06, "loss": 0.4862, "num_input_tokens_seen": 230919112, "step": 1336 }, { "epoch": 0.533972821742606, "loss": 0.20648841559886932, "loss_ce": 0.00534704327583313, "loss_xval": 0.201171875, "num_input_tokens_seen": 230919112, "step": 1336 }, { "epoch": 0.5343725019984013, "grad_norm": 54.98815999554934, "learning_rate": 5e-06, "loss": 0.357, "num_input_tokens_seen": 231092360, "step": 1337 }, { "epoch": 0.5343725019984013, "loss": 0.24189867079257965, "loss_ce": 0.0023204381577670574, "loss_xval": 0.2392578125, "num_input_tokens_seen": 231092360, "step": 1337 }, { "epoch": 0.5347721822541966, "grad_norm": 61.52109754634168, "learning_rate": 5e-06, "loss": 0.5032, "num_input_tokens_seen": 231265440, "step": 1338 }, { "epoch": 0.5347721822541966, "loss": 0.5029772520065308, "loss_ce": 0.0066393520683050156, "loss_xval": 0.49609375, "num_input_tokens_seen": 231265440, "step": 1338 }, { "epoch": 0.535171862509992, "grad_norm": 36.05634221729635, "learning_rate": 5e-06, "loss": 0.6178, "num_input_tokens_seen": 231438576, "step": 1339 }, { "epoch": 0.535171862509992, "loss": 0.7768712639808655, "loss_ce": 0.002121466211974621, "loss_xval": 0.7734375, "num_input_tokens_seen": 231438576, "step": 1339 }, { "epoch": 0.5355715427657873, "grad_norm": 64.96571340372034, "learning_rate": 5e-06, "loss": 0.9467, "num_input_tokens_seen": 231611296, "step": 1340 }, { "epoch": 0.5355715427657873, "loss": 1.1156829595565796, "loss_ce": 0.00386651698499918, "loss_xval": 1.109375, "num_input_tokens_seen": 231611296, "step": 1340 }, { "epoch": 0.5359712230215827, "grad_norm": 84.07508123457501, "learning_rate": 5e-06, "loss": 0.4938, "num_input_tokens_seen": 231783904, "step": 1341 }, { "epoch": 0.5359712230215827, "loss": 0.5198108553886414, "loss_ce": 0.0021717222407460213, "loss_xval": 0.51953125, "num_input_tokens_seen": 231783904, "step": 1341 }, { "epoch": 0.5363709032773781, "grad_norm": 57.64533848978117, "learning_rate": 5e-06, "loss": 0.3062, "num_input_tokens_seen": 231956376, "step": 1342 }, { "epoch": 0.5363709032773781, "loss": 0.36183974146842957, "loss_ce": 0.00447891466319561, "loss_xval": 0.357421875, "num_input_tokens_seen": 231956376, "step": 1342 }, { "epoch": 0.5367705835331734, "grad_norm": 90.55074914704535, "learning_rate": 5e-06, "loss": 0.5969, "num_input_tokens_seen": 232125528, "step": 1343 }, { "epoch": 0.5367705835331734, "loss": 0.3362892270088196, "loss_ce": 0.0024879206903278828, "loss_xval": 0.333984375, "num_input_tokens_seen": 232125528, "step": 1343 }, { "epoch": 0.5371702637889688, "grad_norm": 12.81961083789221, "learning_rate": 5e-06, "loss": 0.2753, "num_input_tokens_seen": 232298760, "step": 1344 }, { "epoch": 0.5371702637889688, "loss": 0.2364044338464737, "loss_ce": 0.006698611192405224, "loss_xval": 0.2294921875, "num_input_tokens_seen": 232298760, "step": 1344 }, { "epoch": 0.5375699440447642, "grad_norm": 45.502407796394216, "learning_rate": 5e-06, "loss": 0.4094, "num_input_tokens_seen": 232468344, "step": 1345 }, { "epoch": 0.5375699440447642, "loss": 0.4168083667755127, "loss_ce": 0.002806881908327341, "loss_xval": 0.4140625, "num_input_tokens_seen": 232468344, "step": 1345 }, { "epoch": 0.5379696243005595, "grad_norm": 51.143260733380274, "learning_rate": 5e-06, "loss": 0.5264, "num_input_tokens_seen": 232641464, "step": 1346 }, { "epoch": 0.5379696243005595, "loss": 0.5115495324134827, "loss_ce": 0.002791010309010744, "loss_xval": 0.5078125, "num_input_tokens_seen": 232641464, "step": 1346 }, { "epoch": 0.5383693045563549, "grad_norm": 40.13135649023107, "learning_rate": 5e-06, "loss": 0.5977, "num_input_tokens_seen": 232814664, "step": 1347 }, { "epoch": 0.5383693045563549, "loss": 0.27177077531814575, "loss_ce": 0.006420427467674017, "loss_xval": 0.265625, "num_input_tokens_seen": 232814664, "step": 1347 }, { "epoch": 0.5387689848121503, "grad_norm": 96.30534320206739, "learning_rate": 5e-06, "loss": 0.6539, "num_input_tokens_seen": 232987248, "step": 1348 }, { "epoch": 0.5387689848121503, "loss": 0.4249582290649414, "loss_ce": 0.0066537680104374886, "loss_xval": 0.41796875, "num_input_tokens_seen": 232987248, "step": 1348 }, { "epoch": 0.5391686650679457, "grad_norm": 77.13815649979884, "learning_rate": 5e-06, "loss": 0.6195, "num_input_tokens_seen": 233160320, "step": 1349 }, { "epoch": 0.5391686650679457, "loss": 0.7565589547157288, "loss_ce": 0.0020728609524667263, "loss_xval": 0.75390625, "num_input_tokens_seen": 233160320, "step": 1349 }, { "epoch": 0.539568345323741, "grad_norm": 98.78007103206556, "learning_rate": 5e-06, "loss": 0.4941, "num_input_tokens_seen": 233328096, "step": 1350 }, { "epoch": 0.539568345323741, "loss": 0.6630009412765503, "loss_ce": 0.007361266296356916, "loss_xval": 0.65625, "num_input_tokens_seen": 233328096, "step": 1350 }, { "epoch": 0.5399680255795364, "grad_norm": 36.141697477602044, "learning_rate": 5e-06, "loss": 0.4463, "num_input_tokens_seen": 233501160, "step": 1351 }, { "epoch": 0.5399680255795364, "loss": 0.2609432339668274, "loss_ce": 0.0051449015736579895, "loss_xval": 0.255859375, "num_input_tokens_seen": 233501160, "step": 1351 }, { "epoch": 0.5403677058353318, "grad_norm": 116.14707098226158, "learning_rate": 5e-06, "loss": 0.7684, "num_input_tokens_seen": 233673680, "step": 1352 }, { "epoch": 0.5403677058353318, "loss": 0.9591152667999268, "loss_ce": 0.0037320067640393972, "loss_xval": 0.95703125, "num_input_tokens_seen": 233673680, "step": 1352 }, { "epoch": 0.5407673860911271, "grad_norm": 32.51684246973528, "learning_rate": 5e-06, "loss": 0.3522, "num_input_tokens_seen": 233846656, "step": 1353 }, { "epoch": 0.5407673860911271, "loss": 0.17604267597198486, "loss_ce": 0.00724994670599699, "loss_xval": 0.1689453125, "num_input_tokens_seen": 233846656, "step": 1353 }, { "epoch": 0.5411670663469225, "grad_norm": 95.46125369800038, "learning_rate": 5e-06, "loss": 0.4154, "num_input_tokens_seen": 234019816, "step": 1354 }, { "epoch": 0.5411670663469225, "loss": 0.5023761987686157, "loss_ce": 0.003169646020978689, "loss_xval": 0.5, "num_input_tokens_seen": 234019816, "step": 1354 }, { "epoch": 0.5415667466027179, "grad_norm": 39.52923561929836, "learning_rate": 5e-06, "loss": 0.3683, "num_input_tokens_seen": 234192592, "step": 1355 }, { "epoch": 0.5415667466027179, "loss": 0.23313309252262115, "loss_ce": 0.0046327258460223675, "loss_xval": 0.228515625, "num_input_tokens_seen": 234192592, "step": 1355 }, { "epoch": 0.5419664268585132, "grad_norm": 101.44654311999328, "learning_rate": 5e-06, "loss": 0.7023, "num_input_tokens_seen": 234365312, "step": 1356 }, { "epoch": 0.5419664268585132, "loss": 0.6109759211540222, "loss_ce": 0.0038592463824898005, "loss_xval": 0.60546875, "num_input_tokens_seen": 234365312, "step": 1356 }, { "epoch": 0.5423661071143085, "grad_norm": 68.28777793206135, "learning_rate": 5e-06, "loss": 0.4509, "num_input_tokens_seen": 234538280, "step": 1357 }, { "epoch": 0.5423661071143085, "loss": 0.4432651400566101, "loss_ce": 0.004788582678884268, "loss_xval": 0.4375, "num_input_tokens_seen": 234538280, "step": 1357 }, { "epoch": 0.542765787370104, "grad_norm": 80.01066832816397, "learning_rate": 5e-06, "loss": 0.5568, "num_input_tokens_seen": 234711248, "step": 1358 }, { "epoch": 0.542765787370104, "loss": 0.4560088813304901, "loss_ce": 0.008193954825401306, "loss_xval": 0.447265625, "num_input_tokens_seen": 234711248, "step": 1358 }, { "epoch": 0.5431654676258992, "grad_norm": 72.31855145193121, "learning_rate": 5e-06, "loss": 0.8891, "num_input_tokens_seen": 234884400, "step": 1359 }, { "epoch": 0.5431654676258992, "loss": 0.7763038873672485, "loss_ce": 0.0035377484746277332, "loss_xval": 0.7734375, "num_input_tokens_seen": 234884400, "step": 1359 }, { "epoch": 0.5435651478816946, "grad_norm": 43.434452462882426, "learning_rate": 5e-06, "loss": 0.4146, "num_input_tokens_seen": 235057136, "step": 1360 }, { "epoch": 0.5435651478816946, "loss": 0.5203279256820679, "loss_ce": 0.002963472157716751, "loss_xval": 0.515625, "num_input_tokens_seen": 235057136, "step": 1360 }, { "epoch": 0.54396482813749, "grad_norm": 118.75692026928306, "learning_rate": 5e-06, "loss": 0.4007, "num_input_tokens_seen": 235230312, "step": 1361 }, { "epoch": 0.54396482813749, "loss": 0.4342734217643738, "loss_ce": 0.0037924526259303093, "loss_xval": 0.4296875, "num_input_tokens_seen": 235230312, "step": 1361 }, { "epoch": 0.5443645083932853, "grad_norm": 49.47107440249342, "learning_rate": 5e-06, "loss": 0.7464, "num_input_tokens_seen": 235403296, "step": 1362 }, { "epoch": 0.5443645083932853, "loss": 0.6611525416374207, "loss_ce": 0.003864938160404563, "loss_xval": 0.65625, "num_input_tokens_seen": 235403296, "step": 1362 }, { "epoch": 0.5447641886490807, "grad_norm": 70.75775745041976, "learning_rate": 5e-06, "loss": 0.6533, "num_input_tokens_seen": 235576152, "step": 1363 }, { "epoch": 0.5447641886490807, "loss": 0.7079252600669861, "loss_ce": 0.003945786505937576, "loss_xval": 0.703125, "num_input_tokens_seen": 235576152, "step": 1363 }, { "epoch": 0.5451638689048761, "grad_norm": 70.38526419899421, "learning_rate": 5e-06, "loss": 0.4057, "num_input_tokens_seen": 235748992, "step": 1364 }, { "epoch": 0.5451638689048761, "loss": 0.6269418001174927, "loss_ce": 0.003559216158464551, "loss_xval": 0.625, "num_input_tokens_seen": 235748992, "step": 1364 }, { "epoch": 0.5455635491606715, "grad_norm": 31.366011027189284, "learning_rate": 5e-06, "loss": 0.3862, "num_input_tokens_seen": 235921912, "step": 1365 }, { "epoch": 0.5455635491606715, "loss": 0.4212551712989807, "loss_ce": 0.007284234277904034, "loss_xval": 0.4140625, "num_input_tokens_seen": 235921912, "step": 1365 }, { "epoch": 0.5459632294164668, "grad_norm": 105.28341921676295, "learning_rate": 5e-06, "loss": 1.079, "num_input_tokens_seen": 236094904, "step": 1366 }, { "epoch": 0.5459632294164668, "loss": 0.4033566415309906, "loss_ce": 0.0030270516872406006, "loss_xval": 0.400390625, "num_input_tokens_seen": 236094904, "step": 1366 }, { "epoch": 0.5463629096722622, "grad_norm": 42.70774593040797, "learning_rate": 5e-06, "loss": 0.663, "num_input_tokens_seen": 236268016, "step": 1367 }, { "epoch": 0.5463629096722622, "loss": 0.1966470330953598, "loss_ce": 0.004935611039400101, "loss_xval": 0.19140625, "num_input_tokens_seen": 236268016, "step": 1367 }, { "epoch": 0.5467625899280576, "grad_norm": 112.81975987822089, "learning_rate": 5e-06, "loss": 0.5069, "num_input_tokens_seen": 236440808, "step": 1368 }, { "epoch": 0.5467625899280576, "loss": 0.7222706079483032, "loss_ce": 0.005046539939939976, "loss_xval": 0.71875, "num_input_tokens_seen": 236440808, "step": 1368 }, { "epoch": 0.5471622701838529, "grad_norm": 50.88949130940376, "learning_rate": 5e-06, "loss": 0.4146, "num_input_tokens_seen": 236613648, "step": 1369 }, { "epoch": 0.5471622701838529, "loss": 0.4492402970790863, "loss_ce": 0.004294017795473337, "loss_xval": 0.4453125, "num_input_tokens_seen": 236613648, "step": 1369 }, { "epoch": 0.5475619504396483, "grad_norm": 104.13474925279456, "learning_rate": 5e-06, "loss": 0.9189, "num_input_tokens_seen": 236786680, "step": 1370 }, { "epoch": 0.5475619504396483, "loss": 0.8582189679145813, "loss_ce": 0.0034826004412025213, "loss_xval": 0.85546875, "num_input_tokens_seen": 236786680, "step": 1370 }, { "epoch": 0.5479616306954437, "grad_norm": 32.81711209453196, "learning_rate": 5e-06, "loss": 0.3459, "num_input_tokens_seen": 236959600, "step": 1371 }, { "epoch": 0.5479616306954437, "loss": 0.272049218416214, "loss_ce": 0.008255256339907646, "loss_xval": 0.263671875, "num_input_tokens_seen": 236959600, "step": 1371 }, { "epoch": 0.548361310951239, "grad_norm": 105.1381007340414, "learning_rate": 5e-06, "loss": 0.592, "num_input_tokens_seen": 237132680, "step": 1372 }, { "epoch": 0.548361310951239, "loss": 0.6921520233154297, "loss_ce": 0.005262310616672039, "loss_xval": 0.6875, "num_input_tokens_seen": 237132680, "step": 1372 }, { "epoch": 0.5487609912070344, "grad_norm": 66.93404166002924, "learning_rate": 5e-06, "loss": 0.515, "num_input_tokens_seen": 237305536, "step": 1373 }, { "epoch": 0.5487609912070344, "loss": 0.4059692621231079, "loss_ce": 0.0069824280217289925, "loss_xval": 0.3984375, "num_input_tokens_seen": 237305536, "step": 1373 }, { "epoch": 0.5491606714628298, "grad_norm": 73.26212946796912, "learning_rate": 5e-06, "loss": 0.6015, "num_input_tokens_seen": 237478648, "step": 1374 }, { "epoch": 0.5491606714628298, "loss": 0.5105670690536499, "loss_ce": 0.004585604183375835, "loss_xval": 0.5078125, "num_input_tokens_seen": 237478648, "step": 1374 }, { "epoch": 0.549560351718625, "grad_norm": 107.19992432502981, "learning_rate": 5e-06, "loss": 0.8017, "num_input_tokens_seen": 237651648, "step": 1375 }, { "epoch": 0.549560351718625, "loss": 0.6293633580207825, "loss_ce": 0.004180216696113348, "loss_xval": 0.625, "num_input_tokens_seen": 237651648, "step": 1375 }, { "epoch": 0.5499600319744204, "grad_norm": 102.85773361268112, "learning_rate": 5e-06, "loss": 0.6284, "num_input_tokens_seen": 237824552, "step": 1376 }, { "epoch": 0.5499600319744204, "loss": 0.6715470552444458, "loss_ce": 0.007576065603643656, "loss_xval": 0.6640625, "num_input_tokens_seen": 237824552, "step": 1376 }, { "epoch": 0.5503597122302158, "grad_norm": 51.34013198873488, "learning_rate": 5e-06, "loss": 0.5408, "num_input_tokens_seen": 237997088, "step": 1377 }, { "epoch": 0.5503597122302158, "loss": 0.5498093366622925, "loss_ce": 0.0029343212954699993, "loss_xval": 0.546875, "num_input_tokens_seen": 237997088, "step": 1377 }, { "epoch": 0.5507593924860112, "grad_norm": 45.16480729836272, "learning_rate": 5e-06, "loss": 0.5291, "num_input_tokens_seen": 238170104, "step": 1378 }, { "epoch": 0.5507593924860112, "loss": 0.6173465847969055, "loss_ce": 0.006995024159550667, "loss_xval": 0.609375, "num_input_tokens_seen": 238170104, "step": 1378 }, { "epoch": 0.5511590727418065, "grad_norm": 90.67189419837551, "learning_rate": 5e-06, "loss": 0.341, "num_input_tokens_seen": 238342840, "step": 1379 }, { "epoch": 0.5511590727418065, "loss": 0.33946144580841064, "loss_ce": 0.0049888077192008495, "loss_xval": 0.333984375, "num_input_tokens_seen": 238342840, "step": 1379 }, { "epoch": 0.5515587529976019, "grad_norm": 52.043973144528145, "learning_rate": 5e-06, "loss": 0.4572, "num_input_tokens_seen": 238515808, "step": 1380 }, { "epoch": 0.5515587529976019, "loss": 0.5051401257514954, "loss_ce": 0.011243650689721107, "loss_xval": 0.494140625, "num_input_tokens_seen": 238515808, "step": 1380 }, { "epoch": 0.5519584332533973, "grad_norm": 56.17714897064993, "learning_rate": 5e-06, "loss": 0.3739, "num_input_tokens_seen": 238688664, "step": 1381 }, { "epoch": 0.5519584332533973, "loss": 0.5631752610206604, "loss_ce": 0.00677874032407999, "loss_xval": 0.5546875, "num_input_tokens_seen": 238688664, "step": 1381 }, { "epoch": 0.5523581135091926, "grad_norm": 40.5381866842805, "learning_rate": 5e-06, "loss": 0.3333, "num_input_tokens_seen": 238861488, "step": 1382 }, { "epoch": 0.5523581135091926, "loss": 0.37881040573120117, "loss_ce": 0.00214719888754189, "loss_xval": 0.376953125, "num_input_tokens_seen": 238861488, "step": 1382 }, { "epoch": 0.552757793764988, "grad_norm": 51.78581653084787, "learning_rate": 5e-06, "loss": 0.5934, "num_input_tokens_seen": 239034336, "step": 1383 }, { "epoch": 0.552757793764988, "loss": 0.6673434972763062, "loss_ce": 0.0035251579247415066, "loss_xval": 0.6640625, "num_input_tokens_seen": 239034336, "step": 1383 }, { "epoch": 0.5531574740207834, "grad_norm": 65.06524952164565, "learning_rate": 5e-06, "loss": 0.3498, "num_input_tokens_seen": 239207232, "step": 1384 }, { "epoch": 0.5531574740207834, "loss": 0.5492129325866699, "loss_ce": 0.008563470095396042, "loss_xval": 0.5390625, "num_input_tokens_seen": 239207232, "step": 1384 }, { "epoch": 0.5535571542765787, "grad_norm": 24.785324730924025, "learning_rate": 5e-06, "loss": 0.4747, "num_input_tokens_seen": 239380368, "step": 1385 }, { "epoch": 0.5535571542765787, "loss": 0.6038126349449158, "loss_ce": 0.006461561657488346, "loss_xval": 0.59765625, "num_input_tokens_seen": 239380368, "step": 1385 }, { "epoch": 0.5539568345323741, "grad_norm": 22.3044022119237, "learning_rate": 5e-06, "loss": 0.309, "num_input_tokens_seen": 239553512, "step": 1386 }, { "epoch": 0.5539568345323741, "loss": 0.23669841885566711, "loss_ce": 0.0036051569040864706, "loss_xval": 0.2333984375, "num_input_tokens_seen": 239553512, "step": 1386 }, { "epoch": 0.5543565147881695, "grad_norm": 32.16457873391064, "learning_rate": 5e-06, "loss": 0.3397, "num_input_tokens_seen": 239726688, "step": 1387 }, { "epoch": 0.5543565147881695, "loss": 0.39722371101379395, "loss_ce": 0.006537655834108591, "loss_xval": 0.390625, "num_input_tokens_seen": 239726688, "step": 1387 }, { "epoch": 0.5547561950439648, "grad_norm": 40.175745398782205, "learning_rate": 5e-06, "loss": 0.3402, "num_input_tokens_seen": 239899592, "step": 1388 }, { "epoch": 0.5547561950439648, "loss": 0.26957184076309204, "loss_ce": 0.00232940586283803, "loss_xval": 0.267578125, "num_input_tokens_seen": 239899592, "step": 1388 }, { "epoch": 0.5551558752997602, "grad_norm": 18.426941380820455, "learning_rate": 5e-06, "loss": 0.4403, "num_input_tokens_seen": 240072512, "step": 1389 }, { "epoch": 0.5551558752997602, "loss": 0.46690550446510315, "loss_ce": 0.0025653140619397163, "loss_xval": 0.46484375, "num_input_tokens_seen": 240072512, "step": 1389 }, { "epoch": 0.5555555555555556, "grad_norm": 45.49727718405183, "learning_rate": 5e-06, "loss": 0.5213, "num_input_tokens_seen": 240245672, "step": 1390 }, { "epoch": 0.5555555555555556, "loss": 0.593445897102356, "loss_ce": 0.005082281306385994, "loss_xval": 0.58984375, "num_input_tokens_seen": 240245672, "step": 1390 }, { "epoch": 0.5559552358113509, "grad_norm": 53.104840422529556, "learning_rate": 5e-06, "loss": 0.5958, "num_input_tokens_seen": 240418408, "step": 1391 }, { "epoch": 0.5559552358113509, "loss": 0.519351601600647, "loss_ce": 0.005221944767981768, "loss_xval": 0.515625, "num_input_tokens_seen": 240418408, "step": 1391 }, { "epoch": 0.5563549160671463, "grad_norm": 89.13366947751527, "learning_rate": 5e-06, "loss": 0.7433, "num_input_tokens_seen": 240591376, "step": 1392 }, { "epoch": 0.5563549160671463, "loss": 0.9261295795440674, "loss_ce": 0.010220762342214584, "loss_xval": 0.9140625, "num_input_tokens_seen": 240591376, "step": 1392 }, { "epoch": 0.5567545963229417, "grad_norm": 35.338435295261455, "learning_rate": 5e-06, "loss": 0.4817, "num_input_tokens_seen": 240764424, "step": 1393 }, { "epoch": 0.5567545963229417, "loss": 0.23436526954174042, "loss_ce": 0.005239281803369522, "loss_xval": 0.2294921875, "num_input_tokens_seen": 240764424, "step": 1393 }, { "epoch": 0.5571542765787371, "grad_norm": 64.32086168686824, "learning_rate": 5e-06, "loss": 0.5221, "num_input_tokens_seen": 240937200, "step": 1394 }, { "epoch": 0.5571542765787371, "loss": 0.38562482595443726, "loss_ce": 0.003971992991864681, "loss_xval": 0.380859375, "num_input_tokens_seen": 240937200, "step": 1394 }, { "epoch": 0.5575539568345323, "grad_norm": 21.415086573261146, "learning_rate": 5e-06, "loss": 0.2177, "num_input_tokens_seen": 241110088, "step": 1395 }, { "epoch": 0.5575539568345323, "loss": 0.2166837602853775, "loss_ce": 0.0027860510163009167, "loss_xval": 0.2138671875, "num_input_tokens_seen": 241110088, "step": 1395 }, { "epoch": 0.5579536370903277, "grad_norm": 62.42230199682289, "learning_rate": 5e-06, "loss": 0.4826, "num_input_tokens_seen": 241282992, "step": 1396 }, { "epoch": 0.5579536370903277, "loss": 0.5386009216308594, "loss_ce": 0.002742763375863433, "loss_xval": 0.53515625, "num_input_tokens_seen": 241282992, "step": 1396 }, { "epoch": 0.5583533173461231, "grad_norm": 29.710934144805904, "learning_rate": 5e-06, "loss": 0.4846, "num_input_tokens_seen": 241455856, "step": 1397 }, { "epoch": 0.5583533173461231, "loss": 0.6360405683517456, "loss_ce": 0.004456415772438049, "loss_xval": 0.6328125, "num_input_tokens_seen": 241455856, "step": 1397 }, { "epoch": 0.5587529976019184, "grad_norm": 30.202754655810416, "learning_rate": 5e-06, "loss": 0.4387, "num_input_tokens_seen": 241628576, "step": 1398 }, { "epoch": 0.5587529976019184, "loss": 0.41940563917160034, "loss_ce": 0.005099033936858177, "loss_xval": 0.4140625, "num_input_tokens_seen": 241628576, "step": 1398 }, { "epoch": 0.5591526778577138, "grad_norm": 34.58672133188951, "learning_rate": 5e-06, "loss": 0.5131, "num_input_tokens_seen": 241801464, "step": 1399 }, { "epoch": 0.5591526778577138, "loss": 0.8912590742111206, "loss_ce": 0.004296140745282173, "loss_xval": 0.88671875, "num_input_tokens_seen": 241801464, "step": 1399 }, { "epoch": 0.5595523581135092, "grad_norm": 61.9029431455141, "learning_rate": 5e-06, "loss": 0.3072, "num_input_tokens_seen": 241974584, "step": 1400 }, { "epoch": 0.5595523581135092, "loss": 0.3766733407974243, "loss_ce": 0.004877682775259018, "loss_xval": 0.37109375, "num_input_tokens_seen": 241974584, "step": 1400 }, { "epoch": 0.5599520383693045, "grad_norm": 58.06942738123757, "learning_rate": 5e-06, "loss": 0.4248, "num_input_tokens_seen": 242147592, "step": 1401 }, { "epoch": 0.5599520383693045, "loss": 0.3648153245449066, "loss_ce": 0.0030446944292634726, "loss_xval": 0.361328125, "num_input_tokens_seen": 242147592, "step": 1401 }, { "epoch": 0.5603517186250999, "grad_norm": 61.23774543866974, "learning_rate": 5e-06, "loss": 0.4822, "num_input_tokens_seen": 242317160, "step": 1402 }, { "epoch": 0.5603517186250999, "loss": 0.24395032227039337, "loss_ce": 0.003014039946720004, "loss_xval": 0.2412109375, "num_input_tokens_seen": 242317160, "step": 1402 }, { "epoch": 0.5607513988808953, "grad_norm": 35.92228501713754, "learning_rate": 5e-06, "loss": 0.4341, "num_input_tokens_seen": 242486528, "step": 1403 }, { "epoch": 0.5607513988808953, "loss": 0.39893341064453125, "loss_ce": 0.0020217944402247667, "loss_xval": 0.396484375, "num_input_tokens_seen": 242486528, "step": 1403 }, { "epoch": 0.5611510791366906, "grad_norm": 91.09500736565018, "learning_rate": 5e-06, "loss": 0.5419, "num_input_tokens_seen": 242659688, "step": 1404 }, { "epoch": 0.5611510791366906, "loss": 0.6004331111907959, "loss_ce": 0.0020444332621991634, "loss_xval": 0.59765625, "num_input_tokens_seen": 242659688, "step": 1404 }, { "epoch": 0.561550759392486, "grad_norm": 55.4921140642313, "learning_rate": 5e-06, "loss": 0.5621, "num_input_tokens_seen": 242832520, "step": 1405 }, { "epoch": 0.561550759392486, "loss": 0.6540317535400391, "loss_ce": 0.0017490473110228777, "loss_xval": 0.65234375, "num_input_tokens_seen": 242832520, "step": 1405 }, { "epoch": 0.5619504396482814, "grad_norm": 51.074231439851395, "learning_rate": 5e-06, "loss": 0.3966, "num_input_tokens_seen": 243005600, "step": 1406 }, { "epoch": 0.5619504396482814, "loss": 0.48184454441070557, "loss_ce": 0.006624825298786163, "loss_xval": 0.474609375, "num_input_tokens_seen": 243005600, "step": 1406 }, { "epoch": 0.5623501199040767, "grad_norm": 90.03472614066028, "learning_rate": 5e-06, "loss": 0.4644, "num_input_tokens_seen": 243178984, "step": 1407 }, { "epoch": 0.5623501199040767, "loss": 0.6506166458129883, "loss_ce": 0.006390602793544531, "loss_xval": 0.64453125, "num_input_tokens_seen": 243178984, "step": 1407 }, { "epoch": 0.5627498001598721, "grad_norm": 57.773260923011286, "learning_rate": 5e-06, "loss": 0.8877, "num_input_tokens_seen": 243352152, "step": 1408 }, { "epoch": 0.5627498001598721, "loss": 0.8280885219573975, "loss_ce": 0.00386980758048594, "loss_xval": 0.82421875, "num_input_tokens_seen": 243352152, "step": 1408 }, { "epoch": 0.5631494804156675, "grad_norm": 116.82277290671593, "learning_rate": 5e-06, "loss": 0.4323, "num_input_tokens_seen": 243525104, "step": 1409 }, { "epoch": 0.5631494804156675, "loss": 0.4614856243133545, "loss_ce": 0.006285438779741526, "loss_xval": 0.455078125, "num_input_tokens_seen": 243525104, "step": 1409 }, { "epoch": 0.5635491606714629, "grad_norm": 61.1230282236721, "learning_rate": 5e-06, "loss": 0.35, "num_input_tokens_seen": 243697976, "step": 1410 }, { "epoch": 0.5635491606714629, "loss": 0.38546931743621826, "loss_ce": 0.0026262898463755846, "loss_xval": 0.3828125, "num_input_tokens_seen": 243697976, "step": 1410 }, { "epoch": 0.5639488409272582, "grad_norm": 111.85099191601515, "learning_rate": 5e-06, "loss": 0.3607, "num_input_tokens_seen": 243870560, "step": 1411 }, { "epoch": 0.5639488409272582, "loss": 0.37697547674179077, "loss_ce": 0.00398966483771801, "loss_xval": 0.373046875, "num_input_tokens_seen": 243870560, "step": 1411 }, { "epoch": 0.5643485211830536, "grad_norm": 88.33100579942653, "learning_rate": 5e-06, "loss": 0.3984, "num_input_tokens_seen": 244043272, "step": 1412 }, { "epoch": 0.5643485211830536, "loss": 0.432317852973938, "loss_ce": 0.004156234674155712, "loss_xval": 0.427734375, "num_input_tokens_seen": 244043272, "step": 1412 }, { "epoch": 0.564748201438849, "grad_norm": 81.4102017754153, "learning_rate": 5e-06, "loss": 0.6458, "num_input_tokens_seen": 244216144, "step": 1413 }, { "epoch": 0.564748201438849, "loss": 0.6670348048210144, "loss_ce": 0.0025145430117845535, "loss_xval": 0.6640625, "num_input_tokens_seen": 244216144, "step": 1413 }, { "epoch": 0.5651478816946442, "grad_norm": 46.391307098818544, "learning_rate": 5e-06, "loss": 0.4551, "num_input_tokens_seen": 244389072, "step": 1414 }, { "epoch": 0.5651478816946442, "loss": 0.3752412796020508, "loss_ce": 0.005246136337518692, "loss_xval": 0.369140625, "num_input_tokens_seen": 244389072, "step": 1414 }, { "epoch": 0.5655475619504396, "grad_norm": 129.86301928613088, "learning_rate": 5e-06, "loss": 0.4808, "num_input_tokens_seen": 244561792, "step": 1415 }, { "epoch": 0.5655475619504396, "loss": 0.30836910009384155, "loss_ce": 0.007160608656704426, "loss_xval": 0.30078125, "num_input_tokens_seen": 244561792, "step": 1415 }, { "epoch": 0.565947242206235, "grad_norm": 40.89946165935015, "learning_rate": 5e-06, "loss": 0.4916, "num_input_tokens_seen": 244735040, "step": 1416 }, { "epoch": 0.565947242206235, "loss": 0.36573469638824463, "loss_ce": 0.002453463850542903, "loss_xval": 0.36328125, "num_input_tokens_seen": 244735040, "step": 1416 }, { "epoch": 0.5663469224620303, "grad_norm": 57.37518618513914, "learning_rate": 5e-06, "loss": 0.7517, "num_input_tokens_seen": 244907832, "step": 1417 }, { "epoch": 0.5663469224620303, "loss": 1.0576549768447876, "loss_ce": 0.007972333580255508, "loss_xval": 1.046875, "num_input_tokens_seen": 244907832, "step": 1417 }, { "epoch": 0.5667466027178257, "grad_norm": 50.498261369439454, "learning_rate": 5e-06, "loss": 0.7163, "num_input_tokens_seen": 245080504, "step": 1418 }, { "epoch": 0.5667466027178257, "loss": 0.40888845920562744, "loss_ce": 0.011854803189635277, "loss_xval": 0.396484375, "num_input_tokens_seen": 245080504, "step": 1418 }, { "epoch": 0.5671462829736211, "grad_norm": 87.47288430083069, "learning_rate": 5e-06, "loss": 0.4905, "num_input_tokens_seen": 245253712, "step": 1419 }, { "epoch": 0.5671462829736211, "loss": 0.5902141332626343, "loss_ce": 0.01254691369831562, "loss_xval": 0.578125, "num_input_tokens_seen": 245253712, "step": 1419 }, { "epoch": 0.5675459632294164, "grad_norm": 49.32901995535315, "learning_rate": 5e-06, "loss": 0.4311, "num_input_tokens_seen": 245426592, "step": 1420 }, { "epoch": 0.5675459632294164, "loss": 0.39141321182250977, "loss_ce": 0.006342416163533926, "loss_xval": 0.384765625, "num_input_tokens_seen": 245426592, "step": 1420 }, { "epoch": 0.5679456434852118, "grad_norm": 53.82129861028939, "learning_rate": 5e-06, "loss": 0.5077, "num_input_tokens_seen": 245599872, "step": 1421 }, { "epoch": 0.5679456434852118, "loss": 0.3029336631298065, "loss_ce": 0.002427075756713748, "loss_xval": 0.30078125, "num_input_tokens_seen": 245599872, "step": 1421 }, { "epoch": 0.5683453237410072, "grad_norm": 20.8997393508194, "learning_rate": 5e-06, "loss": 0.4295, "num_input_tokens_seen": 245772848, "step": 1422 }, { "epoch": 0.5683453237410072, "loss": 0.4302918314933777, "loss_ce": 0.015069630928337574, "loss_xval": 0.416015625, "num_input_tokens_seen": 245772848, "step": 1422 }, { "epoch": 0.5687450039968026, "grad_norm": 34.823028475917404, "learning_rate": 5e-06, "loss": 0.5763, "num_input_tokens_seen": 245945280, "step": 1423 }, { "epoch": 0.5687450039968026, "loss": 0.46396341919898987, "loss_ce": 0.003575220936909318, "loss_xval": 0.4609375, "num_input_tokens_seen": 245945280, "step": 1423 }, { "epoch": 0.5691446842525979, "grad_norm": 78.37392797245467, "learning_rate": 5e-06, "loss": 0.5372, "num_input_tokens_seen": 246118456, "step": 1424 }, { "epoch": 0.5691446842525979, "loss": 0.7276915311813354, "loss_ce": 0.004821660462766886, "loss_xval": 0.72265625, "num_input_tokens_seen": 246118456, "step": 1424 }, { "epoch": 0.5695443645083933, "grad_norm": 35.365197351786534, "learning_rate": 5e-06, "loss": 0.659, "num_input_tokens_seen": 246291224, "step": 1425 }, { "epoch": 0.5695443645083933, "loss": 0.3385479152202606, "loss_ce": 0.005723202601075172, "loss_xval": 0.33203125, "num_input_tokens_seen": 246291224, "step": 1425 }, { "epoch": 0.5699440447641887, "grad_norm": 38.38998430273591, "learning_rate": 5e-06, "loss": 0.6328, "num_input_tokens_seen": 246464600, "step": 1426 }, { "epoch": 0.5699440447641887, "loss": 0.5840468406677246, "loss_ce": 0.007112047169357538, "loss_xval": 0.578125, "num_input_tokens_seen": 246464600, "step": 1426 }, { "epoch": 0.570343725019984, "grad_norm": 26.75343703719278, "learning_rate": 5e-06, "loss": 0.7358, "num_input_tokens_seen": 246637424, "step": 1427 }, { "epoch": 0.570343725019984, "loss": 0.7768107652664185, "loss_ce": 0.005021188408136368, "loss_xval": 0.7734375, "num_input_tokens_seen": 246637424, "step": 1427 }, { "epoch": 0.5707434052757794, "grad_norm": 48.88082237036071, "learning_rate": 5e-06, "loss": 0.4387, "num_input_tokens_seen": 246810640, "step": 1428 }, { "epoch": 0.5707434052757794, "loss": 0.5554983019828796, "loss_ce": 0.0043356032110750675, "loss_xval": 0.55078125, "num_input_tokens_seen": 246810640, "step": 1428 }, { "epoch": 0.5711430855315748, "grad_norm": 34.52783585537693, "learning_rate": 5e-06, "loss": 0.4248, "num_input_tokens_seen": 246983392, "step": 1429 }, { "epoch": 0.5711430855315748, "loss": 0.5638842582702637, "loss_ce": 0.00471069710329175, "loss_xval": 0.55859375, "num_input_tokens_seen": 246983392, "step": 1429 }, { "epoch": 0.5715427657873701, "grad_norm": 25.030027317245604, "learning_rate": 5e-06, "loss": 0.3461, "num_input_tokens_seen": 247156080, "step": 1430 }, { "epoch": 0.5715427657873701, "loss": 0.4537726938724518, "loss_ce": 0.009070548228919506, "loss_xval": 0.4453125, "num_input_tokens_seen": 247156080, "step": 1430 }, { "epoch": 0.5719424460431655, "grad_norm": 37.87817514853326, "learning_rate": 5e-06, "loss": 0.5784, "num_input_tokens_seen": 247328944, "step": 1431 }, { "epoch": 0.5719424460431655, "loss": 0.7571723461151123, "loss_ce": 0.0038001316133886576, "loss_xval": 0.75390625, "num_input_tokens_seen": 247328944, "step": 1431 }, { "epoch": 0.5723421262989609, "grad_norm": 54.5491901614114, "learning_rate": 5e-06, "loss": 0.4931, "num_input_tokens_seen": 247502144, "step": 1432 }, { "epoch": 0.5723421262989609, "loss": 0.5219075083732605, "loss_ce": 0.003871629014611244, "loss_xval": 0.51953125, "num_input_tokens_seen": 247502144, "step": 1432 }, { "epoch": 0.5727418065547561, "grad_norm": 56.706910138506075, "learning_rate": 5e-06, "loss": 0.5465, "num_input_tokens_seen": 247674936, "step": 1433 }, { "epoch": 0.5727418065547561, "loss": 0.6499233245849609, "loss_ce": 0.002157290233299136, "loss_xval": 0.6484375, "num_input_tokens_seen": 247674936, "step": 1433 }, { "epoch": 0.5731414868105515, "grad_norm": 16.90318365693236, "learning_rate": 5e-06, "loss": 0.5191, "num_input_tokens_seen": 247848032, "step": 1434 }, { "epoch": 0.5731414868105515, "loss": 0.534213662147522, "loss_ce": 0.014712927863001823, "loss_xval": 0.51953125, "num_input_tokens_seen": 247848032, "step": 1434 }, { "epoch": 0.573541167066347, "grad_norm": 30.931323218606245, "learning_rate": 5e-06, "loss": 0.557, "num_input_tokens_seen": 248020944, "step": 1435 }, { "epoch": 0.573541167066347, "loss": 0.4585926830768585, "loss_ce": 0.005650795064866543, "loss_xval": 0.453125, "num_input_tokens_seen": 248020944, "step": 1435 }, { "epoch": 0.5739408473221422, "grad_norm": 52.81318892972862, "learning_rate": 5e-06, "loss": 0.4016, "num_input_tokens_seen": 248193744, "step": 1436 }, { "epoch": 0.5739408473221422, "loss": 0.22793583571910858, "loss_ce": 0.006530814804136753, "loss_xval": 0.2216796875, "num_input_tokens_seen": 248193744, "step": 1436 }, { "epoch": 0.5743405275779376, "grad_norm": 41.45713075341328, "learning_rate": 5e-06, "loss": 0.3829, "num_input_tokens_seen": 248366568, "step": 1437 }, { "epoch": 0.5743405275779376, "loss": 0.15585559606552124, "loss_ce": 0.002306394511833787, "loss_xval": 0.1533203125, "num_input_tokens_seen": 248366568, "step": 1437 }, { "epoch": 0.574740207833733, "grad_norm": 42.899656566869155, "learning_rate": 5e-06, "loss": 0.3808, "num_input_tokens_seen": 248539664, "step": 1438 }, { "epoch": 0.574740207833733, "loss": 0.40062421560287476, "loss_ce": 0.0016373979160562158, "loss_xval": 0.3984375, "num_input_tokens_seen": 248539664, "step": 1438 }, { "epoch": 0.5751398880895284, "grad_norm": 28.503400213541788, "learning_rate": 5e-06, "loss": 0.5283, "num_input_tokens_seen": 248712512, "step": 1439 }, { "epoch": 0.5751398880895284, "loss": 0.5747106075286865, "loss_ce": 0.001498923171311617, "loss_xval": 0.57421875, "num_input_tokens_seen": 248712512, "step": 1439 }, { "epoch": 0.5755395683453237, "grad_norm": 28.16745339282557, "learning_rate": 5e-06, "loss": 0.5161, "num_input_tokens_seen": 248882088, "step": 1440 }, { "epoch": 0.5755395683453237, "loss": 0.2827589511871338, "loss_ce": 0.001676824176684022, "loss_xval": 0.28125, "num_input_tokens_seen": 248882088, "step": 1440 }, { "epoch": 0.5759392486011191, "grad_norm": 77.82657940371696, "learning_rate": 5e-06, "loss": 0.5886, "num_input_tokens_seen": 249054720, "step": 1441 }, { "epoch": 0.5759392486011191, "loss": 0.40001291036605835, "loss_ce": 0.0038947416469454765, "loss_xval": 0.396484375, "num_input_tokens_seen": 249054720, "step": 1441 }, { "epoch": 0.5763389288569145, "grad_norm": 20.49828614983781, "learning_rate": 5e-06, "loss": 0.4377, "num_input_tokens_seen": 249227984, "step": 1442 }, { "epoch": 0.5763389288569145, "loss": 0.39074262976646423, "loss_ce": 0.0035661240108311176, "loss_xval": 0.38671875, "num_input_tokens_seen": 249227984, "step": 1442 }, { "epoch": 0.5767386091127098, "grad_norm": 25.0536969800906, "learning_rate": 5e-06, "loss": 0.435, "num_input_tokens_seen": 249400992, "step": 1443 }, { "epoch": 0.5767386091127098, "loss": 0.5896605849266052, "loss_ce": 0.006286558695137501, "loss_xval": 0.58203125, "num_input_tokens_seen": 249400992, "step": 1443 }, { "epoch": 0.5771382893685052, "grad_norm": 40.73073999908245, "learning_rate": 5e-06, "loss": 0.2902, "num_input_tokens_seen": 249574272, "step": 1444 }, { "epoch": 0.5771382893685052, "loss": 0.1345619261264801, "loss_ce": 0.003610992804169655, "loss_xval": 0.130859375, "num_input_tokens_seen": 249574272, "step": 1444 }, { "epoch": 0.5775379696243006, "grad_norm": 24.77426937534188, "learning_rate": 5e-06, "loss": 0.5771, "num_input_tokens_seen": 249747576, "step": 1445 }, { "epoch": 0.5775379696243006, "loss": 0.5811995267868042, "loss_ce": 0.005882172379642725, "loss_xval": 0.57421875, "num_input_tokens_seen": 249747576, "step": 1445 }, { "epoch": 0.5779376498800959, "grad_norm": 40.53175188685322, "learning_rate": 5e-06, "loss": 0.5273, "num_input_tokens_seen": 249920024, "step": 1446 }, { "epoch": 0.5779376498800959, "loss": 0.6215522885322571, "loss_ce": 0.002655788091942668, "loss_xval": 0.6171875, "num_input_tokens_seen": 249920024, "step": 1446 }, { "epoch": 0.5783373301358913, "grad_norm": 17.535297794439494, "learning_rate": 5e-06, "loss": 0.2833, "num_input_tokens_seen": 250092904, "step": 1447 }, { "epoch": 0.5783373301358913, "loss": 0.3107568621635437, "loss_ce": 0.0020715624559670687, "loss_xval": 0.30859375, "num_input_tokens_seen": 250092904, "step": 1447 }, { "epoch": 0.5787370103916867, "grad_norm": 59.851621485656516, "learning_rate": 5e-06, "loss": 0.3855, "num_input_tokens_seen": 250266032, "step": 1448 }, { "epoch": 0.5787370103916867, "loss": 0.254830002784729, "loss_ce": 0.008156410418450832, "loss_xval": 0.2470703125, "num_input_tokens_seen": 250266032, "step": 1448 }, { "epoch": 0.579136690647482, "grad_norm": 56.40854683309498, "learning_rate": 5e-06, "loss": 0.6026, "num_input_tokens_seen": 250438480, "step": 1449 }, { "epoch": 0.579136690647482, "loss": 0.7701046466827393, "loss_ce": 0.004235545638948679, "loss_xval": 0.765625, "num_input_tokens_seen": 250438480, "step": 1449 }, { "epoch": 0.5795363709032774, "grad_norm": 17.60230541103847, "learning_rate": 5e-06, "loss": 0.4824, "num_input_tokens_seen": 250611272, "step": 1450 }, { "epoch": 0.5795363709032774, "loss": 0.6768176555633545, "loss_ce": 0.0039049754850566387, "loss_xval": 0.671875, "num_input_tokens_seen": 250611272, "step": 1450 }, { "epoch": 0.5799360511590728, "grad_norm": 46.39964305071812, "learning_rate": 5e-06, "loss": 0.5641, "num_input_tokens_seen": 250784048, "step": 1451 }, { "epoch": 0.5799360511590728, "loss": 0.11980067938566208, "loss_ce": 0.012394066900014877, "loss_xval": 0.107421875, "num_input_tokens_seen": 250784048, "step": 1451 }, { "epoch": 0.580335731414868, "grad_norm": 31.255334541299135, "learning_rate": 5e-06, "loss": 0.2784, "num_input_tokens_seen": 250957296, "step": 1452 }, { "epoch": 0.580335731414868, "loss": 0.31082266569137573, "loss_ce": 0.0031139145139604807, "loss_xval": 0.30859375, "num_input_tokens_seen": 250957296, "step": 1452 }, { "epoch": 0.5807354116706634, "grad_norm": 77.28904921192597, "learning_rate": 5e-06, "loss": 0.3999, "num_input_tokens_seen": 251130488, "step": 1453 }, { "epoch": 0.5807354116706634, "loss": 0.2975703477859497, "loss_ce": 0.0013362220488488674, "loss_xval": 0.296875, "num_input_tokens_seen": 251130488, "step": 1453 }, { "epoch": 0.5811350919264588, "grad_norm": 19.237682772485503, "learning_rate": 5e-06, "loss": 0.5526, "num_input_tokens_seen": 251303224, "step": 1454 }, { "epoch": 0.5811350919264588, "loss": 0.4594392478466034, "loss_ce": 0.00637528020888567, "loss_xval": 0.453125, "num_input_tokens_seen": 251303224, "step": 1454 }, { "epoch": 0.5815347721822542, "grad_norm": 86.7880986943712, "learning_rate": 5e-06, "loss": 0.5105, "num_input_tokens_seen": 251476032, "step": 1455 }, { "epoch": 0.5815347721822542, "loss": 0.6088451147079468, "loss_ce": 0.01717032864689827, "loss_xval": 0.58984375, "num_input_tokens_seen": 251476032, "step": 1455 }, { "epoch": 0.5819344524380495, "grad_norm": 47.39222562187567, "learning_rate": 5e-06, "loss": 0.5009, "num_input_tokens_seen": 251649088, "step": 1456 }, { "epoch": 0.5819344524380495, "loss": 0.3581160306930542, "loss_ce": 0.0064924792386591434, "loss_xval": 0.3515625, "num_input_tokens_seen": 251649088, "step": 1456 }, { "epoch": 0.5823341326938449, "grad_norm": 60.79443614481706, "learning_rate": 5e-06, "loss": 0.419, "num_input_tokens_seen": 251822128, "step": 1457 }, { "epoch": 0.5823341326938449, "loss": 0.4279427230358124, "loss_ce": 0.002863375935703516, "loss_xval": 0.42578125, "num_input_tokens_seen": 251822128, "step": 1457 }, { "epoch": 0.5827338129496403, "grad_norm": 68.2522669779495, "learning_rate": 5e-06, "loss": 0.3529, "num_input_tokens_seen": 251994680, "step": 1458 }, { "epoch": 0.5827338129496403, "loss": 0.11519064009189606, "loss_ce": 0.006273405160754919, "loss_xval": 0.10888671875, "num_input_tokens_seen": 251994680, "step": 1458 }, { "epoch": 0.5831334932054356, "grad_norm": 23.09936882583252, "learning_rate": 5e-06, "loss": 0.4876, "num_input_tokens_seen": 252167616, "step": 1459 }, { "epoch": 0.5831334932054356, "loss": 0.35358744859695435, "loss_ce": 0.002971008885651827, "loss_xval": 0.3515625, "num_input_tokens_seen": 252167616, "step": 1459 }, { "epoch": 0.583533173461231, "grad_norm": 101.93791119936537, "learning_rate": 5e-06, "loss": 0.4939, "num_input_tokens_seen": 252340704, "step": 1460 }, { "epoch": 0.583533173461231, "loss": 0.5895069241523743, "loss_ce": 0.0047748456709086895, "loss_xval": 0.5859375, "num_input_tokens_seen": 252340704, "step": 1460 }, { "epoch": 0.5839328537170264, "grad_norm": 50.2395324965491, "learning_rate": 5e-06, "loss": 0.4909, "num_input_tokens_seen": 252513632, "step": 1461 }, { "epoch": 0.5839328537170264, "loss": 0.3211362957954407, "loss_ce": 0.0036619282327592373, "loss_xval": 0.318359375, "num_input_tokens_seen": 252513632, "step": 1461 }, { "epoch": 0.5843325339728217, "grad_norm": 71.7569870488973, "learning_rate": 5e-06, "loss": 0.5136, "num_input_tokens_seen": 252686728, "step": 1462 }, { "epoch": 0.5843325339728217, "loss": 0.6344008445739746, "loss_ce": 0.003266814863309264, "loss_xval": 0.6328125, "num_input_tokens_seen": 252686728, "step": 1462 }, { "epoch": 0.5847322142286171, "grad_norm": 39.298202628131506, "learning_rate": 5e-06, "loss": 0.4078, "num_input_tokens_seen": 252859736, "step": 1463 }, { "epoch": 0.5847322142286171, "loss": 0.33668771386146545, "loss_ce": 0.00300851883366704, "loss_xval": 0.333984375, "num_input_tokens_seen": 252859736, "step": 1463 }, { "epoch": 0.5851318944844125, "grad_norm": 94.10269985619064, "learning_rate": 5e-06, "loss": 0.5931, "num_input_tokens_seen": 253032736, "step": 1464 }, { "epoch": 0.5851318944844125, "loss": 0.6349042654037476, "loss_ce": 0.004136495292186737, "loss_xval": 0.62890625, "num_input_tokens_seen": 253032736, "step": 1464 }, { "epoch": 0.5855315747402078, "grad_norm": 88.35693572031727, "learning_rate": 5e-06, "loss": 0.3551, "num_input_tokens_seen": 253205312, "step": 1465 }, { "epoch": 0.5855315747402078, "loss": 0.28992077708244324, "loss_ce": 0.0015906940679997206, "loss_xval": 0.2890625, "num_input_tokens_seen": 253205312, "step": 1465 }, { "epoch": 0.5859312549960032, "grad_norm": 89.76469496150095, "learning_rate": 5e-06, "loss": 0.2787, "num_input_tokens_seen": 253378672, "step": 1466 }, { "epoch": 0.5859312549960032, "loss": 0.3085545301437378, "loss_ce": 0.005087736062705517, "loss_xval": 0.302734375, "num_input_tokens_seen": 253378672, "step": 1466 }, { "epoch": 0.5863309352517986, "grad_norm": 41.82595179045434, "learning_rate": 5e-06, "loss": 0.393, "num_input_tokens_seen": 253551688, "step": 1467 }, { "epoch": 0.5863309352517986, "loss": 0.3362312614917755, "loss_ce": 0.00755694042891264, "loss_xval": 0.328125, "num_input_tokens_seen": 253551688, "step": 1467 }, { "epoch": 0.586730615507594, "grad_norm": 79.00064925393347, "learning_rate": 5e-06, "loss": 0.6256, "num_input_tokens_seen": 253725080, "step": 1468 }, { "epoch": 0.586730615507594, "loss": 0.5980717539787292, "loss_ce": 0.004138659685850143, "loss_xval": 0.59375, "num_input_tokens_seen": 253725080, "step": 1468 }, { "epoch": 0.5871302957633893, "grad_norm": 84.44247148635479, "learning_rate": 5e-06, "loss": 0.504, "num_input_tokens_seen": 253898112, "step": 1469 }, { "epoch": 0.5871302957633893, "loss": 0.7288265228271484, "loss_ce": 0.0033321240916848183, "loss_xval": 0.7265625, "num_input_tokens_seen": 253898112, "step": 1469 }, { "epoch": 0.5875299760191847, "grad_norm": 26.04118376928227, "learning_rate": 5e-06, "loss": 0.6305, "num_input_tokens_seen": 254071208, "step": 1470 }, { "epoch": 0.5875299760191847, "loss": 0.39779388904571533, "loss_ce": 0.011380329728126526, "loss_xval": 0.38671875, "num_input_tokens_seen": 254071208, "step": 1470 }, { "epoch": 0.5879296562749801, "grad_norm": 36.18494670134758, "learning_rate": 5e-06, "loss": 0.3893, "num_input_tokens_seen": 254244272, "step": 1471 }, { "epoch": 0.5879296562749801, "loss": 0.29755258560180664, "loss_ce": 0.0026612617075443268, "loss_xval": 0.294921875, "num_input_tokens_seen": 254244272, "step": 1471 }, { "epoch": 0.5883293365307753, "grad_norm": 57.43211670376443, "learning_rate": 5e-06, "loss": 0.5984, "num_input_tokens_seen": 254416960, "step": 1472 }, { "epoch": 0.5883293365307753, "loss": 0.510471522808075, "loss_ce": 0.006870460696518421, "loss_xval": 0.50390625, "num_input_tokens_seen": 254416960, "step": 1472 }, { "epoch": 0.5887290167865707, "grad_norm": 39.33826844537321, "learning_rate": 5e-06, "loss": 0.6617, "num_input_tokens_seen": 254589784, "step": 1473 }, { "epoch": 0.5887290167865707, "loss": 0.5638649463653564, "loss_ce": 0.004691338166594505, "loss_xval": 0.55859375, "num_input_tokens_seen": 254589784, "step": 1473 }, { "epoch": 0.5891286970423661, "grad_norm": 53.51753517382941, "learning_rate": 5e-06, "loss": 0.4919, "num_input_tokens_seen": 254762816, "step": 1474 }, { "epoch": 0.5891286970423661, "loss": 0.4380595088005066, "loss_ce": 0.0016886851517483592, "loss_xval": 0.435546875, "num_input_tokens_seen": 254762816, "step": 1474 }, { "epoch": 0.5895283772981614, "grad_norm": 47.60334876034079, "learning_rate": 5e-06, "loss": 0.7327, "num_input_tokens_seen": 254936072, "step": 1475 }, { "epoch": 0.5895283772981614, "loss": 0.9281734228134155, "loss_ce": 0.00651207473129034, "loss_xval": 0.921875, "num_input_tokens_seen": 254936072, "step": 1475 }, { "epoch": 0.5899280575539568, "grad_norm": 42.283179440632075, "learning_rate": 5e-06, "loss": 0.3088, "num_input_tokens_seen": 255108728, "step": 1476 }, { "epoch": 0.5899280575539568, "loss": 0.17988061904907227, "loss_ce": 0.0028633992187678814, "loss_xval": 0.1767578125, "num_input_tokens_seen": 255108728, "step": 1476 }, { "epoch": 0.5903277378097522, "grad_norm": 56.89112241926728, "learning_rate": 5e-06, "loss": 0.4516, "num_input_tokens_seen": 255281672, "step": 1477 }, { "epoch": 0.5903277378097522, "loss": 0.535220742225647, "loss_ce": 0.0025211526080965996, "loss_xval": 0.53125, "num_input_tokens_seen": 255281672, "step": 1477 }, { "epoch": 0.5907274180655475, "grad_norm": 26.05614758054613, "learning_rate": 5e-06, "loss": 0.675, "num_input_tokens_seen": 255454832, "step": 1478 }, { "epoch": 0.5907274180655475, "loss": 0.8795278072357178, "loss_ce": 0.0042531476356089115, "loss_xval": 0.875, "num_input_tokens_seen": 255454832, "step": 1478 }, { "epoch": 0.5911270983213429, "grad_norm": 82.8948565903926, "learning_rate": 5e-06, "loss": 0.4564, "num_input_tokens_seen": 255627888, "step": 1479 }, { "epoch": 0.5911270983213429, "loss": 0.3429642617702484, "loss_ce": 0.0021439511328935623, "loss_xval": 0.33984375, "num_input_tokens_seen": 255627888, "step": 1479 }, { "epoch": 0.5915267785771383, "grad_norm": 74.3533568254218, "learning_rate": 5e-06, "loss": 0.436, "num_input_tokens_seen": 255800960, "step": 1480 }, { "epoch": 0.5915267785771383, "loss": 0.27249640226364136, "loss_ce": 0.005421818234026432, "loss_xval": 0.267578125, "num_input_tokens_seen": 255800960, "step": 1480 }, { "epoch": 0.5919264588329336, "grad_norm": 100.93013388274387, "learning_rate": 5e-06, "loss": 0.911, "num_input_tokens_seen": 255974224, "step": 1481 }, { "epoch": 0.5919264588329336, "loss": 1.2054524421691895, "loss_ce": 0.005745388101786375, "loss_xval": 1.203125, "num_input_tokens_seen": 255974224, "step": 1481 }, { "epoch": 0.592326139088729, "grad_norm": 30.404189839395876, "learning_rate": 5e-06, "loss": 0.4284, "num_input_tokens_seen": 256146968, "step": 1482 }, { "epoch": 0.592326139088729, "loss": 0.45736631751060486, "loss_ce": 0.011336689814925194, "loss_xval": 0.4453125, "num_input_tokens_seen": 256146968, "step": 1482 }, { "epoch": 0.5927258193445244, "grad_norm": 86.86243262803453, "learning_rate": 5e-06, "loss": 0.5621, "num_input_tokens_seen": 256319912, "step": 1483 }, { "epoch": 0.5927258193445244, "loss": 0.49696576595306396, "loss_ce": 0.005571682937443256, "loss_xval": 0.4921875, "num_input_tokens_seen": 256319912, "step": 1483 }, { "epoch": 0.5931254996003198, "grad_norm": 36.87791867548973, "learning_rate": 5e-06, "loss": 0.6044, "num_input_tokens_seen": 256492800, "step": 1484 }, { "epoch": 0.5931254996003198, "loss": 0.8356503844261169, "loss_ce": 0.004473670851439238, "loss_xval": 0.83203125, "num_input_tokens_seen": 256492800, "step": 1484 }, { "epoch": 0.5935251798561151, "grad_norm": 56.240908798030695, "learning_rate": 5e-06, "loss": 0.4727, "num_input_tokens_seen": 256665936, "step": 1485 }, { "epoch": 0.5935251798561151, "loss": 0.4955936670303345, "loss_ce": 0.005145644303411245, "loss_xval": 0.490234375, "num_input_tokens_seen": 256665936, "step": 1485 }, { "epoch": 0.5939248601119105, "grad_norm": 47.051400925900325, "learning_rate": 5e-06, "loss": 0.3654, "num_input_tokens_seen": 256838816, "step": 1486 }, { "epoch": 0.5939248601119105, "loss": 0.33166587352752686, "loss_ce": 0.002838968764990568, "loss_xval": 0.328125, "num_input_tokens_seen": 256838816, "step": 1486 }, { "epoch": 0.5943245403677059, "grad_norm": 76.5452662635696, "learning_rate": 5e-06, "loss": 0.3362, "num_input_tokens_seen": 257011728, "step": 1487 }, { "epoch": 0.5943245403677059, "loss": 0.16850775480270386, "loss_ce": 0.002980403369292617, "loss_xval": 0.166015625, "num_input_tokens_seen": 257011728, "step": 1487 }, { "epoch": 0.5947242206235012, "grad_norm": 35.09571761153484, "learning_rate": 5e-06, "loss": 0.462, "num_input_tokens_seen": 257184728, "step": 1488 }, { "epoch": 0.5947242206235012, "loss": 0.5621877312660217, "loss_ce": 0.0029989101458340883, "loss_xval": 0.55859375, "num_input_tokens_seen": 257184728, "step": 1488 }, { "epoch": 0.5951239008792966, "grad_norm": 63.285571785786225, "learning_rate": 5e-06, "loss": 0.5579, "num_input_tokens_seen": 257357600, "step": 1489 }, { "epoch": 0.5951239008792966, "loss": 0.8287309408187866, "loss_ce": 0.0077470894902944565, "loss_xval": 0.8203125, "num_input_tokens_seen": 257357600, "step": 1489 }, { "epoch": 0.595523581135092, "grad_norm": 33.80174599093451, "learning_rate": 5e-06, "loss": 0.3899, "num_input_tokens_seen": 257530392, "step": 1490 }, { "epoch": 0.595523581135092, "loss": 0.15278327465057373, "loss_ce": 0.003247134620323777, "loss_xval": 0.1494140625, "num_input_tokens_seen": 257530392, "step": 1490 }, { "epoch": 0.5959232613908872, "grad_norm": 38.413696017178964, "learning_rate": 5e-06, "loss": 0.3459, "num_input_tokens_seen": 257703408, "step": 1491 }, { "epoch": 0.5959232613908872, "loss": 0.3699941635131836, "loss_ce": 0.004729264881461859, "loss_xval": 0.365234375, "num_input_tokens_seen": 257703408, "step": 1491 }, { "epoch": 0.5963229416466826, "grad_norm": 40.28456049137768, "learning_rate": 5e-06, "loss": 0.314, "num_input_tokens_seen": 257876240, "step": 1492 }, { "epoch": 0.5963229416466826, "loss": 0.2647836208343506, "loss_ce": 0.005323182325810194, "loss_xval": 0.259765625, "num_input_tokens_seen": 257876240, "step": 1492 }, { "epoch": 0.596722621902478, "grad_norm": 50.27059541024263, "learning_rate": 5e-06, "loss": 0.4964, "num_input_tokens_seen": 258049304, "step": 1493 }, { "epoch": 0.596722621902478, "loss": 0.45188143849372864, "loss_ce": 0.004188567399978638, "loss_xval": 0.447265625, "num_input_tokens_seen": 258049304, "step": 1493 }, { "epoch": 0.5971223021582733, "grad_norm": 20.64882540231887, "learning_rate": 5e-06, "loss": 0.5905, "num_input_tokens_seen": 258222536, "step": 1494 }, { "epoch": 0.5971223021582733, "loss": 0.8253822326660156, "loss_ce": 0.0019264371367171407, "loss_xval": 0.82421875, "num_input_tokens_seen": 258222536, "step": 1494 }, { "epoch": 0.5975219824140687, "grad_norm": 28.749252671557343, "learning_rate": 5e-06, "loss": 0.4295, "num_input_tokens_seen": 258395280, "step": 1495 }, { "epoch": 0.5975219824140687, "loss": 0.3758857250213623, "loss_ce": 0.002136970404535532, "loss_xval": 0.373046875, "num_input_tokens_seen": 258395280, "step": 1495 }, { "epoch": 0.5979216626698641, "grad_norm": 28.63136497084886, "learning_rate": 5e-06, "loss": 0.333, "num_input_tokens_seen": 258568496, "step": 1496 }, { "epoch": 0.5979216626698641, "loss": 0.31825536489486694, "loss_ce": 0.0036496452521532774, "loss_xval": 0.314453125, "num_input_tokens_seen": 258568496, "step": 1496 }, { "epoch": 0.5983213429256595, "grad_norm": 43.51054271549945, "learning_rate": 5e-06, "loss": 0.32, "num_input_tokens_seen": 258741448, "step": 1497 }, { "epoch": 0.5983213429256595, "loss": 0.26728811860084534, "loss_ce": 0.008239655755460262, "loss_xval": 0.259765625, "num_input_tokens_seen": 258741448, "step": 1497 }, { "epoch": 0.5987210231814548, "grad_norm": 53.32489405720729, "learning_rate": 5e-06, "loss": 0.3977, "num_input_tokens_seen": 258914352, "step": 1498 }, { "epoch": 0.5987210231814548, "loss": 0.23198693990707397, "loss_ce": 0.004524178337305784, "loss_xval": 0.2275390625, "num_input_tokens_seen": 258914352, "step": 1498 }, { "epoch": 0.5991207034372502, "grad_norm": 19.333266417797198, "learning_rate": 5e-06, "loss": 0.3824, "num_input_tokens_seen": 259087304, "step": 1499 }, { "epoch": 0.5991207034372502, "loss": 0.5503689050674438, "loss_ce": 0.01039084792137146, "loss_xval": 0.5390625, "num_input_tokens_seen": 259087304, "step": 1499 }, { "epoch": 0.5995203836930456, "grad_norm": 88.26003375881355, "learning_rate": 5e-06, "loss": 0.5661, "num_input_tokens_seen": 259260416, "step": 1500 }, { "epoch": 0.5995203836930456, "eval_websight_new_IoU": 0.3115440905094147, "eval_websight_new_MAE_all": 0.020920580253005028, "eval_websight_new_MAE_h": 0.017279735766351223, "eval_websight_new_MAE_w": 0.03733859211206436, "eval_websight_new_MAE_x": 0.012794057838618755, "eval_websight_new_MAE_y": 0.016269936691969633, "eval_websight_new_NUM_probability": 0.9875062704086304, "eval_websight_new_inside_bbox": 0.6961805522441864, "eval_websight_new_loss": 0.07518891245126724, "eval_websight_new_loss_ce": 0.001353644474875182, "eval_websight_new_loss_xval": 0.070343017578125, "eval_websight_new_runtime": 58.5335, "eval_websight_new_samples_per_second": 0.854, "eval_websight_new_steps_per_second": 0.034, "num_input_tokens_seen": 259260416, "step": 1500 }, { "epoch": 0.5995203836930456, "eval_seeclick_IoU": 0.27675844728946686, "eval_seeclick_MAE_all": 0.0651068165898323, "eval_seeclick_MAE_h": 0.027694360353052616, "eval_seeclick_MAE_w": 0.08417735807597637, "eval_seeclick_MAE_x": 0.08383799344301224, "eval_seeclick_MAE_y": 0.06471756100654602, "eval_seeclick_NUM_probability": 0.986900806427002, "eval_seeclick_inside_bbox": 0.4756944477558136, "eval_seeclick_loss": 1.40971040725708, "eval_seeclick_loss_ce": 0.013312608003616333, "eval_seeclick_loss_xval": 1.2222900390625, "eval_seeclick_runtime": 81.445, "eval_seeclick_samples_per_second": 0.614, "eval_seeclick_steps_per_second": 0.025, "num_input_tokens_seen": 259260416, "step": 1500 }, { "epoch": 0.5995203836930456, "eval_icons_IoU": 0.10929679498076439, "eval_icons_MAE_all": 0.01885663205757737, "eval_icons_MAE_h": 0.012082248460501432, "eval_icons_MAE_w": 0.009315322153270245, "eval_icons_MAE_x": 0.026283184066414833, "eval_icons_MAE_y": 0.027745775878429413, "eval_icons_NUM_probability": 0.9871878027915955, "eval_icons_inside_bbox": 0.2795138955116272, "eval_icons_loss": 0.10837095975875854, "eval_icons_loss_ce": 0.001372582628391683, "eval_icons_loss_xval": 0.08797454833984375, "eval_icons_runtime": 89.9202, "eval_icons_samples_per_second": 0.556, "eval_icons_steps_per_second": 0.022, "num_input_tokens_seen": 259260416, "step": 1500 } ], "logging_steps": 1.0, "max_steps": 7506, "num_input_tokens_seen": 259260416, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1622802757648384.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }