{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.996258885147775, "eval_steps": 500, "global_step": 3340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014964459408903853, "grad_norm": 0.5450117588043213, "learning_rate": 4.999972352489418e-05, "loss": 1.3208, "step": 5 }, { "epoch": 0.029928918817807706, "grad_norm": 0.5009211301803589, "learning_rate": 4.9998894105691785e-05, "loss": 1.2903, "step": 10 }, { "epoch": 0.04489337822671156, "grad_norm": 0.45117729902267456, "learning_rate": 4.9997511760737915e-05, "loss": 1.2271, "step": 15 }, { "epoch": 0.05985783763561541, "grad_norm": 0.5625576376914978, "learning_rate": 4.999557652060729e-05, "loss": 1.186, "step": 20 }, { "epoch": 0.07482229704451927, "grad_norm": 0.519900381565094, "learning_rate": 4.999308842810357e-05, "loss": 1.1302, "step": 25 }, { "epoch": 0.08978675645342311, "grad_norm": 0.593724250793457, "learning_rate": 4.999004753825842e-05, "loss": 1.1372, "step": 30 }, { "epoch": 0.10475121586232697, "grad_norm": 0.663527250289917, "learning_rate": 4.998645391833024e-05, "loss": 1.0359, "step": 35 }, { "epoch": 0.11971567527123082, "grad_norm": 0.6744909286499023, "learning_rate": 4.9982307647802765e-05, "loss": 1.0511, "step": 40 }, { "epoch": 0.13468013468013468, "grad_norm": 0.5474820137023926, "learning_rate": 4.9977608818383226e-05, "loss": 0.9909, "step": 45 }, { "epoch": 0.14964459408903855, "grad_norm": 0.5713778734207153, "learning_rate": 4.9972357534000394e-05, "loss": 1.0139, "step": 50 }, { "epoch": 0.1646090534979424, "grad_norm": 0.6148533225059509, "learning_rate": 4.99665539108022e-05, "loss": 1.0156, "step": 55 }, { "epoch": 0.17957351290684623, "grad_norm": 0.6597582697868347, "learning_rate": 4.996019807715324e-05, "loss": 0.995, "step": 60 }, { "epoch": 0.1945379723157501, "grad_norm": 0.6145315170288086, "learning_rate": 4.9953290173631896e-05, "loss": 0.9641, "step": 65 }, { "epoch": 0.20950243172465394, "grad_norm": 0.7690613865852356, "learning_rate": 4.994583035302723e-05, "loss": 0.9934, "step": 70 }, { "epoch": 0.2244668911335578, "grad_norm": 0.7555385828018188, "learning_rate": 4.9937818780335646e-05, "loss": 0.946, "step": 75 }, { "epoch": 0.23943135054246165, "grad_norm": 0.7178649306297302, "learning_rate": 4.992925563275714e-05, "loss": 0.91, "step": 80 }, { "epoch": 0.2543958099513655, "grad_norm": 0.7035357356071472, "learning_rate": 4.99201410996915e-05, "loss": 0.9612, "step": 85 }, { "epoch": 0.26936026936026936, "grad_norm": 0.7382842898368835, "learning_rate": 4.9910475382734034e-05, "loss": 0.8687, "step": 90 }, { "epoch": 0.2843247287691732, "grad_norm": 0.8056157231330872, "learning_rate": 4.990025869567117e-05, "loss": 0.9038, "step": 95 }, { "epoch": 0.2992891881780771, "grad_norm": 0.9074241518974304, "learning_rate": 4.988949126447567e-05, "loss": 0.9063, "step": 100 }, { "epoch": 0.31425364758698093, "grad_norm": 0.7684347033500671, "learning_rate": 4.987817332730166e-05, "loss": 0.9065, "step": 105 }, { "epoch": 0.3292181069958848, "grad_norm": 0.8751540184020996, "learning_rate": 4.986630513447938e-05, "loss": 0.9492, "step": 110 }, { "epoch": 0.3441825664047886, "grad_norm": 0.8586133718490601, "learning_rate": 4.985388694850963e-05, "loss": 0.9085, "step": 115 }, { "epoch": 0.35914702581369246, "grad_norm": 0.7148111462593079, "learning_rate": 4.984091904405793e-05, "loss": 0.9125, "step": 120 }, { "epoch": 0.37411148522259635, "grad_norm": 0.7763463258743286, "learning_rate": 4.9827401707948504e-05, "loss": 0.9019, "step": 125 }, { "epoch": 0.3890759446315002, "grad_norm": 0.9074414372444153, "learning_rate": 4.981333523915792e-05, "loss": 0.8188, "step": 130 }, { "epoch": 0.40404040404040403, "grad_norm": 1.0170953273773193, "learning_rate": 4.979871994880845e-05, "loss": 0.8757, "step": 135 }, { "epoch": 0.4190048634493079, "grad_norm": 0.8414100408554077, "learning_rate": 4.97835561601612e-05, "loss": 0.8711, "step": 140 }, { "epoch": 0.43396932285821177, "grad_norm": 0.9481101036071777, "learning_rate": 4.9767844208608984e-05, "loss": 0.8371, "step": 145 }, { "epoch": 0.4489337822671156, "grad_norm": 0.8936446905136108, "learning_rate": 4.9751584441668874e-05, "loss": 0.8282, "step": 150 }, { "epoch": 0.46389824167601945, "grad_norm": 0.906244695186615, "learning_rate": 4.973477721897454e-05, "loss": 0.8702, "step": 155 }, { "epoch": 0.4788627010849233, "grad_norm": 0.9465859532356262, "learning_rate": 4.971742291226827e-05, "loss": 0.8779, "step": 160 }, { "epoch": 0.49382716049382713, "grad_norm": 0.752061128616333, "learning_rate": 4.969952190539276e-05, "loss": 0.8855, "step": 165 }, { "epoch": 0.508791619902731, "grad_norm": 0.9224424958229065, "learning_rate": 4.968107459428265e-05, "loss": 0.8211, "step": 170 }, { "epoch": 0.5237560793116348, "grad_norm": 0.8061802387237549, "learning_rate": 4.9662081386955714e-05, "loss": 0.84, "step": 175 }, { "epoch": 0.5387205387205387, "grad_norm": 1.4521487951278687, "learning_rate": 4.964254270350387e-05, "loss": 0.8529, "step": 180 }, { "epoch": 0.5536849981294426, "grad_norm": 0.807750403881073, "learning_rate": 4.9622458976083885e-05, "loss": 0.8891, "step": 185 }, { "epoch": 0.5686494575383464, "grad_norm": 0.9400819540023804, "learning_rate": 4.960183064890782e-05, "loss": 0.8705, "step": 190 }, { "epoch": 0.5836139169472503, "grad_norm": 0.8715450763702393, "learning_rate": 4.958065817823318e-05, "loss": 0.8671, "step": 195 }, { "epoch": 0.5985783763561542, "grad_norm": 0.9633534550666809, "learning_rate": 4.955894203235284e-05, "loss": 0.8379, "step": 200 }, { "epoch": 0.613542835765058, "grad_norm": 0.8975620865821838, "learning_rate": 4.953668269158472e-05, "loss": 0.8086, "step": 205 }, { "epoch": 0.6285072951739619, "grad_norm": 0.904045045375824, "learning_rate": 4.9513880648261114e-05, "loss": 0.8183, "step": 210 }, { "epoch": 0.6434717545828657, "grad_norm": 1.009617805480957, "learning_rate": 4.949053640671778e-05, "loss": 0.8557, "step": 215 }, { "epoch": 0.6584362139917695, "grad_norm": 1.1475061178207397, "learning_rate": 4.946665048328287e-05, "loss": 0.8809, "step": 220 }, { "epoch": 0.6734006734006734, "grad_norm": 1.087480068206787, "learning_rate": 4.944222340626543e-05, "loss": 0.7887, "step": 225 }, { "epoch": 0.6883651328095772, "grad_norm": 0.9593074321746826, "learning_rate": 4.9417255715943766e-05, "loss": 0.8965, "step": 230 }, { "epoch": 0.7033295922184811, "grad_norm": 1.103148102760315, "learning_rate": 4.939174796455346e-05, "loss": 0.9189, "step": 235 }, { "epoch": 0.7182940516273849, "grad_norm": 0.9249380826950073, "learning_rate": 4.936570071627518e-05, "loss": 0.8793, "step": 240 }, { "epoch": 0.7332585110362888, "grad_norm": 1.03147554397583, "learning_rate": 4.933911454722217e-05, "loss": 0.8052, "step": 245 }, { "epoch": 0.7482229704451927, "grad_norm": 1.0135599374771118, "learning_rate": 4.9311990045427553e-05, "loss": 0.8033, "step": 250 }, { "epoch": 0.7631874298540965, "grad_norm": 1.0295403003692627, "learning_rate": 4.928432781083128e-05, "loss": 0.9045, "step": 255 }, { "epoch": 0.7781518892630004, "grad_norm": 0.9905064105987549, "learning_rate": 4.92561284552669e-05, "loss": 0.8486, "step": 260 }, { "epoch": 0.7931163486719043, "grad_norm": 0.9656111598014832, "learning_rate": 4.9227392602447996e-05, "loss": 0.8324, "step": 265 }, { "epoch": 0.8080808080808081, "grad_norm": 0.9249575734138489, "learning_rate": 4.91981208879544e-05, "loss": 0.8172, "step": 270 }, { "epoch": 0.823045267489712, "grad_norm": 0.904988706111908, "learning_rate": 4.9168313959218135e-05, "loss": 0.8258, "step": 275 }, { "epoch": 0.8380097268986157, "grad_norm": 1.060915231704712, "learning_rate": 4.913797247550912e-05, "loss": 0.867, "step": 280 }, { "epoch": 0.8529741863075196, "grad_norm": 1.017268180847168, "learning_rate": 4.910709710792054e-05, "loss": 0.7974, "step": 285 }, { "epoch": 0.8679386457164235, "grad_norm": 1.0362051725387573, "learning_rate": 4.9075688539354025e-05, "loss": 0.8596, "step": 290 }, { "epoch": 0.8829031051253273, "grad_norm": 0.9945353269577026, "learning_rate": 4.904374746450459e-05, "loss": 0.8076, "step": 295 }, { "epoch": 0.8978675645342312, "grad_norm": 0.986596941947937, "learning_rate": 4.901127458984516e-05, "loss": 0.8126, "step": 300 }, { "epoch": 0.912832023943135, "grad_norm": 1.016927719116211, "learning_rate": 4.8978270633611086e-05, "loss": 0.817, "step": 305 }, { "epoch": 0.9277964833520389, "grad_norm": 1.0122638940811157, "learning_rate": 4.8944736325784136e-05, "loss": 0.9226, "step": 310 }, { "epoch": 0.9427609427609428, "grad_norm": 1.04526948928833, "learning_rate": 4.891067240807641e-05, "loss": 0.7878, "step": 315 }, { "epoch": 0.9577254021698466, "grad_norm": 0.8837614059448242, "learning_rate": 4.887607963391394e-05, "loss": 0.8187, "step": 320 }, { "epoch": 0.9726898615787505, "grad_norm": 1.1496906280517578, "learning_rate": 4.884095876841999e-05, "loss": 0.8531, "step": 325 }, { "epoch": 0.9876543209876543, "grad_norm": 0.9902486205101013, "learning_rate": 4.880531058839816e-05, "loss": 0.7615, "step": 330 }, { "epoch": 1.0026187803965583, "grad_norm": 1.0682637691497803, "learning_rate": 4.87691358823152e-05, "loss": 0.8132, "step": 335 }, { "epoch": 1.017583239805462, "grad_norm": 1.0381455421447754, "learning_rate": 4.8732435450283565e-05, "loss": 0.7877, "step": 340 }, { "epoch": 1.0325476992143658, "grad_norm": 0.9033912420272827, "learning_rate": 4.869521010404373e-05, "loss": 0.7892, "step": 345 }, { "epoch": 1.0475121586232696, "grad_norm": 0.9539169669151306, "learning_rate": 4.86574606669462e-05, "loss": 0.8192, "step": 350 }, { "epoch": 1.0624766180321736, "grad_norm": 0.8774738907814026, "learning_rate": 4.861918797393336e-05, "loss": 0.753, "step": 355 }, { "epoch": 1.0774410774410774, "grad_norm": 1.1601481437683105, "learning_rate": 4.8580392871520946e-05, "loss": 0.8113, "step": 360 }, { "epoch": 1.0924055368499812, "grad_norm": 0.9350395202636719, "learning_rate": 4.854107621777938e-05, "loss": 0.8731, "step": 365 }, { "epoch": 1.1073699962588852, "grad_norm": 1.0182013511657715, "learning_rate": 4.8501238882314715e-05, "loss": 0.8649, "step": 370 }, { "epoch": 1.122334455667789, "grad_norm": 1.158241629600525, "learning_rate": 4.84608817462495e-05, "loss": 0.82, "step": 375 }, { "epoch": 1.1372989150766928, "grad_norm": 1.0360873937606812, "learning_rate": 4.8420005702203196e-05, "loss": 0.8236, "step": 380 }, { "epoch": 1.1522633744855968, "grad_norm": 1.1079692840576172, "learning_rate": 4.83786116542725e-05, "loss": 0.7584, "step": 385 }, { "epoch": 1.1672278338945006, "grad_norm": 1.0275564193725586, "learning_rate": 4.833670051801131e-05, "loss": 0.7847, "step": 390 }, { "epoch": 1.1821922933034044, "grad_norm": 1.1246060132980347, "learning_rate": 4.829427322041049e-05, "loss": 0.7597, "step": 395 }, { "epoch": 1.1971567527123081, "grad_norm": 1.0290372371673584, "learning_rate": 4.825133069987737e-05, "loss": 0.733, "step": 400 }, { "epoch": 1.2121212121212122, "grad_norm": 1.1085125207901, "learning_rate": 4.820787390621499e-05, "loss": 0.7729, "step": 405 }, { "epoch": 1.227085671530116, "grad_norm": 1.2001007795333862, "learning_rate": 4.816390380060108e-05, "loss": 0.769, "step": 410 }, { "epoch": 1.24205013093902, "grad_norm": 1.1357756853103638, "learning_rate": 4.8119421355566796e-05, "loss": 0.8017, "step": 415 }, { "epoch": 1.2570145903479237, "grad_norm": 1.0524897575378418, "learning_rate": 4.807442755497524e-05, "loss": 0.7916, "step": 420 }, { "epoch": 1.2719790497568275, "grad_norm": 1.1913483142852783, "learning_rate": 4.802892339399967e-05, "loss": 0.8058, "step": 425 }, { "epoch": 1.2869435091657313, "grad_norm": 1.2256290912628174, "learning_rate": 4.7982909879101515e-05, "loss": 0.8267, "step": 430 }, { "epoch": 1.3019079685746353, "grad_norm": 1.0073180198669434, "learning_rate": 4.7936388028008084e-05, "loss": 0.8316, "step": 435 }, { "epoch": 1.316872427983539, "grad_norm": 1.030885934829712, "learning_rate": 4.7889358869690056e-05, "loss": 0.7874, "step": 440 }, { "epoch": 1.3318368873924429, "grad_norm": 1.1515103578567505, "learning_rate": 4.784182344433878e-05, "loss": 0.8268, "step": 445 }, { "epoch": 1.3468013468013469, "grad_norm": 0.8818157315254211, "learning_rate": 4.779378280334318e-05, "loss": 0.8366, "step": 450 }, { "epoch": 1.3617658062102507, "grad_norm": 0.9439221024513245, "learning_rate": 4.7745238009266556e-05, "loss": 0.8279, "step": 455 }, { "epoch": 1.3767302656191545, "grad_norm": 0.9476526379585266, "learning_rate": 4.7696190135823094e-05, "loss": 0.807, "step": 460 }, { "epoch": 1.3916947250280582, "grad_norm": 0.9290974736213684, "learning_rate": 4.764664026785405e-05, "loss": 0.8259, "step": 465 }, { "epoch": 1.4066591844369623, "grad_norm": 1.007653832435608, "learning_rate": 4.759658950130385e-05, "loss": 0.8344, "step": 470 }, { "epoch": 1.421623643845866, "grad_norm": 1.1456594467163086, "learning_rate": 4.7546038943195736e-05, "loss": 0.7565, "step": 475 }, { "epoch": 1.43658810325477, "grad_norm": 1.0801887512207031, "learning_rate": 4.749498971160742e-05, "loss": 0.7771, "step": 480 }, { "epoch": 1.4515525626636738, "grad_norm": 0.9851332306861877, "learning_rate": 4.744344293564621e-05, "loss": 0.7803, "step": 485 }, { "epoch": 1.4665170220725776, "grad_norm": 1.1050950288772583, "learning_rate": 4.739139975542415e-05, "loss": 0.8118, "step": 490 }, { "epoch": 1.4814814814814814, "grad_norm": 1.030402421951294, "learning_rate": 4.7338861322032726e-05, "loss": 0.849, "step": 495 }, { "epoch": 1.4964459408903854, "grad_norm": 0.9448444843292236, "learning_rate": 4.7285828797517465e-05, "loss": 0.7255, "step": 500 }, { "epoch": 1.5114104002992892, "grad_norm": 1.0439302921295166, "learning_rate": 4.723230335485218e-05, "loss": 0.7413, "step": 505 }, { "epoch": 1.5263748597081932, "grad_norm": 1.234944462776184, "learning_rate": 4.717828617791308e-05, "loss": 0.7648, "step": 510 }, { "epoch": 1.541339319117097, "grad_norm": 1.1755177974700928, "learning_rate": 4.7123778461452536e-05, "loss": 0.7203, "step": 515 }, { "epoch": 1.5563037785260008, "grad_norm": 0.9818833470344543, "learning_rate": 4.7068781411072686e-05, "loss": 0.7813, "step": 520 }, { "epoch": 1.5712682379349046, "grad_norm": 1.286037802696228, "learning_rate": 4.7013296243198746e-05, "loss": 0.8098, "step": 525 }, { "epoch": 1.5862326973438083, "grad_norm": 1.2840811014175415, "learning_rate": 4.695732418505214e-05, "loss": 0.7752, "step": 530 }, { "epoch": 1.6011971567527123, "grad_norm": 0.9121096730232239, "learning_rate": 4.690086647462331e-05, "loss": 0.8124, "step": 535 }, { "epoch": 1.6161616161616161, "grad_norm": 1.1097527742385864, "learning_rate": 4.684392436064439e-05, "loss": 0.8453, "step": 540 }, { "epoch": 1.6311260755705201, "grad_norm": 1.0087509155273438, "learning_rate": 4.678649910256152e-05, "loss": 0.7736, "step": 545 }, { "epoch": 1.646090534979424, "grad_norm": 1.028320074081421, "learning_rate": 4.6728591970507055e-05, "loss": 0.8248, "step": 550 }, { "epoch": 1.6610549943883277, "grad_norm": 1.2182048559188843, "learning_rate": 4.6670204245271444e-05, "loss": 0.7903, "step": 555 }, { "epoch": 1.6760194537972315, "grad_norm": 1.0399212837219238, "learning_rate": 4.661133721827486e-05, "loss": 0.7495, "step": 560 }, { "epoch": 1.6909839132061353, "grad_norm": 1.0186119079589844, "learning_rate": 4.655199219153873e-05, "loss": 0.76, "step": 565 }, { "epoch": 1.7059483726150393, "grad_norm": 1.2510963678359985, "learning_rate": 4.649217047765685e-05, "loss": 0.7618, "step": 570 }, { "epoch": 1.7209128320239433, "grad_norm": 1.3667418956756592, "learning_rate": 4.643187339976639e-05, "loss": 0.8169, "step": 575 }, { "epoch": 1.735877291432847, "grad_norm": 1.1502622365951538, "learning_rate": 4.637110229151863e-05, "loss": 0.8384, "step": 580 }, { "epoch": 1.7508417508417509, "grad_norm": 0.9805833697319031, "learning_rate": 4.6309858497049464e-05, "loss": 0.757, "step": 585 }, { "epoch": 1.7658062102506547, "grad_norm": 1.111222743988037, "learning_rate": 4.6248143370949636e-05, "loss": 0.8712, "step": 590 }, { "epoch": 1.7807706696595584, "grad_norm": 1.0927162170410156, "learning_rate": 4.618595827823486e-05, "loss": 0.8009, "step": 595 }, { "epoch": 1.7957351290684624, "grad_norm": 1.218826174736023, "learning_rate": 4.612330459431552e-05, "loss": 0.8323, "step": 600 }, { "epoch": 1.8106995884773662, "grad_norm": 1.233129858970642, "learning_rate": 4.606018370496633e-05, "loss": 0.7373, "step": 605 }, { "epoch": 1.8256640478862702, "grad_norm": 0.9207433462142944, "learning_rate": 4.5996597006295655e-05, "loss": 0.7533, "step": 610 }, { "epoch": 1.840628507295174, "grad_norm": 1.1917186975479126, "learning_rate": 4.593254590471464e-05, "loss": 0.7831, "step": 615 }, { "epoch": 1.8555929667040778, "grad_norm": 0.9462292194366455, "learning_rate": 4.586803181690609e-05, "loss": 0.7733, "step": 620 }, { "epoch": 1.8705574261129816, "grad_norm": 0.9503220915794373, "learning_rate": 4.580305616979314e-05, "loss": 0.8178, "step": 625 }, { "epoch": 1.8855218855218854, "grad_norm": 0.8902071118354797, "learning_rate": 4.573762040050772e-05, "loss": 0.8028, "step": 630 }, { "epoch": 1.9004863449307894, "grad_norm": 1.1471889019012451, "learning_rate": 4.567172595635871e-05, "loss": 0.7499, "step": 635 }, { "epoch": 1.9154508043396934, "grad_norm": 1.063602328300476, "learning_rate": 4.560537429479998e-05, "loss": 0.7516, "step": 640 }, { "epoch": 1.9304152637485972, "grad_norm": 0.9486026763916016, "learning_rate": 4.553856688339817e-05, "loss": 0.7598, "step": 645 }, { "epoch": 1.945379723157501, "grad_norm": 0.9863812923431396, "learning_rate": 4.547130519980014e-05, "loss": 0.8039, "step": 650 }, { "epoch": 1.9603441825664047, "grad_norm": 1.0052098035812378, "learning_rate": 4.54035907317004e-05, "loss": 0.7574, "step": 655 }, { "epoch": 1.9753086419753085, "grad_norm": 1.0039119720458984, "learning_rate": 4.533542497680812e-05, "loss": 0.7594, "step": 660 }, { "epoch": 1.9902731013842125, "grad_norm": 1.0683871507644653, "learning_rate": 4.5266809442814035e-05, "loss": 0.7489, "step": 665 }, { "epoch": 2.0052375607931165, "grad_norm": 0.964434802532196, "learning_rate": 4.519774564735711e-05, "loss": 0.7376, "step": 670 }, { "epoch": 2.0202020202020203, "grad_norm": 1.0128512382507324, "learning_rate": 4.512823511799098e-05, "loss": 0.7275, "step": 675 }, { "epoch": 2.035166479610924, "grad_norm": 1.0383211374282837, "learning_rate": 4.5058279392150096e-05, "loss": 0.749, "step": 680 }, { "epoch": 2.050130939019828, "grad_norm": 0.9630417227745056, "learning_rate": 4.4987880017115793e-05, "loss": 0.7563, "step": 685 }, { "epoch": 2.0650953984287317, "grad_norm": 1.2037107944488525, "learning_rate": 4.491703854998207e-05, "loss": 0.7426, "step": 690 }, { "epoch": 2.0800598578376355, "grad_norm": 1.144274353981018, "learning_rate": 4.484575655762107e-05, "loss": 0.7323, "step": 695 }, { "epoch": 2.0950243172465393, "grad_norm": 1.2053053379058838, "learning_rate": 4.477403561664852e-05, "loss": 0.7474, "step": 700 }, { "epoch": 2.1099887766554435, "grad_norm": 1.039339542388916, "learning_rate": 4.4701877313388784e-05, "loss": 0.766, "step": 705 }, { "epoch": 2.1249532360643473, "grad_norm": 1.0573480129241943, "learning_rate": 4.462928324383985e-05, "loss": 0.8314, "step": 710 }, { "epoch": 2.139917695473251, "grad_norm": 1.2173911333084106, "learning_rate": 4.455625501363794e-05, "loss": 0.7388, "step": 715 }, { "epoch": 2.154882154882155, "grad_norm": 1.1535879373550415, "learning_rate": 4.448279423802207e-05, "loss": 0.7698, "step": 720 }, { "epoch": 2.1698466142910586, "grad_norm": 0.985844075679779, "learning_rate": 4.44089025417983e-05, "loss": 0.7692, "step": 725 }, { "epoch": 2.1848110736999624, "grad_norm": 1.0249568223953247, "learning_rate": 4.43345815593038e-05, "loss": 0.7261, "step": 730 }, { "epoch": 2.1997755331088666, "grad_norm": 1.305114507675171, "learning_rate": 4.425983293437069e-05, "loss": 0.8001, "step": 735 }, { "epoch": 2.2147399925177704, "grad_norm": 1.1478573083877563, "learning_rate": 4.4184658320289675e-05, "loss": 0.8036, "step": 740 }, { "epoch": 2.229704451926674, "grad_norm": 0.9925107359886169, "learning_rate": 4.410905937977352e-05, "loss": 0.7775, "step": 745 }, { "epoch": 2.244668911335578, "grad_norm": 1.0552047491073608, "learning_rate": 4.403303778492022e-05, "loss": 0.7449, "step": 750 }, { "epoch": 2.259633370744482, "grad_norm": 1.0449482202529907, "learning_rate": 4.395659521717607e-05, "loss": 0.7197, "step": 755 }, { "epoch": 2.2745978301533856, "grad_norm": 1.0985392332077026, "learning_rate": 4.3879733367298405e-05, "loss": 0.7691, "step": 760 }, { "epoch": 2.28956228956229, "grad_norm": 1.1123350858688354, "learning_rate": 4.3802453935318294e-05, "loss": 0.7501, "step": 765 }, { "epoch": 2.3045267489711936, "grad_norm": 1.2243092060089111, "learning_rate": 4.372475863050286e-05, "loss": 0.7606, "step": 770 }, { "epoch": 2.3194912083800974, "grad_norm": 0.9873678088188171, "learning_rate": 4.364664917131751e-05, "loss": 0.7605, "step": 775 }, { "epoch": 2.334455667789001, "grad_norm": 1.1722878217697144, "learning_rate": 4.3568127285387925e-05, "loss": 0.7186, "step": 780 }, { "epoch": 2.349420127197905, "grad_norm": 1.1724722385406494, "learning_rate": 4.348919470946185e-05, "loss": 0.7614, "step": 785 }, { "epoch": 2.3643845866068087, "grad_norm": 1.0196776390075684, "learning_rate": 4.340985318937066e-05, "loss": 0.7537, "step": 790 }, { "epoch": 2.3793490460157125, "grad_norm": 1.0410211086273193, "learning_rate": 4.333010447999077e-05, "loss": 0.7246, "step": 795 }, { "epoch": 2.3943135054246163, "grad_norm": 1.0125558376312256, "learning_rate": 4.3249950345204806e-05, "loss": 0.7561, "step": 800 }, { "epoch": 2.4092779648335205, "grad_norm": 0.9731377363204956, "learning_rate": 4.31693925578626e-05, "loss": 0.7418, "step": 805 }, { "epoch": 2.4242424242424243, "grad_norm": 1.1109338998794556, "learning_rate": 4.3088432899741985e-05, "loss": 0.6956, "step": 810 }, { "epoch": 2.439206883651328, "grad_norm": 1.4336915016174316, "learning_rate": 4.3007073161509345e-05, "loss": 0.7715, "step": 815 }, { "epoch": 2.454171343060232, "grad_norm": 1.0564075708389282, "learning_rate": 4.292531514268008e-05, "loss": 0.7782, "step": 820 }, { "epoch": 2.4691358024691357, "grad_norm": 1.1472092866897583, "learning_rate": 4.2843160651578726e-05, "loss": 0.7125, "step": 825 }, { "epoch": 2.48410026187804, "grad_norm": 0.8819020390510559, "learning_rate": 4.276061150529903e-05, "loss": 0.7647, "step": 830 }, { "epoch": 2.4990647212869437, "grad_norm": 1.1399636268615723, "learning_rate": 4.267766952966369e-05, "loss": 0.7327, "step": 835 }, { "epoch": 2.5140291806958475, "grad_norm": 1.3246550559997559, "learning_rate": 4.259433655918404e-05, "loss": 0.7553, "step": 840 }, { "epoch": 2.5289936401047513, "grad_norm": 1.1041203737258911, "learning_rate": 4.2510614437019416e-05, "loss": 0.7685, "step": 845 }, { "epoch": 2.543958099513655, "grad_norm": 1.099228024482727, "learning_rate": 4.242650501493642e-05, "loss": 0.7207, "step": 850 }, { "epoch": 2.558922558922559, "grad_norm": 1.140915036201477, "learning_rate": 4.2342010153267986e-05, "loss": 0.7253, "step": 855 }, { "epoch": 2.5738870183314626, "grad_norm": 1.020257830619812, "learning_rate": 4.2257131720872164e-05, "loss": 0.8055, "step": 860 }, { "epoch": 2.5888514777403664, "grad_norm": 1.1489384174346924, "learning_rate": 4.2171871595090826e-05, "loss": 0.7747, "step": 865 }, { "epoch": 2.6038159371492706, "grad_norm": 1.0359724760055542, "learning_rate": 4.2086231661708185e-05, "loss": 0.7525, "step": 870 }, { "epoch": 2.6187803965581744, "grad_norm": 1.1845808029174805, "learning_rate": 4.200021381490899e-05, "loss": 0.7259, "step": 875 }, { "epoch": 2.633744855967078, "grad_norm": 1.0849671363830566, "learning_rate": 4.191381995723672e-05, "loss": 0.7267, "step": 880 }, { "epoch": 2.648709315375982, "grad_norm": 1.1475163698196411, "learning_rate": 4.182705199955144e-05, "loss": 0.7862, "step": 885 }, { "epoch": 2.6636737747848858, "grad_norm": 1.1838606595993042, "learning_rate": 4.173991186098757e-05, "loss": 0.8079, "step": 890 }, { "epoch": 2.67863823419379, "grad_norm": 0.9914394021034241, "learning_rate": 4.165240146891145e-05, "loss": 0.7319, "step": 895 }, { "epoch": 2.6936026936026938, "grad_norm": 1.149774193763733, "learning_rate": 4.1564522758878656e-05, "loss": 0.7478, "step": 900 }, { "epoch": 2.7085671530115976, "grad_norm": 1.1517828702926636, "learning_rate": 4.147627767459124e-05, "loss": 0.8038, "step": 905 }, { "epoch": 2.7235316124205013, "grad_norm": 1.1382722854614258, "learning_rate": 4.138766816785474e-05, "loss": 0.7596, "step": 910 }, { "epoch": 2.738496071829405, "grad_norm": 0.9787604808807373, "learning_rate": 4.1298696198534955e-05, "loss": 0.6991, "step": 915 }, { "epoch": 2.753460531238309, "grad_norm": 1.0162303447723389, "learning_rate": 4.1209363734514674e-05, "loss": 0.7014, "step": 920 }, { "epoch": 2.7684249906472127, "grad_norm": 1.106070637702942, "learning_rate": 4.1119672751650074e-05, "loss": 0.8249, "step": 925 }, { "epoch": 2.7833894500561165, "grad_norm": 1.2757290601730347, "learning_rate": 4.102962523372709e-05, "loss": 0.7091, "step": 930 }, { "epoch": 2.7983539094650207, "grad_norm": 1.107681155204773, "learning_rate": 4.093922317241748e-05, "loss": 0.8038, "step": 935 }, { "epoch": 2.8133183688739245, "grad_norm": 1.2649710178375244, "learning_rate": 4.0848468567234796e-05, "loss": 0.7707, "step": 940 }, { "epoch": 2.8282828282828283, "grad_norm": 1.0683587789535522, "learning_rate": 4.075736342549018e-05, "loss": 0.7483, "step": 945 }, { "epoch": 2.843247287691732, "grad_norm": 1.1959580183029175, "learning_rate": 4.066590976224791e-05, "loss": 0.7838, "step": 950 }, { "epoch": 2.858211747100636, "grad_norm": 1.1794955730438232, "learning_rate": 4.0574109600280886e-05, "loss": 0.7758, "step": 955 }, { "epoch": 2.87317620650954, "grad_norm": 1.0079686641693115, "learning_rate": 4.048196497002588e-05, "loss": 0.7591, "step": 960 }, { "epoch": 2.888140665918444, "grad_norm": 1.265084981918335, "learning_rate": 4.038947790953859e-05, "loss": 0.7012, "step": 965 }, { "epoch": 2.9031051253273477, "grad_norm": 1.0062329769134521, "learning_rate": 4.0296650464448616e-05, "loss": 0.8008, "step": 970 }, { "epoch": 2.9180695847362514, "grad_norm": 1.1500215530395508, "learning_rate": 4.020348468791416e-05, "loss": 0.7492, "step": 975 }, { "epoch": 2.9330340441451552, "grad_norm": 1.2765411138534546, "learning_rate": 4.0109982640576674e-05, "loss": 0.7736, "step": 980 }, { "epoch": 2.947998503554059, "grad_norm": 1.0655264854431152, "learning_rate": 4.001614639051521e-05, "loss": 0.7198, "step": 985 }, { "epoch": 2.962962962962963, "grad_norm": 1.269967794418335, "learning_rate": 3.9921978013200766e-05, "loss": 0.7513, "step": 990 }, { "epoch": 2.9779274223718666, "grad_norm": 1.0883420705795288, "learning_rate": 3.98274795914503e-05, "loss": 0.804, "step": 995 }, { "epoch": 2.992891881780771, "grad_norm": 1.0706652402877808, "learning_rate": 3.973265321538069e-05, "loss": 0.6987, "step": 1000 }, { "epoch": 3.0078563411896746, "grad_norm": 1.1653481721878052, "learning_rate": 3.963750098236253e-05, "loss": 0.8132, "step": 1005 }, { "epoch": 3.0228208005985784, "grad_norm": 1.1991537809371948, "learning_rate": 3.954202499697373e-05, "loss": 0.7291, "step": 1010 }, { "epoch": 3.037785260007482, "grad_norm": 1.0241738557815552, "learning_rate": 3.944622737095294e-05, "loss": 0.7181, "step": 1015 }, { "epoch": 3.052749719416386, "grad_norm": 1.00551438331604, "learning_rate": 3.9350110223152844e-05, "loss": 0.732, "step": 1020 }, { "epoch": 3.0677141788252897, "grad_norm": 1.3171230554580688, "learning_rate": 3.925367567949335e-05, "loss": 0.8267, "step": 1025 }, { "epoch": 3.082678638234194, "grad_norm": 1.0425944328308105, "learning_rate": 3.9156925872914506e-05, "loss": 0.6677, "step": 1030 }, { "epoch": 3.0976430976430978, "grad_norm": 1.0785578489303589, "learning_rate": 3.905986294332935e-05, "loss": 0.7701, "step": 1035 }, { "epoch": 3.1126075570520015, "grad_norm": 1.089606523513794, "learning_rate": 3.8962489037576586e-05, "loss": 0.6776, "step": 1040 }, { "epoch": 3.1275720164609053, "grad_norm": 1.234337329864502, "learning_rate": 3.8864806309373076e-05, "loss": 0.7917, "step": 1045 }, { "epoch": 3.142536475869809, "grad_norm": 1.1076655387878418, "learning_rate": 3.876681691926624e-05, "loss": 0.7032, "step": 1050 }, { "epoch": 3.157500935278713, "grad_norm": 1.0013830661773682, "learning_rate": 3.866852303458623e-05, "loss": 0.7442, "step": 1055 }, { "epoch": 3.1724653946876167, "grad_norm": 1.2173100709915161, "learning_rate": 3.856992682939803e-05, "loss": 0.6668, "step": 1060 }, { "epoch": 3.187429854096521, "grad_norm": 1.1837962865829468, "learning_rate": 3.847103048445333e-05, "loss": 0.7408, "step": 1065 }, { "epoch": 3.2023943135054247, "grad_norm": 1.1347203254699707, "learning_rate": 3.837183618714233e-05, "loss": 0.7615, "step": 1070 }, { "epoch": 3.2173587729143285, "grad_norm": 1.1322319507598877, "learning_rate": 3.827234613144533e-05, "loss": 0.6853, "step": 1075 }, { "epoch": 3.2323232323232323, "grad_norm": 1.24197256565094, "learning_rate": 3.817256251788425e-05, "loss": 0.6563, "step": 1080 }, { "epoch": 3.247287691732136, "grad_norm": 1.1162407398223877, "learning_rate": 3.807248755347387e-05, "loss": 0.7744, "step": 1085 }, { "epoch": 3.2622521511410403, "grad_norm": 1.0863921642303467, "learning_rate": 3.79721234516731e-05, "loss": 0.7257, "step": 1090 }, { "epoch": 3.277216610549944, "grad_norm": 1.1099414825439453, "learning_rate": 3.787147243233602e-05, "loss": 0.7711, "step": 1095 }, { "epoch": 3.292181069958848, "grad_norm": 1.3686941862106323, "learning_rate": 3.77705367216627e-05, "loss": 0.7528, "step": 1100 }, { "epoch": 3.3071455293677516, "grad_norm": 1.1821480989456177, "learning_rate": 3.766931855215006e-05, "loss": 0.7642, "step": 1105 }, { "epoch": 3.3221099887766554, "grad_norm": 1.0392365455627441, "learning_rate": 3.756782016254242e-05, "loss": 0.7566, "step": 1110 }, { "epoch": 3.337074448185559, "grad_norm": 1.1076756715774536, "learning_rate": 3.746604379778203e-05, "loss": 0.6818, "step": 1115 }, { "epoch": 3.352038907594463, "grad_norm": 1.2123860120773315, "learning_rate": 3.7363991708959386e-05, "loss": 0.7248, "step": 1120 }, { "epoch": 3.3670033670033668, "grad_norm": 1.1529157161712646, "learning_rate": 3.726166615326344e-05, "loss": 0.7569, "step": 1125 }, { "epoch": 3.381967826412271, "grad_norm": 1.0874592065811157, "learning_rate": 3.715906939393172e-05, "loss": 0.7775, "step": 1130 }, { "epoch": 3.396932285821175, "grad_norm": 1.1039067506790161, "learning_rate": 3.70562037002002e-05, "loss": 0.7637, "step": 1135 }, { "epoch": 3.4118967452300786, "grad_norm": 1.1319911479949951, "learning_rate": 3.695307134725317e-05, "loss": 0.7701, "step": 1140 }, { "epoch": 3.4268612046389824, "grad_norm": 1.3674046993255615, "learning_rate": 3.684967461617289e-05, "loss": 0.7202, "step": 1145 }, { "epoch": 3.441825664047886, "grad_norm": 1.214239478111267, "learning_rate": 3.674601579388913e-05, "loss": 0.736, "step": 1150 }, { "epoch": 3.45679012345679, "grad_norm": 1.1035867929458618, "learning_rate": 3.66420971731286e-05, "loss": 0.7361, "step": 1155 }, { "epoch": 3.471754582865694, "grad_norm": 1.1282587051391602, "learning_rate": 3.653792105236422e-05, "loss": 0.7012, "step": 1160 }, { "epoch": 3.486719042274598, "grad_norm": 1.4782813787460327, "learning_rate": 3.6433489735764334e-05, "loss": 0.6902, "step": 1165 }, { "epoch": 3.5016835016835017, "grad_norm": 1.2365137338638306, "learning_rate": 3.6328805533141684e-05, "loss": 0.7524, "step": 1170 }, { "epoch": 3.5166479610924055, "grad_norm": 1.1180927753448486, "learning_rate": 3.622387075990233e-05, "loss": 0.727, "step": 1175 }, { "epoch": 3.5316124205013093, "grad_norm": 1.1830908060073853, "learning_rate": 3.611868773699449e-05, "loss": 0.7811, "step": 1180 }, { "epoch": 3.546576879910213, "grad_norm": 1.2073471546173096, "learning_rate": 3.6013258790857154e-05, "loss": 0.7164, "step": 1185 }, { "epoch": 3.561541339319117, "grad_norm": 1.1175339221954346, "learning_rate": 3.590758625336864e-05, "loss": 0.7238, "step": 1190 }, { "epoch": 3.576505798728021, "grad_norm": 1.2776098251342773, "learning_rate": 3.5801672461795034e-05, "loss": 0.7886, "step": 1195 }, { "epoch": 3.591470258136925, "grad_norm": 1.021897792816162, "learning_rate": 3.569551975873847e-05, "loss": 0.7491, "step": 1200 }, { "epoch": 3.6064347175458287, "grad_norm": 1.1327399015426636, "learning_rate": 3.558913049208534e-05, "loss": 0.7499, "step": 1205 }, { "epoch": 3.6213991769547325, "grad_norm": 1.105021595954895, "learning_rate": 3.548250701495432e-05, "loss": 0.6803, "step": 1210 }, { "epoch": 3.6363636363636362, "grad_norm": 1.1674951314926147, "learning_rate": 3.537565168564442e-05, "loss": 0.7302, "step": 1215 }, { "epoch": 3.6513280957725405, "grad_norm": 1.0947383642196655, "learning_rate": 3.526856686758269e-05, "loss": 0.7106, "step": 1220 }, { "epoch": 3.6662925551814443, "grad_norm": 1.1993179321289062, "learning_rate": 3.5161254929272046e-05, "loss": 0.793, "step": 1225 }, { "epoch": 3.681257014590348, "grad_norm": 1.1340358257293701, "learning_rate": 3.505371824423885e-05, "loss": 0.8239, "step": 1230 }, { "epoch": 3.696221473999252, "grad_norm": 1.7940102815628052, "learning_rate": 3.494595919098041e-05, "loss": 0.6556, "step": 1235 }, { "epoch": 3.7111859334081556, "grad_norm": 1.231870174407959, "learning_rate": 3.483798015291239e-05, "loss": 0.7934, "step": 1240 }, { "epoch": 3.7261503928170594, "grad_norm": 1.1797089576721191, "learning_rate": 3.4729783518316056e-05, "loss": 0.773, "step": 1245 }, { "epoch": 3.741114852225963, "grad_norm": 1.148738980293274, "learning_rate": 3.462137168028549e-05, "loss": 0.7345, "step": 1250 }, { "epoch": 3.756079311634867, "grad_norm": 1.1555569171905518, "learning_rate": 3.4512747036674644e-05, "loss": 0.7036, "step": 1255 }, { "epoch": 3.771043771043771, "grad_norm": 1.172443151473999, "learning_rate": 3.440391199004431e-05, "loss": 0.8012, "step": 1260 }, { "epoch": 3.786008230452675, "grad_norm": 1.4849857091903687, "learning_rate": 3.4294868947608964e-05, "loss": 0.7567, "step": 1265 }, { "epoch": 3.8009726898615788, "grad_norm": 1.1539514064788818, "learning_rate": 3.4185620321183545e-05, "loss": 0.7258, "step": 1270 }, { "epoch": 3.8159371492704826, "grad_norm": 1.3457891941070557, "learning_rate": 3.4076168527130094e-05, "loss": 0.7048, "step": 1275 }, { "epoch": 3.8309016086793863, "grad_norm": 1.1766413450241089, "learning_rate": 3.396651598630432e-05, "loss": 0.7275, "step": 1280 }, { "epoch": 3.8458660680882906, "grad_norm": 1.2674963474273682, "learning_rate": 3.3856665124002054e-05, "loss": 0.6935, "step": 1285 }, { "epoch": 3.8608305274971944, "grad_norm": 1.256225347518921, "learning_rate": 3.37466183699056e-05, "loss": 0.7128, "step": 1290 }, { "epoch": 3.875794986906098, "grad_norm": 1.098443627357483, "learning_rate": 3.363637815802998e-05, "loss": 0.6997, "step": 1295 }, { "epoch": 3.890759446315002, "grad_norm": 1.1758556365966797, "learning_rate": 3.352594692666915e-05, "loss": 0.6989, "step": 1300 }, { "epoch": 3.9057239057239057, "grad_norm": 1.265360951423645, "learning_rate": 3.3415327118342015e-05, "loss": 0.7412, "step": 1305 }, { "epoch": 3.9206883651328095, "grad_norm": 1.212694525718689, "learning_rate": 3.3304521179738437e-05, "loss": 0.7208, "step": 1310 }, { "epoch": 3.9356528245417133, "grad_norm": 1.2661161422729492, "learning_rate": 3.319353156166509e-05, "loss": 0.7097, "step": 1315 }, { "epoch": 3.950617283950617, "grad_norm": 1.0489590167999268, "learning_rate": 3.3082360718991304e-05, "loss": 0.7063, "step": 1320 }, { "epoch": 3.9655817433595213, "grad_norm": 1.304627537727356, "learning_rate": 3.297101111059471e-05, "loss": 0.7256, "step": 1325 }, { "epoch": 3.980546202768425, "grad_norm": 1.1489557027816772, "learning_rate": 3.2859485199306885e-05, "loss": 0.7, "step": 1330 }, { "epoch": 3.995510662177329, "grad_norm": 1.1346006393432617, "learning_rate": 3.274778545185888e-05, "loss": 0.7179, "step": 1335 }, { "epoch": 4.010475121586233, "grad_norm": 1.1032986640930176, "learning_rate": 3.263591433882666e-05, "loss": 0.7768, "step": 1340 }, { "epoch": 4.025439580995137, "grad_norm": 1.5759029388427734, "learning_rate": 3.252387433457645e-05, "loss": 0.6737, "step": 1345 }, { "epoch": 4.040404040404041, "grad_norm": 1.208949327468872, "learning_rate": 3.241166791721001e-05, "loss": 0.648, "step": 1350 }, { "epoch": 4.0553684998129444, "grad_norm": 1.332651972770691, "learning_rate": 3.2299297568509835e-05, "loss": 0.7591, "step": 1355 }, { "epoch": 4.070332959221848, "grad_norm": 1.2246062755584717, "learning_rate": 3.2186765773884245e-05, "loss": 0.6756, "step": 1360 }, { "epoch": 4.085297418630752, "grad_norm": 1.288155198097229, "learning_rate": 3.2074075022312417e-05, "loss": 0.7229, "step": 1365 }, { "epoch": 4.100261878039656, "grad_norm": 1.2574666738510132, "learning_rate": 3.196122780628936e-05, "loss": 0.7267, "step": 1370 }, { "epoch": 4.11522633744856, "grad_norm": 1.1441593170166016, "learning_rate": 3.1848226621770744e-05, "loss": 0.7363, "step": 1375 }, { "epoch": 4.130190796857463, "grad_norm": 1.3331890106201172, "learning_rate": 3.173507396811774e-05, "loss": 0.7083, "step": 1380 }, { "epoch": 4.145155256266367, "grad_norm": 1.2898328304290771, "learning_rate": 3.162177234804168e-05, "loss": 0.6997, "step": 1385 }, { "epoch": 4.160119715675271, "grad_norm": 1.0942869186401367, "learning_rate": 3.150832426754877e-05, "loss": 0.7047, "step": 1390 }, { "epoch": 4.175084175084175, "grad_norm": 1.1719509363174438, "learning_rate": 3.1394732235884615e-05, "loss": 0.6965, "step": 1395 }, { "epoch": 4.1900486344930785, "grad_norm": 1.2531814575195312, "learning_rate": 3.1280998765478727e-05, "loss": 0.7139, "step": 1400 }, { "epoch": 4.205013093901983, "grad_norm": 1.9164917469024658, "learning_rate": 3.116712637188897e-05, "loss": 0.7125, "step": 1405 }, { "epoch": 4.219977553310887, "grad_norm": 1.1178081035614014, "learning_rate": 3.10531175737459e-05, "loss": 0.7247, "step": 1410 }, { "epoch": 4.234942012719791, "grad_norm": 1.3899163007736206, "learning_rate": 3.0938974892697095e-05, "loss": 0.6983, "step": 1415 }, { "epoch": 4.2499064721286945, "grad_norm": 1.175616979598999, "learning_rate": 3.082470085335133e-05, "loss": 0.7491, "step": 1420 }, { "epoch": 4.264870931537598, "grad_norm": 1.1819936037063599, "learning_rate": 3.071029798322279e-05, "loss": 0.6763, "step": 1425 }, { "epoch": 4.279835390946502, "grad_norm": 1.109136939048767, "learning_rate": 3.0595768812675104e-05, "loss": 0.7401, "step": 1430 }, { "epoch": 4.294799850355406, "grad_norm": 1.1672794818878174, "learning_rate": 3.048111587486545e-05, "loss": 0.6849, "step": 1435 }, { "epoch": 4.30976430976431, "grad_norm": 1.3153440952301025, "learning_rate": 3.0366341705688468e-05, "loss": 0.7617, "step": 1440 }, { "epoch": 4.3247287691732135, "grad_norm": 1.1830875873565674, "learning_rate": 3.025144884372021e-05, "loss": 0.7097, "step": 1445 }, { "epoch": 4.339693228582117, "grad_norm": 1.1907213926315308, "learning_rate": 3.0136439830161967e-05, "loss": 0.6899, "step": 1450 }, { "epoch": 4.354657687991021, "grad_norm": 1.1969035863876343, "learning_rate": 3.0021317208784074e-05, "loss": 0.7034, "step": 1455 }, { "epoch": 4.369622147399925, "grad_norm": 1.181558609008789, "learning_rate": 2.990608352586965e-05, "loss": 0.7223, "step": 1460 }, { "epoch": 4.3845866068088295, "grad_norm": 1.2934932708740234, "learning_rate": 2.979074133015827e-05, "loss": 0.7026, "step": 1465 }, { "epoch": 4.399551066217733, "grad_norm": 1.2202645540237427, "learning_rate": 2.9675293172789583e-05, "loss": 0.734, "step": 1470 }, { "epoch": 4.414515525626637, "grad_norm": 1.2204395532608032, "learning_rate": 2.9559741607246922e-05, "loss": 0.7691, "step": 1475 }, { "epoch": 4.429479985035541, "grad_norm": 1.2286920547485352, "learning_rate": 2.9444089189300783e-05, "loss": 0.7691, "step": 1480 }, { "epoch": 4.444444444444445, "grad_norm": 1.1387462615966797, "learning_rate": 2.932833847695234e-05, "loss": 0.7064, "step": 1485 }, { "epoch": 4.459408903853348, "grad_norm": 1.3988234996795654, "learning_rate": 2.9212492030376814e-05, "loss": 0.6983, "step": 1490 }, { "epoch": 4.474373363262252, "grad_norm": 1.144126296043396, "learning_rate": 2.90965524118669e-05, "loss": 0.7616, "step": 1495 }, { "epoch": 4.489337822671156, "grad_norm": 1.073025107383728, "learning_rate": 2.8980522185776065e-05, "loss": 0.7386, "step": 1500 }, { "epoch": 4.50430228208006, "grad_norm": 1.3249400854110718, "learning_rate": 2.8864403918461812e-05, "loss": 0.6959, "step": 1505 }, { "epoch": 4.519266741488964, "grad_norm": 1.3409395217895508, "learning_rate": 2.874820017822899e-05, "loss": 0.696, "step": 1510 }, { "epoch": 4.534231200897867, "grad_norm": 1.2458475828170776, "learning_rate": 2.8631913535272888e-05, "loss": 0.7367, "step": 1515 }, { "epoch": 4.549195660306771, "grad_norm": 1.3703510761260986, "learning_rate": 2.8515546561622462e-05, "loss": 0.7221, "step": 1520 }, { "epoch": 4.564160119715675, "grad_norm": 1.0971307754516602, "learning_rate": 2.839910183108342e-05, "loss": 0.7485, "step": 1525 }, { "epoch": 4.57912457912458, "grad_norm": 1.198792815208435, "learning_rate": 2.828258191918131e-05, "loss": 0.8012, "step": 1530 }, { "epoch": 4.5940890385334825, "grad_norm": 1.2157917022705078, "learning_rate": 2.816598940310452e-05, "loss": 0.6885, "step": 1535 }, { "epoch": 4.609053497942387, "grad_norm": 1.2653173208236694, "learning_rate": 2.8049326861647302e-05, "loss": 0.7332, "step": 1540 }, { "epoch": 4.624017957351291, "grad_norm": 1.1794272661209106, "learning_rate": 2.7932596875152744e-05, "loss": 0.7952, "step": 1545 }, { "epoch": 4.638982416760195, "grad_norm": 1.2640421390533447, "learning_rate": 2.781580202545568e-05, "loss": 0.7742, "step": 1550 }, { "epoch": 4.6539468761690985, "grad_norm": 2.050365686416626, "learning_rate": 2.7698944895825572e-05, "loss": 0.7715, "step": 1555 }, { "epoch": 4.668911335578002, "grad_norm": 1.2108561992645264, "learning_rate": 2.7582028070909415e-05, "loss": 0.7624, "step": 1560 }, { "epoch": 4.683875794986906, "grad_norm": 1.2775709629058838, "learning_rate": 2.746505413667452e-05, "loss": 0.6833, "step": 1565 }, { "epoch": 4.69884025439581, "grad_norm": 1.178462266921997, "learning_rate": 2.7348025680351363e-05, "loss": 0.6924, "step": 1570 }, { "epoch": 4.713804713804714, "grad_norm": 1.1037096977233887, "learning_rate": 2.7230945290376325e-05, "loss": 0.6909, "step": 1575 }, { "epoch": 4.7287691732136174, "grad_norm": 1.241242527961731, "learning_rate": 2.7113815556334478e-05, "loss": 0.7844, "step": 1580 }, { "epoch": 4.743733632622521, "grad_norm": 1.220365047454834, "learning_rate": 2.6996639068902253e-05, "loss": 0.7149, "step": 1585 }, { "epoch": 4.758698092031425, "grad_norm": 1.3249695301055908, "learning_rate": 2.6879418419790204e-05, "loss": 0.6882, "step": 1590 }, { "epoch": 4.77366255144033, "grad_norm": 1.3471956253051758, "learning_rate": 2.6762156201685628e-05, "loss": 0.7442, "step": 1595 }, { "epoch": 4.788627010849233, "grad_norm": 1.2055602073669434, "learning_rate": 2.6644855008195267e-05, "loss": 0.7078, "step": 1600 }, { "epoch": 4.803591470258137, "grad_norm": 1.2006497383117676, "learning_rate": 2.6527517433787913e-05, "loss": 0.6789, "step": 1605 }, { "epoch": 4.818555929667041, "grad_norm": 1.1846423149108887, "learning_rate": 2.641014607373702e-05, "loss": 0.6703, "step": 1610 }, { "epoch": 4.833520389075945, "grad_norm": 1.2655390501022339, "learning_rate": 2.6292743524063334e-05, "loss": 0.6671, "step": 1615 }, { "epoch": 4.848484848484849, "grad_norm": 1.250118374824524, "learning_rate": 2.6175312381477442e-05, "loss": 0.6936, "step": 1620 }, { "epoch": 4.863449307893752, "grad_norm": 1.2387151718139648, "learning_rate": 2.6057855243322344e-05, "loss": 0.6755, "step": 1625 }, { "epoch": 4.878413767302656, "grad_norm": 1.3161075115203857, "learning_rate": 2.5940374707516015e-05, "loss": 0.6515, "step": 1630 }, { "epoch": 4.89337822671156, "grad_norm": 1.219498872756958, "learning_rate": 2.582287337249394e-05, "loss": 0.7108, "step": 1635 }, { "epoch": 4.908342686120464, "grad_norm": 1.4078658819198608, "learning_rate": 2.570535383715165e-05, "loss": 0.7038, "step": 1640 }, { "epoch": 4.9233071455293675, "grad_norm": 1.140682578086853, "learning_rate": 2.558781870078722e-05, "loss": 0.6804, "step": 1645 }, { "epoch": 4.938271604938271, "grad_norm": 1.4205572605133057, "learning_rate": 2.547027056304379e-05, "loss": 0.7491, "step": 1650 }, { "epoch": 4.953236064347175, "grad_norm": 1.2967289686203003, "learning_rate": 2.5352712023852066e-05, "loss": 0.7297, "step": 1655 }, { "epoch": 4.96820052375608, "grad_norm": 1.2759593725204468, "learning_rate": 2.5235145683372814e-05, "loss": 0.6731, "step": 1660 }, { "epoch": 4.983164983164983, "grad_norm": 1.1895300149917603, "learning_rate": 2.5117574141939337e-05, "loss": 0.7156, "step": 1665 }, { "epoch": 4.998129442573887, "grad_norm": 1.1513454914093018, "learning_rate": 2.5e-05, "loss": 0.7455, "step": 1670 }, { "epoch": 5.013093901982791, "grad_norm": 1.231416940689087, "learning_rate": 2.4882425858060668e-05, "loss": 0.7206, "step": 1675 }, { "epoch": 5.028058361391695, "grad_norm": 1.270216941833496, "learning_rate": 2.47648543166272e-05, "loss": 0.6685, "step": 1680 }, { "epoch": 5.043022820800599, "grad_norm": 1.4066438674926758, "learning_rate": 2.4647287976147946e-05, "loss": 0.6722, "step": 1685 }, { "epoch": 5.0579872802095025, "grad_norm": 1.3440229892730713, "learning_rate": 2.452972943695621e-05, "loss": 0.7271, "step": 1690 }, { "epoch": 5.072951739618406, "grad_norm": 1.1897931098937988, "learning_rate": 2.441218129921278e-05, "loss": 0.6775, "step": 1695 }, { "epoch": 5.08791619902731, "grad_norm": 1.2431669235229492, "learning_rate": 2.4294646162848354e-05, "loss": 0.7324, "step": 1700 }, { "epoch": 5.102880658436214, "grad_norm": 1.4123824834823608, "learning_rate": 2.4177126627506067e-05, "loss": 0.7041, "step": 1705 }, { "epoch": 5.117845117845118, "grad_norm": 1.3087615966796875, "learning_rate": 2.405962529248399e-05, "loss": 0.6902, "step": 1710 }, { "epoch": 5.132809577254021, "grad_norm": 1.1675366163253784, "learning_rate": 2.394214475667767e-05, "loss": 0.7462, "step": 1715 }, { "epoch": 5.147774036662925, "grad_norm": 1.1870967149734497, "learning_rate": 2.3824687618522567e-05, "loss": 0.7482, "step": 1720 }, { "epoch": 5.162738496071829, "grad_norm": 1.1886534690856934, "learning_rate": 2.370725647593666e-05, "loss": 0.7026, "step": 1725 }, { "epoch": 5.177702955480734, "grad_norm": 1.3220059871673584, "learning_rate": 2.3589853926262977e-05, "loss": 0.681, "step": 1730 }, { "epoch": 5.1926674148896375, "grad_norm": 1.2325706481933594, "learning_rate": 2.3472482566212093e-05, "loss": 0.7101, "step": 1735 }, { "epoch": 5.207631874298541, "grad_norm": 1.1882089376449585, "learning_rate": 2.3355144991804735e-05, "loss": 0.6857, "step": 1740 }, { "epoch": 5.222596333707445, "grad_norm": 1.3109657764434814, "learning_rate": 2.323784379831438e-05, "loss": 0.7127, "step": 1745 }, { "epoch": 5.237560793116349, "grad_norm": 1.1432446241378784, "learning_rate": 2.3120581580209808e-05, "loss": 0.6823, "step": 1750 }, { "epoch": 5.252525252525253, "grad_norm": 1.3307565450668335, "learning_rate": 2.3003360931097757e-05, "loss": 0.7118, "step": 1755 }, { "epoch": 5.267489711934156, "grad_norm": 1.6253339052200317, "learning_rate": 2.2886184443665525e-05, "loss": 0.7521, "step": 1760 }, { "epoch": 5.28245417134306, "grad_norm": 1.3010215759277344, "learning_rate": 2.2769054709623674e-05, "loss": 0.7331, "step": 1765 }, { "epoch": 5.297418630751964, "grad_norm": 1.2219674587249756, "learning_rate": 2.2651974319648643e-05, "loss": 0.7031, "step": 1770 }, { "epoch": 5.312383090160868, "grad_norm": 1.2299708127975464, "learning_rate": 2.2534945863325487e-05, "loss": 0.6622, "step": 1775 }, { "epoch": 5.3273475495697715, "grad_norm": 1.1474329233169556, "learning_rate": 2.241797192909059e-05, "loss": 0.6662, "step": 1780 }, { "epoch": 5.342312008978675, "grad_norm": 1.1639771461486816, "learning_rate": 2.2301055104174433e-05, "loss": 0.6913, "step": 1785 }, { "epoch": 5.357276468387579, "grad_norm": 1.2043278217315674, "learning_rate": 2.218419797454433e-05, "loss": 0.6777, "step": 1790 }, { "epoch": 5.372240927796484, "grad_norm": 1.2802300453186035, "learning_rate": 2.206740312484726e-05, "loss": 0.6608, "step": 1795 }, { "epoch": 5.3872053872053876, "grad_norm": 1.2886018753051758, "learning_rate": 2.19506731383527e-05, "loss": 0.6696, "step": 1800 }, { "epoch": 5.402169846614291, "grad_norm": 1.6271384954452515, "learning_rate": 2.1834010596895487e-05, "loss": 0.7117, "step": 1805 }, { "epoch": 5.417134306023195, "grad_norm": 1.3303827047348022, "learning_rate": 2.1717418080818696e-05, "loss": 0.6851, "step": 1810 }, { "epoch": 5.432098765432099, "grad_norm": 1.3058645725250244, "learning_rate": 2.1600898168916584e-05, "loss": 0.7386, "step": 1815 }, { "epoch": 5.447063224841003, "grad_norm": 1.3986623287200928, "learning_rate": 2.148445343837755e-05, "loss": 0.6995, "step": 1820 }, { "epoch": 5.4620276842499065, "grad_norm": 1.2918411493301392, "learning_rate": 2.1368086464727125e-05, "loss": 0.6936, "step": 1825 }, { "epoch": 5.47699214365881, "grad_norm": 1.1513465642929077, "learning_rate": 2.1251799821771012e-05, "loss": 0.7228, "step": 1830 }, { "epoch": 5.491956603067714, "grad_norm": 1.233217716217041, "learning_rate": 2.1135596081538184e-05, "loss": 0.77, "step": 1835 }, { "epoch": 5.506921062476618, "grad_norm": 1.2311054468154907, "learning_rate": 2.1019477814223944e-05, "loss": 0.6844, "step": 1840 }, { "epoch": 5.521885521885522, "grad_norm": 1.3642069101333618, "learning_rate": 2.09034475881331e-05, "loss": 0.7025, "step": 1845 }, { "epoch": 5.536849981294425, "grad_norm": 1.311928391456604, "learning_rate": 2.0787507969623192e-05, "loss": 0.6874, "step": 1850 }, { "epoch": 5.55181444070333, "grad_norm": 1.2631707191467285, "learning_rate": 2.0671661523047663e-05, "loss": 0.7446, "step": 1855 }, { "epoch": 5.566778900112233, "grad_norm": 1.1697314977645874, "learning_rate": 2.0555910810699223e-05, "loss": 0.7386, "step": 1860 }, { "epoch": 5.581743359521138, "grad_norm": 1.2585618495941162, "learning_rate": 2.0440258392753084e-05, "loss": 0.7292, "step": 1865 }, { "epoch": 5.596707818930041, "grad_norm": 1.357924461364746, "learning_rate": 2.032470682721042e-05, "loss": 0.7167, "step": 1870 }, { "epoch": 5.611672278338945, "grad_norm": 1.0884181261062622, "learning_rate": 2.0209258669841737e-05, "loss": 0.7249, "step": 1875 }, { "epoch": 5.626636737747849, "grad_norm": 1.3644371032714844, "learning_rate": 2.0093916474130353e-05, "loss": 0.7436, "step": 1880 }, { "epoch": 5.641601197156753, "grad_norm": 1.1837425231933594, "learning_rate": 1.997868279121593e-05, "loss": 0.6922, "step": 1885 }, { "epoch": 5.656565656565657, "grad_norm": 1.3669867515563965, "learning_rate": 1.9863560169838042e-05, "loss": 0.7689, "step": 1890 }, { "epoch": 5.67153011597456, "grad_norm": 1.3488072156906128, "learning_rate": 1.97485511562798e-05, "loss": 0.7074, "step": 1895 }, { "epoch": 5.686494575383464, "grad_norm": 1.1839897632598877, "learning_rate": 1.9633658294311535e-05, "loss": 0.7115, "step": 1900 }, { "epoch": 5.701459034792368, "grad_norm": 1.3153859376907349, "learning_rate": 1.9518884125134556e-05, "loss": 0.723, "step": 1905 }, { "epoch": 5.716423494201272, "grad_norm": 1.2922106981277466, "learning_rate": 1.9404231187324902e-05, "loss": 0.6543, "step": 1910 }, { "epoch": 5.7313879536101755, "grad_norm": 1.3643290996551514, "learning_rate": 1.928970201677722e-05, "loss": 0.7399, "step": 1915 }, { "epoch": 5.74635241301908, "grad_norm": 1.188324213027954, "learning_rate": 1.9175299146648674e-05, "loss": 0.6795, "step": 1920 }, { "epoch": 5.761316872427983, "grad_norm": 1.4890059232711792, "learning_rate": 1.906102510730291e-05, "loss": 0.721, "step": 1925 }, { "epoch": 5.776281331836888, "grad_norm": 1.4943420886993408, "learning_rate": 1.8946882426254105e-05, "loss": 0.6991, "step": 1930 }, { "epoch": 5.7912457912457915, "grad_norm": 1.2924257516860962, "learning_rate": 1.8832873628111038e-05, "loss": 0.7136, "step": 1935 }, { "epoch": 5.806210250654695, "grad_norm": 1.3031319379806519, "learning_rate": 1.8719001234521283e-05, "loss": 0.6695, "step": 1940 }, { "epoch": 5.821174710063599, "grad_norm": 1.2206610441207886, "learning_rate": 1.860526776411539e-05, "loss": 0.6473, "step": 1945 }, { "epoch": 5.836139169472503, "grad_norm": 1.173349142074585, "learning_rate": 1.849167573245123e-05, "loss": 0.6412, "step": 1950 }, { "epoch": 5.851103628881407, "grad_norm": 1.5744128227233887, "learning_rate": 1.8378227651958326e-05, "loss": 0.6956, "step": 1955 }, { "epoch": 5.8660680882903105, "grad_norm": 1.170933723449707, "learning_rate": 1.8264926031882272e-05, "loss": 0.7798, "step": 1960 }, { "epoch": 5.881032547699214, "grad_norm": 1.5066628456115723, "learning_rate": 1.8151773378229265e-05, "loss": 0.7011, "step": 1965 }, { "epoch": 5.895997007108118, "grad_norm": 1.2198915481567383, "learning_rate": 1.8038772193710646e-05, "loss": 0.724, "step": 1970 }, { "epoch": 5.910961466517022, "grad_norm": 1.227023959159851, "learning_rate": 1.792592497768759e-05, "loss": 0.6702, "step": 1975 }, { "epoch": 5.925925925925926, "grad_norm": 1.3417410850524902, "learning_rate": 1.7813234226115764e-05, "loss": 0.747, "step": 1980 }, { "epoch": 5.94089038533483, "grad_norm": 1.3337069749832153, "learning_rate": 1.7700702431490174e-05, "loss": 0.669, "step": 1985 }, { "epoch": 5.955854844743733, "grad_norm": 1.2036738395690918, "learning_rate": 1.7588332082789993e-05, "loss": 0.7339, "step": 1990 }, { "epoch": 5.970819304152638, "grad_norm": 1.1622107028961182, "learning_rate": 1.747612566542356e-05, "loss": 0.6925, "step": 1995 }, { "epoch": 5.985783763561542, "grad_norm": 1.3639973402023315, "learning_rate": 1.7364085661173347e-05, "loss": 0.6798, "step": 2000 }, { "epoch": 6.000748222970445, "grad_norm": 1.2021132707595825, "learning_rate": 1.725221454814112e-05, "loss": 0.7133, "step": 2005 }, { "epoch": 6.015712682379349, "grad_norm": 1.4045711755752563, "learning_rate": 1.7140514800693124e-05, "loss": 0.6953, "step": 2010 }, { "epoch": 6.030677141788253, "grad_norm": 1.2548061609268188, "learning_rate": 1.7028988889405296e-05, "loss": 0.6381, "step": 2015 }, { "epoch": 6.045641601197157, "grad_norm": 1.1166868209838867, "learning_rate": 1.69176392810087e-05, "loss": 0.7127, "step": 2020 }, { "epoch": 6.0606060606060606, "grad_norm": 1.2931350469589233, "learning_rate": 1.6806468438334917e-05, "loss": 0.7081, "step": 2025 }, { "epoch": 6.075570520014964, "grad_norm": 1.365538239479065, "learning_rate": 1.6695478820261573e-05, "loss": 0.6766, "step": 2030 }, { "epoch": 6.090534979423868, "grad_norm": 1.4035921096801758, "learning_rate": 1.658467288165799e-05, "loss": 0.6857, "step": 2035 }, { "epoch": 6.105499438832772, "grad_norm": 1.0855042934417725, "learning_rate": 1.647405307333085e-05, "loss": 0.7685, "step": 2040 }, { "epoch": 6.120463898241676, "grad_norm": 1.4982078075408936, "learning_rate": 1.6363621841970022e-05, "loss": 0.7044, "step": 2045 }, { "epoch": 6.1354283576505795, "grad_norm": 1.233553171157837, "learning_rate": 1.625338163009441e-05, "loss": 0.6415, "step": 2050 }, { "epoch": 6.150392817059484, "grad_norm": 2.476423978805542, "learning_rate": 1.6143334875997952e-05, "loss": 0.7047, "step": 2055 }, { "epoch": 6.165357276468388, "grad_norm": 1.2853014469146729, "learning_rate": 1.6033484013695687e-05, "loss": 0.7164, "step": 2060 }, { "epoch": 6.180321735877292, "grad_norm": 1.376776933670044, "learning_rate": 1.5923831472869915e-05, "loss": 0.6773, "step": 2065 }, { "epoch": 6.1952861952861955, "grad_norm": 1.2735328674316406, "learning_rate": 1.581437967881647e-05, "loss": 0.6457, "step": 2070 }, { "epoch": 6.210250654695099, "grad_norm": 1.3325200080871582, "learning_rate": 1.5705131052391042e-05, "loss": 0.7297, "step": 2075 }, { "epoch": 6.225215114104003, "grad_norm": 1.1959949731826782, "learning_rate": 1.5596088009955695e-05, "loss": 0.7535, "step": 2080 }, { "epoch": 6.240179573512907, "grad_norm": 1.307750940322876, "learning_rate": 1.5487252963325362e-05, "loss": 0.7605, "step": 2085 }, { "epoch": 6.255144032921811, "grad_norm": 1.3463622331619263, "learning_rate": 1.5378628319714512e-05, "loss": 0.7251, "step": 2090 }, { "epoch": 6.270108492330714, "grad_norm": 1.2366999387741089, "learning_rate": 1.5270216481683953e-05, "loss": 0.6835, "step": 2095 }, { "epoch": 6.285072951739618, "grad_norm": 1.2593817710876465, "learning_rate": 1.5162019847087617e-05, "loss": 0.6598, "step": 2100 }, { "epoch": 6.300037411148522, "grad_norm": 1.3024280071258545, "learning_rate": 1.5054040809019584e-05, "loss": 0.6683, "step": 2105 }, { "epoch": 6.315001870557426, "grad_norm": 1.4586106538772583, "learning_rate": 1.4946281755761152e-05, "loss": 0.6762, "step": 2110 }, { "epoch": 6.32996632996633, "grad_norm": 1.338810920715332, "learning_rate": 1.4838745070727958e-05, "loss": 0.6821, "step": 2115 }, { "epoch": 6.344930789375233, "grad_norm": 1.425808310508728, "learning_rate": 1.4731433132417316e-05, "loss": 0.6303, "step": 2120 }, { "epoch": 6.359895248784138, "grad_norm": 1.1587165594100952, "learning_rate": 1.4624348314355585e-05, "loss": 0.6306, "step": 2125 }, { "epoch": 6.374859708193042, "grad_norm": 1.3677455186843872, "learning_rate": 1.4517492985045678e-05, "loss": 0.7352, "step": 2130 }, { "epoch": 6.389824167601946, "grad_norm": 1.4579230546951294, "learning_rate": 1.4410869507914669e-05, "loss": 0.6911, "step": 2135 }, { "epoch": 6.404788627010849, "grad_norm": 1.3865454196929932, "learning_rate": 1.4304480241261528e-05, "loss": 0.6651, "step": 2140 }, { "epoch": 6.419753086419753, "grad_norm": 1.1365728378295898, "learning_rate": 1.4198327538204961e-05, "loss": 0.6779, "step": 2145 }, { "epoch": 6.434717545828657, "grad_norm": 1.271693229675293, "learning_rate": 1.409241374663136e-05, "loss": 0.7289, "step": 2150 }, { "epoch": 6.449682005237561, "grad_norm": 1.314024567604065, "learning_rate": 1.3986741209142845e-05, "loss": 0.6656, "step": 2155 }, { "epoch": 6.4646464646464645, "grad_norm": 1.2013462781906128, "learning_rate": 1.3881312263005519e-05, "loss": 0.6836, "step": 2160 }, { "epoch": 6.479610924055368, "grad_norm": 1.332503080368042, "learning_rate": 1.3776129240097673e-05, "loss": 0.7178, "step": 2165 }, { "epoch": 6.494575383464272, "grad_norm": 1.4150094985961914, "learning_rate": 1.3671194466858334e-05, "loss": 0.6895, "step": 2170 }, { "epoch": 6.509539842873176, "grad_norm": 1.3232195377349854, "learning_rate": 1.356651026423566e-05, "loss": 0.7292, "step": 2175 }, { "epoch": 6.524504302282081, "grad_norm": 1.324210286140442, "learning_rate": 1.3462078947635781e-05, "loss": 0.756, "step": 2180 }, { "epoch": 6.5394687616909835, "grad_norm": 1.2665998935699463, "learning_rate": 1.335790282687141e-05, "loss": 0.6959, "step": 2185 }, { "epoch": 6.554433221099888, "grad_norm": 1.1720548868179321, "learning_rate": 1.325398420611088e-05, "loss": 0.7918, "step": 2190 }, { "epoch": 6.569397680508792, "grad_norm": 1.0761444568634033, "learning_rate": 1.3150325383827117e-05, "loss": 0.679, "step": 2195 }, { "epoch": 6.584362139917696, "grad_norm": 1.4445922374725342, "learning_rate": 1.3046928652746832e-05, "loss": 0.802, "step": 2200 }, { "epoch": 6.5993265993265995, "grad_norm": 1.2890619039535522, "learning_rate": 1.2943796299799809e-05, "loss": 0.747, "step": 2205 }, { "epoch": 6.614291058735503, "grad_norm": 1.3807190656661987, "learning_rate": 1.2840930606068289e-05, "loss": 0.6693, "step": 2210 }, { "epoch": 6.629255518144407, "grad_norm": 1.4410628080368042, "learning_rate": 1.273833384673656e-05, "loss": 0.7011, "step": 2215 }, { "epoch": 6.644219977553311, "grad_norm": 1.255650520324707, "learning_rate": 1.2636008291040618e-05, "loss": 0.7627, "step": 2220 }, { "epoch": 6.659184436962215, "grad_norm": 1.2652361392974854, "learning_rate": 1.2533956202217975e-05, "loss": 0.6859, "step": 2225 }, { "epoch": 6.674148896371118, "grad_norm": 1.2963732481002808, "learning_rate": 1.243217983745758e-05, "loss": 0.7204, "step": 2230 }, { "epoch": 6.689113355780022, "grad_norm": 1.4088592529296875, "learning_rate": 1.2330681447849951e-05, "loss": 0.6392, "step": 2235 }, { "epoch": 6.704077815188926, "grad_norm": 1.3027905225753784, "learning_rate": 1.2229463278337308e-05, "loss": 0.7128, "step": 2240 }, { "epoch": 6.71904227459783, "grad_norm": 1.2761296033859253, "learning_rate": 1.2128527567663988e-05, "loss": 0.7145, "step": 2245 }, { "epoch": 6.7340067340067336, "grad_norm": 1.4830342531204224, "learning_rate": 1.2027876548326897e-05, "loss": 0.6784, "step": 2250 }, { "epoch": 6.748971193415638, "grad_norm": 1.2457510232925415, "learning_rate": 1.1927512446526142e-05, "loss": 0.6929, "step": 2255 }, { "epoch": 6.763935652824542, "grad_norm": 1.4039334058761597, "learning_rate": 1.1827437482115759e-05, "loss": 0.7516, "step": 2260 }, { "epoch": 6.778900112233446, "grad_norm": 1.3703151941299438, "learning_rate": 1.172765386855467e-05, "loss": 0.699, "step": 2265 }, { "epoch": 6.79386457164235, "grad_norm": 1.3183362483978271, "learning_rate": 1.1628163812857674e-05, "loss": 0.7607, "step": 2270 }, { "epoch": 6.808829031051253, "grad_norm": 1.2728744745254517, "learning_rate": 1.1528969515546672e-05, "loss": 0.6541, "step": 2275 }, { "epoch": 6.823793490460157, "grad_norm": 1.2783997058868408, "learning_rate": 1.1430073170601968e-05, "loss": 0.684, "step": 2280 }, { "epoch": 6.838757949869061, "grad_norm": 1.145731806755066, "learning_rate": 1.1331476965413773e-05, "loss": 0.7134, "step": 2285 }, { "epoch": 6.853722409277965, "grad_norm": 1.3381609916687012, "learning_rate": 1.1233183080733764e-05, "loss": 0.7275, "step": 2290 }, { "epoch": 6.8686868686868685, "grad_norm": 1.2908689975738525, "learning_rate": 1.1135193690626925e-05, "loss": 0.6796, "step": 2295 }, { "epoch": 6.883651328095772, "grad_norm": 1.5330723524093628, "learning_rate": 1.1037510962423425e-05, "loss": 0.674, "step": 2300 }, { "epoch": 6.898615787504676, "grad_norm": 1.3555113077163696, "learning_rate": 1.0940137056670655e-05, "loss": 0.6678, "step": 2305 }, { "epoch": 6.91358024691358, "grad_norm": 1.2070436477661133, "learning_rate": 1.0843074127085507e-05, "loss": 0.6954, "step": 2310 }, { "epoch": 6.928544706322484, "grad_norm": 1.4584565162658691, "learning_rate": 1.074632432050665e-05, "loss": 0.6517, "step": 2315 }, { "epoch": 6.943509165731388, "grad_norm": 1.2838579416275024, "learning_rate": 1.0649889776847161e-05, "loss": 0.6424, "step": 2320 }, { "epoch": 6.958473625140292, "grad_norm": 1.2093007564544678, "learning_rate": 1.0553772629047067e-05, "loss": 0.7396, "step": 2325 }, { "epoch": 6.973438084549196, "grad_norm": 1.5044478178024292, "learning_rate": 1.0457975003026276e-05, "loss": 0.6806, "step": 2330 }, { "epoch": 6.9884025439581, "grad_norm": 1.2098227739334106, "learning_rate": 1.0362499017637472e-05, "loss": 0.6835, "step": 2335 }, { "epoch": 7.0033670033670035, "grad_norm": 1.259406566619873, "learning_rate": 1.0267346784619324e-05, "loss": 0.6672, "step": 2340 }, { "epoch": 7.018331462775907, "grad_norm": 1.2552211284637451, "learning_rate": 1.0172520408549716e-05, "loss": 0.6341, "step": 2345 }, { "epoch": 7.033295922184811, "grad_norm": 1.3525948524475098, "learning_rate": 1.0078021986799238e-05, "loss": 0.6665, "step": 2350 }, { "epoch": 7.048260381593715, "grad_norm": 1.2309094667434692, "learning_rate": 9.983853609484786e-06, "loss": 0.6903, "step": 2355 }, { "epoch": 7.063224841002619, "grad_norm": 1.2575538158416748, "learning_rate": 9.890017359423325e-06, "loss": 0.7205, "step": 2360 }, { "epoch": 7.078189300411522, "grad_norm": 1.2174732685089111, "learning_rate": 9.796515312085841e-06, "loss": 0.6929, "step": 2365 }, { "epoch": 7.093153759820426, "grad_norm": 1.4941829442977905, "learning_rate": 9.703349535551387e-06, "loss": 0.6346, "step": 2370 }, { "epoch": 7.10811821922933, "grad_norm": 1.3313934803009033, "learning_rate": 9.610522090461415e-06, "loss": 0.6626, "step": 2375 }, { "epoch": 7.123082678638234, "grad_norm": 1.1870646476745605, "learning_rate": 9.518035029974126e-06, "loss": 0.6738, "step": 2380 }, { "epoch": 7.138047138047138, "grad_norm": 1.376810073852539, "learning_rate": 9.425890399719115e-06, "loss": 0.657, "step": 2385 }, { "epoch": 7.153011597456042, "grad_norm": 1.2887132167816162, "learning_rate": 9.334090237752094e-06, "loss": 0.712, "step": 2390 }, { "epoch": 7.167976056864946, "grad_norm": 1.4136420488357544, "learning_rate": 9.242636574509828e-06, "loss": 0.7623, "step": 2395 }, { "epoch": 7.18294051627385, "grad_norm": 1.2454450130462646, "learning_rate": 9.151531432765203e-06, "loss": 0.7891, "step": 2400 }, { "epoch": 7.197904975682754, "grad_norm": 1.3656915426254272, "learning_rate": 9.060776827582529e-06, "loss": 0.6479, "step": 2405 }, { "epoch": 7.212869435091657, "grad_norm": 1.3422670364379883, "learning_rate": 8.970374766272915e-06, "loss": 0.7534, "step": 2410 }, { "epoch": 7.227833894500561, "grad_norm": 1.4018194675445557, "learning_rate": 8.880327248349937e-06, "loss": 0.679, "step": 2415 }, { "epoch": 7.242798353909465, "grad_norm": 1.4204267263412476, "learning_rate": 8.790636265485334e-06, "loss": 0.6811, "step": 2420 }, { "epoch": 7.257762813318369, "grad_norm": 1.3640581369400024, "learning_rate": 8.701303801465052e-06, "loss": 0.6518, "step": 2425 }, { "epoch": 7.2727272727272725, "grad_norm": 1.255414366722107, "learning_rate": 8.612331832145268e-06, "loss": 0.6485, "step": 2430 }, { "epoch": 7.287691732136176, "grad_norm": 1.3959693908691406, "learning_rate": 8.523722325408758e-06, "loss": 0.6528, "step": 2435 }, { "epoch": 7.30265619154508, "grad_norm": 1.3679065704345703, "learning_rate": 8.435477241121353e-06, "loss": 0.6834, "step": 2440 }, { "epoch": 7.317620650953984, "grad_norm": 1.1936756372451782, "learning_rate": 8.347598531088554e-06, "loss": 0.6883, "step": 2445 }, { "epoch": 7.3325851103628885, "grad_norm": 1.3999428749084473, "learning_rate": 8.260088139012435e-06, "loss": 0.6906, "step": 2450 }, { "epoch": 7.347549569771792, "grad_norm": 1.3568490743637085, "learning_rate": 8.17294800044856e-06, "loss": 0.7172, "step": 2455 }, { "epoch": 7.362514029180696, "grad_norm": 1.362327218055725, "learning_rate": 8.086180042763283e-06, "loss": 0.6523, "step": 2460 }, { "epoch": 7.3774784885896, "grad_norm": 1.2796952724456787, "learning_rate": 7.999786185091008e-06, "loss": 0.7196, "step": 2465 }, { "epoch": 7.392442947998504, "grad_norm": 1.339594841003418, "learning_rate": 7.913768338291821e-06, "loss": 0.6475, "step": 2470 }, { "epoch": 7.407407407407407, "grad_norm": 1.3105710744857788, "learning_rate": 7.828128404909171e-06, "loss": 0.6756, "step": 2475 }, { "epoch": 7.422371866816311, "grad_norm": 1.3429076671600342, "learning_rate": 7.742868279127848e-06, "loss": 0.6886, "step": 2480 }, { "epoch": 7.437336326225215, "grad_norm": 1.4829093217849731, "learning_rate": 7.657989846732019e-06, "loss": 0.6894, "step": 2485 }, { "epoch": 7.452300785634119, "grad_norm": 1.4806331396102905, "learning_rate": 7.573494985063579e-06, "loss": 0.6653, "step": 2490 }, { "epoch": 7.467265245043023, "grad_norm": 1.2165873050689697, "learning_rate": 7.489385562980589e-06, "loss": 0.7941, "step": 2495 }, { "epoch": 7.482229704451926, "grad_norm": 1.4139281511306763, "learning_rate": 7.4056634408159685e-06, "loss": 0.689, "step": 2500 }, { "epoch": 7.49719416386083, "grad_norm": 1.307259202003479, "learning_rate": 7.3223304703363135e-06, "loss": 0.6626, "step": 2505 }, { "epoch": 7.512158623269734, "grad_norm": 1.5060079097747803, "learning_rate": 7.2393884947009745e-06, "loss": 0.7061, "step": 2510 }, { "epoch": 7.527123082678639, "grad_norm": 1.623346209526062, "learning_rate": 7.156839348421279e-06, "loss": 0.6958, "step": 2515 }, { "epoch": 7.542087542087542, "grad_norm": 1.3768142461776733, "learning_rate": 7.074684857319927e-06, "loss": 0.7661, "step": 2520 }, { "epoch": 7.557052001496446, "grad_norm": 1.7065874338150024, "learning_rate": 6.992926838490657e-06, "loss": 0.6989, "step": 2525 }, { "epoch": 7.57201646090535, "grad_norm": 1.4630271196365356, "learning_rate": 6.91156710025802e-06, "loss": 0.761, "step": 2530 }, { "epoch": 7.586980920314254, "grad_norm": 1.3342783451080322, "learning_rate": 6.830607442137405e-06, "loss": 0.6834, "step": 2535 }, { "epoch": 7.6019453797231575, "grad_norm": 1.3920519351959229, "learning_rate": 6.7500496547951984e-06, "loss": 0.6939, "step": 2540 }, { "epoch": 7.616909839132061, "grad_norm": 1.4310715198516846, "learning_rate": 6.6698955200092396e-06, "loss": 0.6789, "step": 2545 }, { "epoch": 7.631874298540965, "grad_norm": 1.2729769945144653, "learning_rate": 6.590146810629347e-06, "loss": 0.6925, "step": 2550 }, { "epoch": 7.646838757949869, "grad_norm": 1.2772436141967773, "learning_rate": 6.510805290538158e-06, "loss": 0.6714, "step": 2555 }, { "epoch": 7.661803217358773, "grad_norm": 1.3461037874221802, "learning_rate": 6.431872714612072e-06, "loss": 0.6973, "step": 2560 }, { "epoch": 7.6767676767676765, "grad_norm": 1.2915376424789429, "learning_rate": 6.353350828682494e-06, "loss": 0.6669, "step": 2565 }, { "epoch": 7.69173213617658, "grad_norm": 1.287246584892273, "learning_rate": 6.275241369497142e-06, "loss": 0.7157, "step": 2570 }, { "epoch": 7.706696595585484, "grad_norm": 1.4065686464309692, "learning_rate": 6.197546064681714e-06, "loss": 0.7474, "step": 2575 }, { "epoch": 7.721661054994389, "grad_norm": 1.5173590183258057, "learning_rate": 6.120266632701599e-06, "loss": 0.6442, "step": 2580 }, { "epoch": 7.7366255144032925, "grad_norm": 1.2145261764526367, "learning_rate": 6.043404782823939e-06, "loss": 0.6729, "step": 2585 }, { "epoch": 7.751589973812196, "grad_norm": 1.3860505819320679, "learning_rate": 5.966962215079786e-06, "loss": 0.7085, "step": 2590 }, { "epoch": 7.7665544332211, "grad_norm": 1.2852251529693604, "learning_rate": 5.890940620226479e-06, "loss": 0.6983, "step": 2595 }, { "epoch": 7.781518892630004, "grad_norm": 1.2326298952102661, "learning_rate": 5.815341679710326e-06, "loss": 0.6758, "step": 2600 }, { "epoch": 7.796483352038908, "grad_norm": 1.2480541467666626, "learning_rate": 5.740167065629312e-06, "loss": 0.6605, "step": 2605 }, { "epoch": 7.811447811447811, "grad_norm": 1.2479559183120728, "learning_rate": 5.665418440696202e-06, "loss": 0.6348, "step": 2610 }, { "epoch": 7.826412270856715, "grad_norm": 1.3373992443084717, "learning_rate": 5.591097458201699e-06, "loss": 0.746, "step": 2615 }, { "epoch": 7.841376730265619, "grad_norm": 1.3737704753875732, "learning_rate": 5.51720576197794e-06, "loss": 0.6511, "step": 2620 }, { "epoch": 7.856341189674523, "grad_norm": 1.3783513307571411, "learning_rate": 5.443744986362071e-06, "loss": 0.6767, "step": 2625 }, { "epoch": 7.871305649083427, "grad_norm": 1.2600133419036865, "learning_rate": 5.370716756160157e-06, "loss": 0.6918, "step": 2630 }, { "epoch": 7.88627010849233, "grad_norm": 1.254599928855896, "learning_rate": 5.298122686611212e-06, "loss": 0.7017, "step": 2635 }, { "epoch": 7.901234567901234, "grad_norm": 1.2620840072631836, "learning_rate": 5.2259643833514896e-06, "loss": 0.7181, "step": 2640 }, { "epoch": 7.916199027310139, "grad_norm": 1.2185419797897339, "learning_rate": 5.154243442378934e-06, "loss": 0.7121, "step": 2645 }, { "epoch": 7.931163486719043, "grad_norm": 1.360809564590454, "learning_rate": 5.082961450017943e-06, "loss": 0.6642, "step": 2650 }, { "epoch": 7.946127946127946, "grad_norm": 1.3635886907577515, "learning_rate": 5.012119982884209e-06, "loss": 0.7676, "step": 2655 }, { "epoch": 7.96109240553685, "grad_norm": 1.37740159034729, "learning_rate": 4.9417206078499115e-06, "loss": 0.6912, "step": 2660 }, { "epoch": 7.976056864945754, "grad_norm": 1.2868249416351318, "learning_rate": 4.871764882009025e-06, "loss": 0.6582, "step": 2665 }, { "epoch": 7.991021324354658, "grad_norm": 1.4278684854507446, "learning_rate": 4.802254352642882e-06, "loss": 0.6806, "step": 2670 }, { "epoch": 8.005985783763562, "grad_norm": 1.2541025876998901, "learning_rate": 4.7331905571859705e-06, "loss": 0.6896, "step": 2675 }, { "epoch": 8.020950243172466, "grad_norm": 1.2635290622711182, "learning_rate": 4.664575023191886e-06, "loss": 0.6491, "step": 2680 }, { "epoch": 8.035914702581369, "grad_norm": 1.266473412513733, "learning_rate": 4.5964092682996065e-06, "loss": 0.6457, "step": 2685 }, { "epoch": 8.050879161990274, "grad_norm": 1.4658360481262207, "learning_rate": 4.528694800199859e-06, "loss": 0.673, "step": 2690 }, { "epoch": 8.065843621399177, "grad_norm": 1.3015804290771484, "learning_rate": 4.46143311660184e-06, "loss": 0.661, "step": 2695 }, { "epoch": 8.080808080808081, "grad_norm": 1.334692358970642, "learning_rate": 4.394625705200011e-06, "loss": 0.7065, "step": 2700 }, { "epoch": 8.095772540216984, "grad_norm": 1.2139922380447388, "learning_rate": 4.328274043641295e-06, "loss": 0.7074, "step": 2705 }, { "epoch": 8.110736999625889, "grad_norm": 1.2450037002563477, "learning_rate": 4.262379599492283e-06, "loss": 0.666, "step": 2710 }, { "epoch": 8.125701459034792, "grad_norm": 1.3340483903884888, "learning_rate": 4.196943830206859e-06, "loss": 0.6469, "step": 2715 }, { "epoch": 8.140665918443696, "grad_norm": 1.3370238542556763, "learning_rate": 4.131968183093912e-06, "loss": 0.6642, "step": 2720 }, { "epoch": 8.1556303778526, "grad_norm": 1.2851170301437378, "learning_rate": 4.067454095285362e-06, "loss": 0.6602, "step": 2725 }, { "epoch": 8.170594837261504, "grad_norm": 1.5661766529083252, "learning_rate": 4.003402993704353e-06, "loss": 0.6465, "step": 2730 }, { "epoch": 8.185559296670407, "grad_norm": 1.2045555114746094, "learning_rate": 3.939816295033677e-06, "loss": 0.6823, "step": 2735 }, { "epoch": 8.200523756079312, "grad_norm": 1.3167060613632202, "learning_rate": 3.8766954056844855e-06, "loss": 0.7163, "step": 2740 }, { "epoch": 8.215488215488216, "grad_norm": 1.3332468271255493, "learning_rate": 3.8140417217651438e-06, "loss": 0.7558, "step": 2745 }, { "epoch": 8.23045267489712, "grad_norm": 1.344228744506836, "learning_rate": 3.7518566290503626e-06, "loss": 0.7451, "step": 2750 }, { "epoch": 8.245417134306024, "grad_norm": 1.346323847770691, "learning_rate": 3.690141502950542e-06, "loss": 0.6998, "step": 2755 }, { "epoch": 8.260381593714927, "grad_norm": 1.3617771863937378, "learning_rate": 3.6288977084813767e-06, "loss": 0.6885, "step": 2760 }, { "epoch": 8.275346053123831, "grad_norm": 1.2529648542404175, "learning_rate": 3.568126600233615e-06, "loss": 0.6851, "step": 2765 }, { "epoch": 8.290310512532734, "grad_norm": 1.4627494812011719, "learning_rate": 3.5078295223431536e-06, "loss": 0.7307, "step": 2770 }, { "epoch": 8.305274971941639, "grad_norm": 1.3447396755218506, "learning_rate": 3.4480078084612677e-06, "loss": 0.6878, "step": 2775 }, { "epoch": 8.320239431350542, "grad_norm": 1.2098687887191772, "learning_rate": 3.388662781725141e-06, "loss": 0.6968, "step": 2780 }, { "epoch": 8.335203890759447, "grad_norm": 1.2697949409484863, "learning_rate": 3.3297957547285626e-06, "loss": 0.7097, "step": 2785 }, { "epoch": 8.35016835016835, "grad_norm": 1.344548225402832, "learning_rate": 3.2714080294929477e-06, "loss": 0.6899, "step": 2790 }, { "epoch": 8.365132809577254, "grad_norm": 1.283050298690796, "learning_rate": 3.2135008974384874e-06, "loss": 0.611, "step": 2795 }, { "epoch": 8.380097268986157, "grad_norm": 1.4077322483062744, "learning_rate": 3.1560756393556183e-06, "loss": 0.6673, "step": 2800 }, { "epoch": 8.395061728395062, "grad_norm": 1.4759045839309692, "learning_rate": 3.0991335253766934e-06, "loss": 0.7485, "step": 2805 }, { "epoch": 8.410026187803966, "grad_norm": 1.3058305978775024, "learning_rate": 3.042675814947868e-06, "loss": 0.6873, "step": 2810 }, { "epoch": 8.42499064721287, "grad_norm": 1.3055214881896973, "learning_rate": 2.986703756801257e-06, "loss": 0.7064, "step": 2815 }, { "epoch": 8.439955106621774, "grad_norm": 1.2436131238937378, "learning_rate": 2.931218588927315e-06, "loss": 0.6871, "step": 2820 }, { "epoch": 8.454919566030677, "grad_norm": 1.5080686807632446, "learning_rate": 2.8762215385474633e-06, "loss": 0.7363, "step": 2825 }, { "epoch": 8.469884025439582, "grad_norm": 1.3684037923812866, "learning_rate": 2.8217138220869187e-06, "loss": 0.6719, "step": 2830 }, { "epoch": 8.484848484848484, "grad_norm": 1.3375248908996582, "learning_rate": 2.7676966451478214e-06, "loss": 0.6715, "step": 2835 }, { "epoch": 8.499812944257389, "grad_norm": 1.4447715282440186, "learning_rate": 2.714171202482538e-06, "loss": 0.6697, "step": 2840 }, { "epoch": 8.514777403666292, "grad_norm": 1.4097157716751099, "learning_rate": 2.661138677967279e-06, "loss": 0.7199, "step": 2845 }, { "epoch": 8.529741863075197, "grad_norm": 1.4371775388717651, "learning_rate": 2.6086002445758566e-06, "loss": 0.681, "step": 2850 }, { "epoch": 8.5447063224841, "grad_norm": 1.353463053703308, "learning_rate": 2.5565570643537954e-06, "loss": 0.6461, "step": 2855 }, { "epoch": 8.559670781893004, "grad_norm": 1.2656768560409546, "learning_rate": 2.505010288392587e-06, "loss": 0.723, "step": 2860 }, { "epoch": 8.574635241301909, "grad_norm": 1.3458527326583862, "learning_rate": 2.4539610568042657e-06, "loss": 0.6481, "step": 2865 }, { "epoch": 8.589599700710812, "grad_norm": 1.4183650016784668, "learning_rate": 2.4034104986961627e-06, "loss": 0.7229, "step": 2870 }, { "epoch": 8.604564160119716, "grad_norm": 1.3535906076431274, "learning_rate": 2.3533597321459516e-06, "loss": 0.6762, "step": 2875 }, { "epoch": 8.61952861952862, "grad_norm": 1.4276947975158691, "learning_rate": 2.303809864176909e-06, "loss": 0.6379, "step": 2880 }, { "epoch": 8.634493078937524, "grad_norm": 1.312292218208313, "learning_rate": 2.254761990733445e-06, "loss": 0.6753, "step": 2885 }, { "epoch": 8.649457538346427, "grad_norm": 1.3349074125289917, "learning_rate": 2.206217196656826e-06, "loss": 0.7395, "step": 2890 }, { "epoch": 8.664421997755332, "grad_norm": 1.367660403251648, "learning_rate": 2.1581765556612233e-06, "loss": 0.7564, "step": 2895 }, { "epoch": 8.679386457164235, "grad_norm": 1.302215337753296, "learning_rate": 2.1106411303099455e-06, "loss": 0.6862, "step": 2900 }, { "epoch": 8.69435091657314, "grad_norm": 1.2132118940353394, "learning_rate": 2.0636119719919246e-06, "loss": 0.7351, "step": 2905 }, { "epoch": 8.709315375982042, "grad_norm": 1.4168857336044312, "learning_rate": 2.017090120898485e-06, "loss": 0.6748, "step": 2910 }, { "epoch": 8.724279835390947, "grad_norm": 1.5280455350875854, "learning_rate": 1.971076606000327e-06, "loss": 0.6935, "step": 2915 }, { "epoch": 8.73924429479985, "grad_norm": 1.440262794494629, "learning_rate": 1.9255724450247674e-06, "loss": 0.6629, "step": 2920 }, { "epoch": 8.754208754208754, "grad_norm": 1.36149263381958, "learning_rate": 1.8805786444332092e-06, "loss": 0.6644, "step": 2925 }, { "epoch": 8.769173213617659, "grad_norm": 1.3209813833236694, "learning_rate": 1.836096199398929e-06, "loss": 0.6469, "step": 2930 }, { "epoch": 8.784137673026562, "grad_norm": 1.3598469495773315, "learning_rate": 1.7921260937850099e-06, "loss": 0.646, "step": 2935 }, { "epoch": 8.799102132435467, "grad_norm": 1.2802332639694214, "learning_rate": 1.7486693001226268e-06, "loss": 0.7487, "step": 2940 }, { "epoch": 8.81406659184437, "grad_norm": 1.3110156059265137, "learning_rate": 1.7057267795895115e-06, "loss": 0.702, "step": 2945 }, { "epoch": 8.829031051253274, "grad_norm": 1.324245572090149, "learning_rate": 1.6632994819886977e-06, "loss": 0.6807, "step": 2950 }, { "epoch": 8.843995510662177, "grad_norm": 1.2745212316513062, "learning_rate": 1.6213883457275065e-06, "loss": 0.6846, "step": 2955 }, { "epoch": 8.858959970071082, "grad_norm": 1.4197077751159668, "learning_rate": 1.579994297796808e-06, "loss": 0.7325, "step": 2960 }, { "epoch": 8.873924429479985, "grad_norm": 1.3314228057861328, "learning_rate": 1.5391182537505072e-06, "loss": 0.6899, "step": 2965 }, { "epoch": 8.88888888888889, "grad_norm": 1.3566371202468872, "learning_rate": 1.4987611176852878e-06, "loss": 0.6596, "step": 2970 }, { "epoch": 8.903853348297792, "grad_norm": 1.3632760047912598, "learning_rate": 1.4589237822206282e-06, "loss": 0.7111, "step": 2975 }, { "epoch": 8.918817807706697, "grad_norm": 1.4764022827148438, "learning_rate": 1.419607128479053e-06, "loss": 0.7168, "step": 2980 }, { "epoch": 8.9337822671156, "grad_norm": 1.1871962547302246, "learning_rate": 1.3808120260666441e-06, "loss": 0.7182, "step": 2985 }, { "epoch": 8.948746726524504, "grad_norm": 1.2561469078063965, "learning_rate": 1.3425393330538022e-06, "loss": 0.6455, "step": 2990 }, { "epoch": 8.963711185933409, "grad_norm": 1.4918162822723389, "learning_rate": 1.3047898959562765e-06, "loss": 0.7042, "step": 2995 }, { "epoch": 8.978675645342312, "grad_norm": 1.3534742593765259, "learning_rate": 1.267564549716435e-06, "loss": 0.6742, "step": 3000 }, { "epoch": 8.993640104751217, "grad_norm": 1.4959015846252441, "learning_rate": 1.2308641176848046e-06, "loss": 0.6838, "step": 3005 }, { "epoch": 9.00860456416012, "grad_norm": 1.309097409248352, "learning_rate": 1.1946894116018404e-06, "loss": 0.6411, "step": 3010 }, { "epoch": 9.023569023569024, "grad_norm": 1.3958250284194946, "learning_rate": 1.159041231580016e-06, "loss": 0.7136, "step": 3015 }, { "epoch": 9.038533482977927, "grad_norm": 1.307607650756836, "learning_rate": 1.1239203660860648e-06, "loss": 0.7436, "step": 3020 }, { "epoch": 9.053497942386832, "grad_norm": 1.273493766784668, "learning_rate": 1.0893275919235945e-06, "loss": 0.7149, "step": 3025 }, { "epoch": 9.068462401795735, "grad_norm": 1.4512149095535278, "learning_rate": 1.05526367421587e-06, "loss": 0.7207, "step": 3030 }, { "epoch": 9.08342686120464, "grad_norm": 1.3597697019577026, "learning_rate": 1.0217293663889155e-06, "loss": 0.6602, "step": 3035 }, { "epoch": 9.098391320613542, "grad_norm": 1.4251606464385986, "learning_rate": 9.88725410154842e-07, "loss": 0.7312, "step": 3040 }, { "epoch": 9.113355780022447, "grad_norm": 1.3595529794692993, "learning_rate": 9.562525354954193e-07, "loss": 0.7044, "step": 3045 }, { "epoch": 9.12832023943135, "grad_norm": 1.2834125757217407, "learning_rate": 9.243114606459741e-07, "loss": 0.7221, "step": 3050 }, { "epoch": 9.143284698840255, "grad_norm": 1.3886545896530151, "learning_rate": 8.92902892079464e-07, "loss": 0.6504, "step": 3055 }, { "epoch": 9.158249158249157, "grad_norm": 1.533457636833191, "learning_rate": 8.620275244908827e-07, "loss": 0.6788, "step": 3060 }, { "epoch": 9.173213617658062, "grad_norm": 1.493024230003357, "learning_rate": 8.31686040781865e-07, "loss": 0.6803, "step": 3065 }, { "epoch": 9.188178077066967, "grad_norm": 1.2318785190582275, "learning_rate": 8.018791120456087e-07, "loss": 0.6904, "step": 3070 }, { "epoch": 9.20314253647587, "grad_norm": 1.4301903247833252, "learning_rate": 7.726073975520082e-07, "loss": 0.6777, "step": 3075 }, { "epoch": 9.218106995884774, "grad_norm": 1.322068452835083, "learning_rate": 7.438715447331018e-07, "loss": 0.685, "step": 3080 }, { "epoch": 9.233071455293677, "grad_norm": 1.2603065967559814, "learning_rate": 7.156721891687202e-07, "loss": 0.6712, "step": 3085 }, { "epoch": 9.248035914702582, "grad_norm": 1.4191964864730835, "learning_rate": 6.880099545724522e-07, "loss": 0.7124, "step": 3090 }, { "epoch": 9.263000374111485, "grad_norm": 1.411106824874878, "learning_rate": 6.608854527778319e-07, "loss": 0.6788, "step": 3095 }, { "epoch": 9.27796483352039, "grad_norm": 1.3679730892181396, "learning_rate": 6.342992837248235e-07, "loss": 0.69, "step": 3100 }, { "epoch": 9.292929292929292, "grad_norm": 1.2826892137527466, "learning_rate": 6.082520354465382e-07, "loss": 0.7124, "step": 3105 }, { "epoch": 9.307893752338197, "grad_norm": 1.2693568468093872, "learning_rate": 5.82744284056233e-07, "loss": 0.6702, "step": 3110 }, { "epoch": 9.3228582117471, "grad_norm": 1.512661099433899, "learning_rate": 5.577765937345686e-07, "loss": 0.663, "step": 3115 }, { "epoch": 9.337822671156005, "grad_norm": 1.5203378200531006, "learning_rate": 5.333495167171353e-07, "loss": 0.6927, "step": 3120 }, { "epoch": 9.352787130564908, "grad_norm": 1.541284203529358, "learning_rate": 5.094635932822223e-07, "loss": 0.6629, "step": 3125 }, { "epoch": 9.367751589973812, "grad_norm": 1.2277456521987915, "learning_rate": 4.861193517388923e-07, "loss": 0.7342, "step": 3130 }, { "epoch": 9.382716049382717, "grad_norm": 1.3728615045547485, "learning_rate": 4.6331730841527587e-07, "loss": 0.6597, "step": 3135 }, { "epoch": 9.39768050879162, "grad_norm": 1.2458422183990479, "learning_rate": 4.4105796764715714e-07, "loss": 0.6654, "step": 3140 }, { "epoch": 9.412644968200524, "grad_norm": 1.41146981716156, "learning_rate": 4.1934182176683045e-07, "loss": 0.7134, "step": 3145 }, { "epoch": 9.427609427609427, "grad_norm": 1.306872010231018, "learning_rate": 3.9816935109218413e-07, "loss": 0.6154, "step": 3150 }, { "epoch": 9.442573887018332, "grad_norm": 1.3811511993408203, "learning_rate": 3.7754102391611424e-07, "loss": 0.6862, "step": 3155 }, { "epoch": 9.457538346427235, "grad_norm": 1.2896925210952759, "learning_rate": 3.5745729649613034e-07, "loss": 0.6778, "step": 3160 }, { "epoch": 9.47250280583614, "grad_norm": 1.1952729225158691, "learning_rate": 3.3791861304428574e-07, "loss": 0.6891, "step": 3165 }, { "epoch": 9.487467265245042, "grad_norm": 1.3523303270339966, "learning_rate": 3.189254057173491e-07, "loss": 0.6576, "step": 3170 }, { "epoch": 9.502431724653947, "grad_norm": 1.3650611639022827, "learning_rate": 3.004780946072372e-07, "loss": 0.6533, "step": 3175 }, { "epoch": 9.51739618406285, "grad_norm": 1.2557883262634277, "learning_rate": 2.825770877317363e-07, "loss": 0.7639, "step": 3180 }, { "epoch": 9.532360643471755, "grad_norm": 1.24583899974823, "learning_rate": 2.6522278102546485e-07, "loss": 0.6856, "step": 3185 }, { "epoch": 9.547325102880658, "grad_norm": 1.48171865940094, "learning_rate": 2.484155583311276e-07, "loss": 0.6486, "step": 3190 }, { "epoch": 9.562289562289562, "grad_norm": 1.2459851503372192, "learning_rate": 2.3215579139101996e-07, "loss": 0.6377, "step": 3195 }, { "epoch": 9.577254021698467, "grad_norm": 1.1139715909957886, "learning_rate": 2.1644383983880357e-07, "loss": 0.6703, "step": 3200 }, { "epoch": 9.59221848110737, "grad_norm": 1.4323070049285889, "learning_rate": 2.012800511915547e-07, "loss": 0.6743, "step": 3205 }, { "epoch": 9.607182940516275, "grad_norm": 1.3250705003738403, "learning_rate": 1.8666476084208129e-07, "loss": 0.7117, "step": 3210 }, { "epoch": 9.622147399925177, "grad_norm": 1.4704447984695435, "learning_rate": 1.7259829205149568e-07, "loss": 0.6817, "step": 3215 }, { "epoch": 9.637111859334082, "grad_norm": 1.2608110904693604, "learning_rate": 1.5908095594207583e-07, "loss": 0.7122, "step": 3220 }, { "epoch": 9.652076318742985, "grad_norm": 1.4275965690612793, "learning_rate": 1.4611305149037358e-07, "loss": 0.6386, "step": 3225 }, { "epoch": 9.66704077815189, "grad_norm": 1.163145899772644, "learning_rate": 1.336948655206144e-07, "loss": 0.6882, "step": 3230 }, { "epoch": 9.682005237560793, "grad_norm": 1.4491647481918335, "learning_rate": 1.218266726983386e-07, "loss": 0.6826, "step": 3235 }, { "epoch": 9.696969696969697, "grad_norm": 1.3578821420669556, "learning_rate": 1.1050873552433394e-07, "loss": 0.7251, "step": 3240 }, { "epoch": 9.7119341563786, "grad_norm": 1.2743161916732788, "learning_rate": 9.974130432883199e-08, "loss": 0.7072, "step": 3245 }, { "epoch": 9.726898615787505, "grad_norm": 1.2915595769882202, "learning_rate": 8.952461726596528e-08, "loss": 0.6555, "step": 3250 }, { "epoch": 9.741863075196408, "grad_norm": 1.2591161727905273, "learning_rate": 7.985890030850762e-08, "loss": 0.6642, "step": 3255 }, { "epoch": 9.756827534605312, "grad_norm": 1.373780369758606, "learning_rate": 7.074436724286704e-08, "loss": 0.6987, "step": 3260 }, { "epoch": 9.771791994014217, "grad_norm": 1.3390823602676392, "learning_rate": 6.218121966436175e-08, "loss": 0.7699, "step": 3265 }, { "epoch": 9.78675645342312, "grad_norm": 1.4248472452163696, "learning_rate": 5.416964697276261e-08, "loss": 0.6654, "step": 3270 }, { "epoch": 9.801720912832025, "grad_norm": 1.315335988998413, "learning_rate": 4.670982636810761e-08, "loss": 0.6681, "step": 3275 }, { "epoch": 9.816685372240928, "grad_norm": 1.28786039352417, "learning_rate": 3.9801922846766095e-08, "loss": 0.7033, "step": 3280 }, { "epoch": 9.831649831649832, "grad_norm": 1.4623719453811646, "learning_rate": 3.3446089197805565e-08, "loss": 0.6899, "step": 3285 }, { "epoch": 9.846614291058735, "grad_norm": 1.390443205833435, "learning_rate": 2.7642465999613842e-08, "loss": 0.6837, "step": 3290 }, { "epoch": 9.86157875046764, "grad_norm": 1.2957769632339478, "learning_rate": 2.2391181616776556e-08, "loss": 0.6578, "step": 3295 }, { "epoch": 9.876543209876543, "grad_norm": 1.454103708267212, "learning_rate": 1.7692352197240526e-08, "loss": 0.6546, "step": 3300 }, { "epoch": 9.891507669285447, "grad_norm": 1.2412161827087402, "learning_rate": 1.354608166976301e-08, "loss": 0.6437, "step": 3305 }, { "epoch": 9.90647212869435, "grad_norm": 1.3440452814102173, "learning_rate": 9.952461741585817e-09, "loss": 0.726, "step": 3310 }, { "epoch": 9.921436588103255, "grad_norm": 1.402801513671875, "learning_rate": 6.9115718964257726e-09, "loss": 0.6458, "step": 3315 }, { "epoch": 9.936401047512158, "grad_norm": 1.253630518913269, "learning_rate": 4.423479392709484e-09, "loss": 0.6936, "step": 3320 }, { "epoch": 9.951365506921062, "grad_norm": 1.3306463956832886, "learning_rate": 2.48823926208841e-09, "loss": 0.7048, "step": 3325 }, { "epoch": 9.966329966329967, "grad_norm": 1.2049169540405273, "learning_rate": 1.10589430822039e-09, "loss": 0.6899, "step": 3330 }, { "epoch": 9.98129442573887, "grad_norm": 1.4228817224502563, "learning_rate": 2.764751058259574e-10, "loss": 0.7091, "step": 3335 }, { "epoch": 9.996258885147775, "grad_norm": 1.234875202178955, "learning_rate": 0.0, "loss": 0.6954, "step": 3340 }, { "epoch": 9.996258885147775, "step": 3340, "total_flos": 1.2597949543307674e+18, "train_loss": 0.7376079930516775, "train_runtime": 29450.1993, "train_samples_per_second": 1.815, "train_steps_per_second": 0.113 } ], "logging_steps": 5, "max_steps": 3340, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.2597949543307674e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }