{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999155096178218, "eval_steps": 500, "global_step": 4438, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00022530768580843213, "grad_norm": 1.0947727710467006, "learning_rate": 2.2522522522522524e-08, "loss": 0.3315, "step": 1 }, { "epoch": 0.00045061537161686426, "grad_norm": 1.1056270284189085, "learning_rate": 4.504504504504505e-08, "loss": 0.3365, "step": 2 }, { "epoch": 0.0006759230574252964, "grad_norm": 1.1038791662741807, "learning_rate": 6.756756756756757e-08, "loss": 0.3123, "step": 3 }, { "epoch": 0.0009012307432337285, "grad_norm": 1.1319684508304422, "learning_rate": 9.00900900900901e-08, "loss": 0.3496, "step": 4 }, { "epoch": 0.0011265384290421608, "grad_norm": 1.0950409822450369, "learning_rate": 1.1261261261261262e-07, "loss": 0.3356, "step": 5 }, { "epoch": 0.0013518461148505929, "grad_norm": 1.110496283834593, "learning_rate": 1.3513513513513515e-07, "loss": 0.3396, "step": 6 }, { "epoch": 0.001577153800659025, "grad_norm": 1.0802753121404411, "learning_rate": 1.5765765765765766e-07, "loss": 0.3135, "step": 7 }, { "epoch": 0.001802461486467457, "grad_norm": 1.1166340984108152, "learning_rate": 1.801801801801802e-07, "loss": 0.3234, "step": 8 }, { "epoch": 0.0020277691722758893, "grad_norm": 1.0777930663732238, "learning_rate": 2.0270270270270273e-07, "loss": 0.3248, "step": 9 }, { "epoch": 0.0022530768580843216, "grad_norm": 1.0647347889926293, "learning_rate": 2.2522522522522524e-07, "loss": 0.3323, "step": 10 }, { "epoch": 0.0024783845438927535, "grad_norm": 1.0702813501737287, "learning_rate": 2.477477477477478e-07, "loss": 0.3538, "step": 11 }, { "epoch": 0.0027036922297011858, "grad_norm": 1.0776989359324258, "learning_rate": 2.702702702702703e-07, "loss": 0.3381, "step": 12 }, { "epoch": 0.0029289999155096176, "grad_norm": 1.0547392159345734, "learning_rate": 2.927927927927928e-07, "loss": 0.325, "step": 13 }, { "epoch": 0.00315430760131805, "grad_norm": 1.1128330063858483, "learning_rate": 3.153153153153153e-07, "loss": 0.3457, "step": 14 }, { "epoch": 0.003379615287126482, "grad_norm": 1.026561868913419, "learning_rate": 3.378378378378379e-07, "loss": 0.3302, "step": 15 }, { "epoch": 0.003604922972934914, "grad_norm": 1.0844063716667791, "learning_rate": 3.603603603603604e-07, "loss": 0.3511, "step": 16 }, { "epoch": 0.0038302306587433463, "grad_norm": 1.0432870057572243, "learning_rate": 3.828828828828829e-07, "loss": 0.3313, "step": 17 }, { "epoch": 0.004055538344551779, "grad_norm": 1.0877367387628776, "learning_rate": 4.0540540540540546e-07, "loss": 0.3554, "step": 18 }, { "epoch": 0.0042808460303602105, "grad_norm": 1.055171840020527, "learning_rate": 4.27927927927928e-07, "loss": 0.3501, "step": 19 }, { "epoch": 0.004506153716168643, "grad_norm": 1.0226585393284515, "learning_rate": 4.504504504504505e-07, "loss": 0.3312, "step": 20 }, { "epoch": 0.004731461401977075, "grad_norm": 0.9662290728315617, "learning_rate": 4.7297297297297305e-07, "loss": 0.3339, "step": 21 }, { "epoch": 0.004956769087785507, "grad_norm": 0.9497175074415394, "learning_rate": 4.954954954954956e-07, "loss": 0.3321, "step": 22 }, { "epoch": 0.00518207677359394, "grad_norm": 0.9901701348569355, "learning_rate": 5.180180180180181e-07, "loss": 0.3265, "step": 23 }, { "epoch": 0.0054073844594023715, "grad_norm": 0.8828316210132122, "learning_rate": 5.405405405405406e-07, "loss": 0.3109, "step": 24 }, { "epoch": 0.005632692145210803, "grad_norm": 0.906520614071178, "learning_rate": 5.630630630630631e-07, "loss": 0.3252, "step": 25 }, { "epoch": 0.005857999831019235, "grad_norm": 0.8837738853128801, "learning_rate": 5.855855855855856e-07, "loss": 0.3313, "step": 26 }, { "epoch": 0.006083307516827668, "grad_norm": 0.8839821083094102, "learning_rate": 6.081081081081082e-07, "loss": 0.3475, "step": 27 }, { "epoch": 0.0063086152026361, "grad_norm": 0.8092230130785764, "learning_rate": 6.306306306306306e-07, "loss": 0.3134, "step": 28 }, { "epoch": 0.006533922888444532, "grad_norm": 0.7697471952605461, "learning_rate": 6.531531531531532e-07, "loss": 0.3143, "step": 29 }, { "epoch": 0.006759230574252964, "grad_norm": 0.7360077029834586, "learning_rate": 6.756756756756758e-07, "loss": 0.2975, "step": 30 }, { "epoch": 0.006984538260061396, "grad_norm": 0.7303882904067905, "learning_rate": 6.981981981981982e-07, "loss": 0.3138, "step": 31 }, { "epoch": 0.007209845945869828, "grad_norm": 0.7331410718564746, "learning_rate": 7.207207207207208e-07, "loss": 0.3118, "step": 32 }, { "epoch": 0.007435153631678261, "grad_norm": 0.7585176643664102, "learning_rate": 7.432432432432434e-07, "loss": 0.3238, "step": 33 }, { "epoch": 0.007660461317486693, "grad_norm": 0.6929989393254152, "learning_rate": 7.657657657657658e-07, "loss": 0.2965, "step": 34 }, { "epoch": 0.007885769003295125, "grad_norm": 0.7248401729174301, "learning_rate": 7.882882882882883e-07, "loss": 0.3134, "step": 35 }, { "epoch": 0.008111076689103557, "grad_norm": 0.6846514226447453, "learning_rate": 8.108108108108109e-07, "loss": 0.2876, "step": 36 }, { "epoch": 0.008336384374911989, "grad_norm": 0.6848701434990109, "learning_rate": 8.333333333333333e-07, "loss": 0.2921, "step": 37 }, { "epoch": 0.008561692060720421, "grad_norm": 0.684924752449004, "learning_rate": 8.55855855855856e-07, "loss": 0.3097, "step": 38 }, { "epoch": 0.008786999746528853, "grad_norm": 0.6598684646273593, "learning_rate": 8.783783783783785e-07, "loss": 0.2996, "step": 39 }, { "epoch": 0.009012307432337286, "grad_norm": 0.651883046062235, "learning_rate": 9.00900900900901e-07, "loss": 0.2918, "step": 40 }, { "epoch": 0.009237615118145718, "grad_norm": 0.6607351963566211, "learning_rate": 9.234234234234235e-07, "loss": 0.3159, "step": 41 }, { "epoch": 0.00946292280395415, "grad_norm": 0.5841038932200996, "learning_rate": 9.459459459459461e-07, "loss": 0.2989, "step": 42 }, { "epoch": 0.009688230489762582, "grad_norm": 0.5958613029441983, "learning_rate": 9.684684684684686e-07, "loss": 0.2812, "step": 43 }, { "epoch": 0.009913538175571014, "grad_norm": 0.6099154207191207, "learning_rate": 9.909909909909911e-07, "loss": 0.2793, "step": 44 }, { "epoch": 0.010138845861379446, "grad_norm": 0.6118645853590743, "learning_rate": 1.0135135135135136e-06, "loss": 0.2754, "step": 45 }, { "epoch": 0.01036415354718788, "grad_norm": 0.6542805689870427, "learning_rate": 1.0360360360360361e-06, "loss": 0.2822, "step": 46 }, { "epoch": 0.010589461232996311, "grad_norm": 0.6335055926076487, "learning_rate": 1.0585585585585587e-06, "loss": 0.2655, "step": 47 }, { "epoch": 0.010814768918804743, "grad_norm": 0.6801605196071887, "learning_rate": 1.0810810810810812e-06, "loss": 0.2919, "step": 48 }, { "epoch": 0.011040076604613175, "grad_norm": 0.5974596009318522, "learning_rate": 1.1036036036036037e-06, "loss": 0.2799, "step": 49 }, { "epoch": 0.011265384290421607, "grad_norm": 0.6125372348195963, "learning_rate": 1.1261261261261262e-06, "loss": 0.2759, "step": 50 }, { "epoch": 0.011490691976230039, "grad_norm": 0.5838366991561806, "learning_rate": 1.148648648648649e-06, "loss": 0.278, "step": 51 }, { "epoch": 0.01171599966203847, "grad_norm": 0.6341141792855924, "learning_rate": 1.1711711711711712e-06, "loss": 0.2865, "step": 52 }, { "epoch": 0.011941307347846904, "grad_norm": 0.615509564485624, "learning_rate": 1.1936936936936937e-06, "loss": 0.2848, "step": 53 }, { "epoch": 0.012166615033655336, "grad_norm": 0.5416726529744037, "learning_rate": 1.2162162162162164e-06, "loss": 0.2662, "step": 54 }, { "epoch": 0.012391922719463768, "grad_norm": 0.5686478139322753, "learning_rate": 1.2387387387387387e-06, "loss": 0.3033, "step": 55 }, { "epoch": 0.0126172304052722, "grad_norm": 0.5706950595773004, "learning_rate": 1.2612612612612613e-06, "loss": 0.2808, "step": 56 }, { "epoch": 0.012842538091080631, "grad_norm": 0.5342854152172102, "learning_rate": 1.2837837837837838e-06, "loss": 0.2682, "step": 57 }, { "epoch": 0.013067845776889063, "grad_norm": 0.5320452417181604, "learning_rate": 1.3063063063063065e-06, "loss": 0.2528, "step": 58 }, { "epoch": 0.013293153462697497, "grad_norm": 0.5435487202598747, "learning_rate": 1.328828828828829e-06, "loss": 0.273, "step": 59 }, { "epoch": 0.013518461148505929, "grad_norm": 0.5413964558339608, "learning_rate": 1.3513513513513515e-06, "loss": 0.2731, "step": 60 }, { "epoch": 0.01374376883431436, "grad_norm": 0.5348770326410263, "learning_rate": 1.373873873873874e-06, "loss": 0.2749, "step": 61 }, { "epoch": 0.013969076520122792, "grad_norm": 0.5318413005939099, "learning_rate": 1.3963963963963963e-06, "loss": 0.2672, "step": 62 }, { "epoch": 0.014194384205931224, "grad_norm": 0.4899105874773047, "learning_rate": 1.418918918918919e-06, "loss": 0.2403, "step": 63 }, { "epoch": 0.014419691891739656, "grad_norm": 0.5545085049603222, "learning_rate": 1.4414414414414416e-06, "loss": 0.2961, "step": 64 }, { "epoch": 0.01464499957754809, "grad_norm": 0.529025173817156, "learning_rate": 1.463963963963964e-06, "loss": 0.2717, "step": 65 }, { "epoch": 0.014870307263356522, "grad_norm": 0.5550818812254924, "learning_rate": 1.4864864864864868e-06, "loss": 0.2742, "step": 66 }, { "epoch": 0.015095614949164953, "grad_norm": 0.5398535606143846, "learning_rate": 1.5090090090090093e-06, "loss": 0.2688, "step": 67 }, { "epoch": 0.015320922634973385, "grad_norm": 0.5106467899782415, "learning_rate": 1.5315315315315316e-06, "loss": 0.2476, "step": 68 }, { "epoch": 0.015546230320781817, "grad_norm": 0.5333505346961758, "learning_rate": 1.5540540540540541e-06, "loss": 0.2584, "step": 69 }, { "epoch": 0.01577153800659025, "grad_norm": 0.5256415895130913, "learning_rate": 1.5765765765765766e-06, "loss": 0.2711, "step": 70 }, { "epoch": 0.015996845692398683, "grad_norm": 0.5184587920114005, "learning_rate": 1.5990990990990993e-06, "loss": 0.2768, "step": 71 }, { "epoch": 0.016222153378207115, "grad_norm": 0.5163158029073385, "learning_rate": 1.6216216216216219e-06, "loss": 0.2701, "step": 72 }, { "epoch": 0.016447461064015546, "grad_norm": 0.48032723537669814, "learning_rate": 1.6441441441441444e-06, "loss": 0.2344, "step": 73 }, { "epoch": 0.016672768749823978, "grad_norm": 0.48140088563311695, "learning_rate": 1.6666666666666667e-06, "loss": 0.2498, "step": 74 }, { "epoch": 0.01689807643563241, "grad_norm": 0.49290629978075023, "learning_rate": 1.6891891891891894e-06, "loss": 0.2544, "step": 75 }, { "epoch": 0.017123384121440842, "grad_norm": 0.4967869263831517, "learning_rate": 1.711711711711712e-06, "loss": 0.2417, "step": 76 }, { "epoch": 0.017348691807249274, "grad_norm": 0.5096265115174715, "learning_rate": 1.7342342342342344e-06, "loss": 0.2439, "step": 77 }, { "epoch": 0.017573999493057706, "grad_norm": 0.5302741235480698, "learning_rate": 1.756756756756757e-06, "loss": 0.2641, "step": 78 }, { "epoch": 0.017799307178866138, "grad_norm": 0.4832401096007166, "learning_rate": 1.7792792792792792e-06, "loss": 0.2385, "step": 79 }, { "epoch": 0.018024614864674573, "grad_norm": 0.5169432406664927, "learning_rate": 1.801801801801802e-06, "loss": 0.2616, "step": 80 }, { "epoch": 0.018249922550483005, "grad_norm": 0.4810264047446268, "learning_rate": 1.8243243243243245e-06, "loss": 0.235, "step": 81 }, { "epoch": 0.018475230236291437, "grad_norm": 0.5064455552910841, "learning_rate": 1.846846846846847e-06, "loss": 0.2502, "step": 82 }, { "epoch": 0.01870053792209987, "grad_norm": 0.5252312972000183, "learning_rate": 1.8693693693693697e-06, "loss": 0.2527, "step": 83 }, { "epoch": 0.0189258456079083, "grad_norm": 0.5152115480424759, "learning_rate": 1.8918918918918922e-06, "loss": 0.252, "step": 84 }, { "epoch": 0.019151153293716732, "grad_norm": 0.4792935284601226, "learning_rate": 1.9144144144144145e-06, "loss": 0.2239, "step": 85 }, { "epoch": 0.019376460979525164, "grad_norm": 0.46930021033135166, "learning_rate": 1.9369369369369372e-06, "loss": 0.226, "step": 86 }, { "epoch": 0.019601768665333596, "grad_norm": 0.5119347999323998, "learning_rate": 1.9594594594594595e-06, "loss": 0.2479, "step": 87 }, { "epoch": 0.019827076351142028, "grad_norm": 0.5168060113294949, "learning_rate": 1.9819819819819822e-06, "loss": 0.2448, "step": 88 }, { "epoch": 0.02005238403695046, "grad_norm": 0.5163631879799307, "learning_rate": 2.0045045045045045e-06, "loss": 0.2382, "step": 89 }, { "epoch": 0.02027769172275889, "grad_norm": 0.5159666924757886, "learning_rate": 2.0270270270270273e-06, "loss": 0.2418, "step": 90 }, { "epoch": 0.020502999408567323, "grad_norm": 0.5041283221176179, "learning_rate": 2.0495495495495496e-06, "loss": 0.2659, "step": 91 }, { "epoch": 0.02072830709437576, "grad_norm": 0.5016352886414731, "learning_rate": 2.0720720720720723e-06, "loss": 0.2473, "step": 92 }, { "epoch": 0.02095361478018419, "grad_norm": 0.4805339701948201, "learning_rate": 2.0945945945945946e-06, "loss": 0.2394, "step": 93 }, { "epoch": 0.021178922465992622, "grad_norm": 0.5063248501917516, "learning_rate": 2.1171171171171173e-06, "loss": 0.2531, "step": 94 }, { "epoch": 0.021404230151801054, "grad_norm": 0.5392779967753892, "learning_rate": 2.13963963963964e-06, "loss": 0.2303, "step": 95 }, { "epoch": 0.021629537837609486, "grad_norm": 0.5231578606978835, "learning_rate": 2.1621621621621623e-06, "loss": 0.2611, "step": 96 }, { "epoch": 0.021854845523417918, "grad_norm": 0.49267022849605074, "learning_rate": 2.1846846846846846e-06, "loss": 0.241, "step": 97 }, { "epoch": 0.02208015320922635, "grad_norm": 0.5362283704864, "learning_rate": 2.2072072072072073e-06, "loss": 0.2622, "step": 98 }, { "epoch": 0.02230546089503478, "grad_norm": 0.5223976037466284, "learning_rate": 2.22972972972973e-06, "loss": 0.2646, "step": 99 }, { "epoch": 0.022530768580843213, "grad_norm": 0.4884083903659108, "learning_rate": 2.2522522522522524e-06, "loss": 0.2343, "step": 100 }, { "epoch": 0.022756076266651645, "grad_norm": 0.5131811981346736, "learning_rate": 2.274774774774775e-06, "loss": 0.2254, "step": 101 }, { "epoch": 0.022981383952460077, "grad_norm": 0.563620994127029, "learning_rate": 2.297297297297298e-06, "loss": 0.2339, "step": 102 }, { "epoch": 0.02320669163826851, "grad_norm": 0.4985639954502683, "learning_rate": 2.31981981981982e-06, "loss": 0.2394, "step": 103 }, { "epoch": 0.02343199932407694, "grad_norm": 0.49335868238483493, "learning_rate": 2.3423423423423424e-06, "loss": 0.2311, "step": 104 }, { "epoch": 0.023657307009885376, "grad_norm": 0.5574006983841419, "learning_rate": 2.364864864864865e-06, "loss": 0.2573, "step": 105 }, { "epoch": 0.023882614695693808, "grad_norm": 0.5573967154584135, "learning_rate": 2.3873873873873874e-06, "loss": 0.2409, "step": 106 }, { "epoch": 0.02410792238150224, "grad_norm": 0.5318573843929276, "learning_rate": 2.40990990990991e-06, "loss": 0.2308, "step": 107 }, { "epoch": 0.024333230067310672, "grad_norm": 0.5018813770253644, "learning_rate": 2.432432432432433e-06, "loss": 0.2275, "step": 108 }, { "epoch": 0.024558537753119104, "grad_norm": 0.4919658976142004, "learning_rate": 2.454954954954955e-06, "loss": 0.2452, "step": 109 }, { "epoch": 0.024783845438927535, "grad_norm": 0.5172596284253529, "learning_rate": 2.4774774774774775e-06, "loss": 0.219, "step": 110 }, { "epoch": 0.025009153124735967, "grad_norm": 0.5058848987637898, "learning_rate": 2.5e-06, "loss": 0.2401, "step": 111 }, { "epoch": 0.0252344608105444, "grad_norm": 0.5146673047789411, "learning_rate": 2.5225225225225225e-06, "loss": 0.2626, "step": 112 }, { "epoch": 0.02545976849635283, "grad_norm": 0.4705301441723405, "learning_rate": 2.5450450450450452e-06, "loss": 0.214, "step": 113 }, { "epoch": 0.025685076182161263, "grad_norm": 0.524408956720651, "learning_rate": 2.5675675675675675e-06, "loss": 0.2327, "step": 114 }, { "epoch": 0.025910383867969695, "grad_norm": 0.5300256351060787, "learning_rate": 2.5900900900900907e-06, "loss": 0.2481, "step": 115 }, { "epoch": 0.026135691553778127, "grad_norm": 0.5010212711826415, "learning_rate": 2.612612612612613e-06, "loss": 0.234, "step": 116 }, { "epoch": 0.026360999239586562, "grad_norm": 0.50692577392399, "learning_rate": 2.6351351351351353e-06, "loss": 0.2341, "step": 117 }, { "epoch": 0.026586306925394994, "grad_norm": 0.5013472652726951, "learning_rate": 2.657657657657658e-06, "loss": 0.2334, "step": 118 }, { "epoch": 0.026811614611203426, "grad_norm": 0.49763653854948314, "learning_rate": 2.6801801801801803e-06, "loss": 0.2334, "step": 119 }, { "epoch": 0.027036922297011858, "grad_norm": 0.5058647611858305, "learning_rate": 2.702702702702703e-06, "loss": 0.2319, "step": 120 }, { "epoch": 0.02726222998282029, "grad_norm": 0.5220281062730923, "learning_rate": 2.7252252252252253e-06, "loss": 0.2258, "step": 121 }, { "epoch": 0.02748753766862872, "grad_norm": 0.4902295196707689, "learning_rate": 2.747747747747748e-06, "loss": 0.234, "step": 122 }, { "epoch": 0.027712845354437153, "grad_norm": 0.49334228839146704, "learning_rate": 2.7702702702702703e-06, "loss": 0.2222, "step": 123 }, { "epoch": 0.027938153040245585, "grad_norm": 0.5140530380734651, "learning_rate": 2.7927927927927926e-06, "loss": 0.246, "step": 124 }, { "epoch": 0.028163460726054017, "grad_norm": 0.5054001798341684, "learning_rate": 2.8153153153153158e-06, "loss": 0.2261, "step": 125 }, { "epoch": 0.02838876841186245, "grad_norm": 0.5025232523144892, "learning_rate": 2.837837837837838e-06, "loss": 0.2444, "step": 126 }, { "epoch": 0.02861407609767088, "grad_norm": 0.501416740003468, "learning_rate": 2.860360360360361e-06, "loss": 0.2448, "step": 127 }, { "epoch": 0.028839383783479312, "grad_norm": 0.5001885864063282, "learning_rate": 2.882882882882883e-06, "loss": 0.221, "step": 128 }, { "epoch": 0.029064691469287748, "grad_norm": 0.5058916094632965, "learning_rate": 2.9054054054054054e-06, "loss": 0.2372, "step": 129 }, { "epoch": 0.02928999915509618, "grad_norm": 0.49414108281771374, "learning_rate": 2.927927927927928e-06, "loss": 0.2449, "step": 130 }, { "epoch": 0.02951530684090461, "grad_norm": 0.5274282549164447, "learning_rate": 2.9504504504504504e-06, "loss": 0.2204, "step": 131 }, { "epoch": 0.029740614526713043, "grad_norm": 0.5059068511500548, "learning_rate": 2.9729729729729736e-06, "loss": 0.239, "step": 132 }, { "epoch": 0.029965922212521475, "grad_norm": 0.5403324542543835, "learning_rate": 2.995495495495496e-06, "loss": 0.2427, "step": 133 }, { "epoch": 0.030191229898329907, "grad_norm": 0.5129734450730943, "learning_rate": 3.0180180180180186e-06, "loss": 0.2233, "step": 134 }, { "epoch": 0.03041653758413834, "grad_norm": 0.4995498012664299, "learning_rate": 3.040540540540541e-06, "loss": 0.2384, "step": 135 }, { "epoch": 0.03064184526994677, "grad_norm": 0.4705902432670821, "learning_rate": 3.063063063063063e-06, "loss": 0.2293, "step": 136 }, { "epoch": 0.030867152955755203, "grad_norm": 0.47864446491382573, "learning_rate": 3.085585585585586e-06, "loss": 0.2283, "step": 137 }, { "epoch": 0.031092460641563634, "grad_norm": 0.5237766445850344, "learning_rate": 3.1081081081081082e-06, "loss": 0.2352, "step": 138 }, { "epoch": 0.03131776832737207, "grad_norm": 0.47357640320589656, "learning_rate": 3.130630630630631e-06, "loss": 0.2305, "step": 139 }, { "epoch": 0.0315430760131805, "grad_norm": 0.47748642005092856, "learning_rate": 3.1531531531531532e-06, "loss": 0.2173, "step": 140 }, { "epoch": 0.03176838369898893, "grad_norm": 0.5076332367130231, "learning_rate": 3.1756756756756755e-06, "loss": 0.2211, "step": 141 }, { "epoch": 0.031993691384797365, "grad_norm": 0.49914085991583146, "learning_rate": 3.1981981981981987e-06, "loss": 0.229, "step": 142 }, { "epoch": 0.0322189990706058, "grad_norm": 0.5156689222761607, "learning_rate": 3.220720720720721e-06, "loss": 0.247, "step": 143 }, { "epoch": 0.03244430675641423, "grad_norm": 0.5306167064232364, "learning_rate": 3.2432432432432437e-06, "loss": 0.2321, "step": 144 }, { "epoch": 0.03266961444222266, "grad_norm": 0.5006105302460152, "learning_rate": 3.265765765765766e-06, "loss": 0.2412, "step": 145 }, { "epoch": 0.03289492212803109, "grad_norm": 0.47660844123587387, "learning_rate": 3.2882882882882887e-06, "loss": 0.2112, "step": 146 }, { "epoch": 0.033120229813839525, "grad_norm": 0.4908936026508093, "learning_rate": 3.310810810810811e-06, "loss": 0.2146, "step": 147 }, { "epoch": 0.033345537499647956, "grad_norm": 0.4917283785130264, "learning_rate": 3.3333333333333333e-06, "loss": 0.2115, "step": 148 }, { "epoch": 0.03357084518545639, "grad_norm": 0.49052234204011164, "learning_rate": 3.3558558558558565e-06, "loss": 0.2091, "step": 149 }, { "epoch": 0.03379615287126482, "grad_norm": 0.5035124349420905, "learning_rate": 3.3783783783783788e-06, "loss": 0.2232, "step": 150 }, { "epoch": 0.03402146055707325, "grad_norm": 0.5079586810803105, "learning_rate": 3.4009009009009015e-06, "loss": 0.2283, "step": 151 }, { "epoch": 0.034246768242881684, "grad_norm": 0.5022741231517249, "learning_rate": 3.423423423423424e-06, "loss": 0.2043, "step": 152 }, { "epoch": 0.034472075928690116, "grad_norm": 0.49004074919119595, "learning_rate": 3.445945945945946e-06, "loss": 0.2082, "step": 153 }, { "epoch": 0.03469738361449855, "grad_norm": 0.4656449371886705, "learning_rate": 3.468468468468469e-06, "loss": 0.2184, "step": 154 }, { "epoch": 0.03492269130030698, "grad_norm": 0.5026818550664739, "learning_rate": 3.490990990990991e-06, "loss": 0.2231, "step": 155 }, { "epoch": 0.03514799898611541, "grad_norm": 0.5322757840474942, "learning_rate": 3.513513513513514e-06, "loss": 0.2458, "step": 156 }, { "epoch": 0.03537330667192384, "grad_norm": 0.5107332667843115, "learning_rate": 3.536036036036036e-06, "loss": 0.2147, "step": 157 }, { "epoch": 0.035598614357732275, "grad_norm": 0.5053701639493772, "learning_rate": 3.5585585585585584e-06, "loss": 0.2084, "step": 158 }, { "epoch": 0.03582392204354071, "grad_norm": 0.48856795307644946, "learning_rate": 3.5810810810810816e-06, "loss": 0.2087, "step": 159 }, { "epoch": 0.036049229729349146, "grad_norm": 0.5910012852386906, "learning_rate": 3.603603603603604e-06, "loss": 0.2578, "step": 160 }, { "epoch": 0.03627453741515758, "grad_norm": 0.5206932068291034, "learning_rate": 3.6261261261261266e-06, "loss": 0.228, "step": 161 }, { "epoch": 0.03649984510096601, "grad_norm": 0.506397979121453, "learning_rate": 3.648648648648649e-06, "loss": 0.2218, "step": 162 }, { "epoch": 0.03672515278677444, "grad_norm": 0.51976705428227, "learning_rate": 3.6711711711711716e-06, "loss": 0.2035, "step": 163 }, { "epoch": 0.03695046047258287, "grad_norm": 0.5024417944839559, "learning_rate": 3.693693693693694e-06, "loss": 0.2226, "step": 164 }, { "epoch": 0.037175768158391305, "grad_norm": 0.5121519754408757, "learning_rate": 3.7162162162162162e-06, "loss": 0.2157, "step": 165 }, { "epoch": 0.03740107584419974, "grad_norm": 0.48543181216606685, "learning_rate": 3.7387387387387394e-06, "loss": 0.217, "step": 166 }, { "epoch": 0.03762638353000817, "grad_norm": 0.5253206660875263, "learning_rate": 3.7612612612612612e-06, "loss": 0.2287, "step": 167 }, { "epoch": 0.0378516912158166, "grad_norm": 0.5519087820882419, "learning_rate": 3.7837837837837844e-06, "loss": 0.2295, "step": 168 }, { "epoch": 0.03807699890162503, "grad_norm": 0.48023104908811204, "learning_rate": 3.8063063063063067e-06, "loss": 0.2146, "step": 169 }, { "epoch": 0.038302306587433464, "grad_norm": 0.497406635423104, "learning_rate": 3.828828828828829e-06, "loss": 0.2282, "step": 170 }, { "epoch": 0.038527614273241896, "grad_norm": 0.5109261032588829, "learning_rate": 3.851351351351352e-06, "loss": 0.2229, "step": 171 }, { "epoch": 0.03875292195905033, "grad_norm": 0.5105826503113667, "learning_rate": 3.8738738738738744e-06, "loss": 0.2205, "step": 172 }, { "epoch": 0.03897822964485876, "grad_norm": 0.5062865925051617, "learning_rate": 3.896396396396397e-06, "loss": 0.2106, "step": 173 }, { "epoch": 0.03920353733066719, "grad_norm": 0.5421073045426972, "learning_rate": 3.918918918918919e-06, "loss": 0.2298, "step": 174 }, { "epoch": 0.039428845016475624, "grad_norm": 0.5143452971037656, "learning_rate": 3.941441441441442e-06, "loss": 0.2195, "step": 175 }, { "epoch": 0.039654152702284055, "grad_norm": 0.4951933237019935, "learning_rate": 3.9639639639639645e-06, "loss": 0.2223, "step": 176 }, { "epoch": 0.03987946038809249, "grad_norm": 0.47561889500434723, "learning_rate": 3.986486486486487e-06, "loss": 0.2021, "step": 177 }, { "epoch": 0.04010476807390092, "grad_norm": 0.47940239501680004, "learning_rate": 4.009009009009009e-06, "loss": 0.2271, "step": 178 }, { "epoch": 0.04033007575970935, "grad_norm": 0.5296219636082108, "learning_rate": 4.031531531531531e-06, "loss": 0.2103, "step": 179 }, { "epoch": 0.04055538344551778, "grad_norm": 0.5344357933055536, "learning_rate": 4.0540540540540545e-06, "loss": 0.2321, "step": 180 }, { "epoch": 0.040780691131326215, "grad_norm": 0.5454856188785351, "learning_rate": 4.076576576576577e-06, "loss": 0.2377, "step": 181 }, { "epoch": 0.041005998817134647, "grad_norm": 0.48278341354458393, "learning_rate": 4.099099099099099e-06, "loss": 0.2023, "step": 182 }, { "epoch": 0.04123130650294308, "grad_norm": 0.4990000028566509, "learning_rate": 4.121621621621622e-06, "loss": 0.1954, "step": 183 }, { "epoch": 0.04145661418875152, "grad_norm": 0.49432596571018167, "learning_rate": 4.1441441441441446e-06, "loss": 0.2026, "step": 184 }, { "epoch": 0.04168192187455995, "grad_norm": 0.5323740667881898, "learning_rate": 4.166666666666667e-06, "loss": 0.2357, "step": 185 }, { "epoch": 0.04190722956036838, "grad_norm": 0.49808163776567965, "learning_rate": 4.189189189189189e-06, "loss": 0.2131, "step": 186 }, { "epoch": 0.04213253724617681, "grad_norm": 0.5274371891475365, "learning_rate": 4.2117117117117115e-06, "loss": 0.2258, "step": 187 }, { "epoch": 0.042357844931985245, "grad_norm": 0.49290274768513864, "learning_rate": 4.234234234234235e-06, "loss": 0.1998, "step": 188 }, { "epoch": 0.042583152617793676, "grad_norm": 0.49281647121251326, "learning_rate": 4.256756756756757e-06, "loss": 0.1896, "step": 189 }, { "epoch": 0.04280846030360211, "grad_norm": 0.45544492366938527, "learning_rate": 4.27927927927928e-06, "loss": 0.2038, "step": 190 }, { "epoch": 0.04303376798941054, "grad_norm": 0.5004631867414786, "learning_rate": 4.301801801801802e-06, "loss": 0.2127, "step": 191 }, { "epoch": 0.04325907567521897, "grad_norm": 0.5303953782202576, "learning_rate": 4.324324324324325e-06, "loss": 0.2192, "step": 192 }, { "epoch": 0.043484383361027404, "grad_norm": 0.4990270446873571, "learning_rate": 4.346846846846847e-06, "loss": 0.2129, "step": 193 }, { "epoch": 0.043709691046835836, "grad_norm": 0.519415561287903, "learning_rate": 4.369369369369369e-06, "loss": 0.2256, "step": 194 }, { "epoch": 0.04393499873264427, "grad_norm": 0.534723441521201, "learning_rate": 4.391891891891892e-06, "loss": 0.2062, "step": 195 }, { "epoch": 0.0441603064184527, "grad_norm": 0.48622823371636226, "learning_rate": 4.414414414414415e-06, "loss": 0.213, "step": 196 }, { "epoch": 0.04438561410426113, "grad_norm": 0.5119797847730078, "learning_rate": 4.436936936936938e-06, "loss": 0.2139, "step": 197 }, { "epoch": 0.04461092179006956, "grad_norm": 0.5153856589899163, "learning_rate": 4.45945945945946e-06, "loss": 0.2086, "step": 198 }, { "epoch": 0.044836229475877995, "grad_norm": 0.5225611573890641, "learning_rate": 4.4819819819819824e-06, "loss": 0.2262, "step": 199 }, { "epoch": 0.04506153716168643, "grad_norm": 0.5064606336826201, "learning_rate": 4.504504504504505e-06, "loss": 0.2121, "step": 200 }, { "epoch": 0.04528684484749486, "grad_norm": 0.5200580083558093, "learning_rate": 4.527027027027027e-06, "loss": 0.2174, "step": 201 }, { "epoch": 0.04551215253330329, "grad_norm": 0.5072035357190301, "learning_rate": 4.54954954954955e-06, "loss": 0.2069, "step": 202 }, { "epoch": 0.04573746021911172, "grad_norm": 0.5428413804423315, "learning_rate": 4.5720720720720725e-06, "loss": 0.2215, "step": 203 }, { "epoch": 0.045962767904920154, "grad_norm": 0.5062085074677846, "learning_rate": 4.594594594594596e-06, "loss": 0.2159, "step": 204 }, { "epoch": 0.046188075590728586, "grad_norm": 0.5077756214561331, "learning_rate": 4.617117117117118e-06, "loss": 0.2255, "step": 205 }, { "epoch": 0.04641338327653702, "grad_norm": 0.516342590401075, "learning_rate": 4.63963963963964e-06, "loss": 0.2334, "step": 206 }, { "epoch": 0.04663869096234545, "grad_norm": 0.5146447530347443, "learning_rate": 4.6621621621621625e-06, "loss": 0.2202, "step": 207 }, { "epoch": 0.04686399864815388, "grad_norm": 0.5218717935866587, "learning_rate": 4.684684684684685e-06, "loss": 0.2227, "step": 208 }, { "epoch": 0.04708930633396232, "grad_norm": 0.5108508636146013, "learning_rate": 4.707207207207208e-06, "loss": 0.2159, "step": 209 }, { "epoch": 0.04731461401977075, "grad_norm": 0.49732513414567286, "learning_rate": 4.72972972972973e-06, "loss": 0.2187, "step": 210 }, { "epoch": 0.047539921705579184, "grad_norm": 0.48464337118429296, "learning_rate": 4.7522522522522526e-06, "loss": 0.1996, "step": 211 }, { "epoch": 0.047765229391387616, "grad_norm": 0.5456833150991911, "learning_rate": 4.774774774774775e-06, "loss": 0.2249, "step": 212 }, { "epoch": 0.04799053707719605, "grad_norm": 0.492362483621816, "learning_rate": 4.797297297297297e-06, "loss": 0.1968, "step": 213 }, { "epoch": 0.04821584476300448, "grad_norm": 0.5257529230994231, "learning_rate": 4.81981981981982e-06, "loss": 0.2097, "step": 214 }, { "epoch": 0.04844115244881291, "grad_norm": 0.5358862789377798, "learning_rate": 4.842342342342343e-06, "loss": 0.2101, "step": 215 }, { "epoch": 0.048666460134621344, "grad_norm": 0.5077109074089186, "learning_rate": 4.864864864864866e-06, "loss": 0.2142, "step": 216 }, { "epoch": 0.048891767820429775, "grad_norm": 0.5048470686894994, "learning_rate": 4.887387387387388e-06, "loss": 0.2033, "step": 217 }, { "epoch": 0.04911707550623821, "grad_norm": 0.5601972610125039, "learning_rate": 4.90990990990991e-06, "loss": 0.2183, "step": 218 }, { "epoch": 0.04934238319204664, "grad_norm": 0.5449553809805391, "learning_rate": 4.932432432432433e-06, "loss": 0.2153, "step": 219 }, { "epoch": 0.04956769087785507, "grad_norm": 0.5751833317104584, "learning_rate": 4.954954954954955e-06, "loss": 0.2315, "step": 220 }, { "epoch": 0.0497929985636635, "grad_norm": 0.5707106430637009, "learning_rate": 4.977477477477478e-06, "loss": 0.2184, "step": 221 }, { "epoch": 0.050018306249471935, "grad_norm": 0.5132652534577439, "learning_rate": 5e-06, "loss": 0.216, "step": 222 }, { "epoch": 0.050243613935280367, "grad_norm": 0.539045624424229, "learning_rate": 4.999999305921593e-06, "loss": 0.2141, "step": 223 }, { "epoch": 0.0504689216210888, "grad_norm": 0.5075400773617107, "learning_rate": 4.999997223686756e-06, "loss": 0.2013, "step": 224 }, { "epoch": 0.05069422930689723, "grad_norm": 0.5494829828837551, "learning_rate": 4.9999937532966454e-06, "loss": 0.2347, "step": 225 }, { "epoch": 0.05091953699270566, "grad_norm": 0.46487989660605866, "learning_rate": 4.999988894753189e-06, "loss": 0.1956, "step": 226 }, { "epoch": 0.051144844678514094, "grad_norm": 0.5753347536093394, "learning_rate": 4.999982648059082e-06, "loss": 0.2384, "step": 227 }, { "epoch": 0.051370152364322526, "grad_norm": 0.5651567672972739, "learning_rate": 4.999975013217796e-06, "loss": 0.2252, "step": 228 }, { "epoch": 0.05159546005013096, "grad_norm": 0.5315878474657139, "learning_rate": 4.99996599023357e-06, "loss": 0.227, "step": 229 }, { "epoch": 0.05182076773593939, "grad_norm": 0.5177944206907267, "learning_rate": 4.999955579111413e-06, "loss": 0.2147, "step": 230 }, { "epoch": 0.05204607542174782, "grad_norm": 0.49178836271858745, "learning_rate": 4.999943779857106e-06, "loss": 0.1958, "step": 231 }, { "epoch": 0.05227138310755625, "grad_norm": 0.5403786614660475, "learning_rate": 4.9999305924772e-06, "loss": 0.2243, "step": 232 }, { "epoch": 0.05249669079336469, "grad_norm": 0.5249176799168239, "learning_rate": 4.999916016979019e-06, "loss": 0.2067, "step": 233 }, { "epoch": 0.052721998479173124, "grad_norm": 0.5514864920080108, "learning_rate": 4.999900053370657e-06, "loss": 0.2287, "step": 234 }, { "epoch": 0.052947306164981556, "grad_norm": 0.5183980485361032, "learning_rate": 4.999882701660975e-06, "loss": 0.2154, "step": 235 }, { "epoch": 0.05317261385078999, "grad_norm": 0.48926124135397414, "learning_rate": 4.99986396185961e-06, "loss": 0.1996, "step": 236 }, { "epoch": 0.05339792153659842, "grad_norm": 0.5520877940291486, "learning_rate": 4.999843833976967e-06, "loss": 0.2238, "step": 237 }, { "epoch": 0.05362322922240685, "grad_norm": 0.48531468109936043, "learning_rate": 4.999822318024222e-06, "loss": 0.197, "step": 238 }, { "epoch": 0.05384853690821528, "grad_norm": 0.5315551790916936, "learning_rate": 4.999799414013322e-06, "loss": 0.2285, "step": 239 }, { "epoch": 0.054073844594023715, "grad_norm": 0.4824546627085691, "learning_rate": 4.9997751219569844e-06, "loss": 0.1994, "step": 240 }, { "epoch": 0.05429915227983215, "grad_norm": 0.48508802248921484, "learning_rate": 4.999749441868699e-06, "loss": 0.1941, "step": 241 }, { "epoch": 0.05452445996564058, "grad_norm": 0.48336029019618354, "learning_rate": 4.999722373762725e-06, "loss": 0.1964, "step": 242 }, { "epoch": 0.05474976765144901, "grad_norm": 0.5233609224975063, "learning_rate": 4.9996939176540895e-06, "loss": 0.2101, "step": 243 }, { "epoch": 0.05497507533725744, "grad_norm": 0.5366150291161705, "learning_rate": 4.999664073558596e-06, "loss": 0.2217, "step": 244 }, { "epoch": 0.055200383023065874, "grad_norm": 0.5273997275923541, "learning_rate": 4.999632841492815e-06, "loss": 0.2192, "step": 245 }, { "epoch": 0.055425690708874306, "grad_norm": 0.47872243223030636, "learning_rate": 4.999600221474089e-06, "loss": 0.1949, "step": 246 }, { "epoch": 0.05565099839468274, "grad_norm": 0.462778258865609, "learning_rate": 4.999566213520529e-06, "loss": 0.1891, "step": 247 }, { "epoch": 0.05587630608049117, "grad_norm": 0.5192808679253843, "learning_rate": 4.99953081765102e-06, "loss": 0.2025, "step": 248 }, { "epoch": 0.0561016137662996, "grad_norm": 0.4912780617421906, "learning_rate": 4.999494033885215e-06, "loss": 0.1942, "step": 249 }, { "epoch": 0.056326921452108034, "grad_norm": 0.5893997616218107, "learning_rate": 4.999455862243539e-06, "loss": 0.2222, "step": 250 }, { "epoch": 0.056552229137916465, "grad_norm": 0.5167969480025867, "learning_rate": 4.999416302747189e-06, "loss": 0.2159, "step": 251 }, { "epoch": 0.0567775368237249, "grad_norm": 0.5182504713211056, "learning_rate": 4.999375355418128e-06, "loss": 0.2231, "step": 252 }, { "epoch": 0.05700284450953333, "grad_norm": 0.5730553068204561, "learning_rate": 4.999333020279094e-06, "loss": 0.2011, "step": 253 }, { "epoch": 0.05722815219534176, "grad_norm": 0.5219964473354403, "learning_rate": 4.999289297353593e-06, "loss": 0.2123, "step": 254 }, { "epoch": 0.05745345988115019, "grad_norm": 0.5140790599827941, "learning_rate": 4.9992441866659054e-06, "loss": 0.2115, "step": 255 }, { "epoch": 0.057678767566958625, "grad_norm": 0.5218855782116167, "learning_rate": 4.999197688241076e-06, "loss": 0.1882, "step": 256 }, { "epoch": 0.05790407525276706, "grad_norm": 0.4957230593038792, "learning_rate": 4.999149802104926e-06, "loss": 0.1952, "step": 257 }, { "epoch": 0.058129382938575495, "grad_norm": 0.4874351482139566, "learning_rate": 4.999100528284045e-06, "loss": 0.2074, "step": 258 }, { "epoch": 0.05835469062438393, "grad_norm": 0.4999889869456212, "learning_rate": 4.999049866805793e-06, "loss": 0.2155, "step": 259 }, { "epoch": 0.05857999831019236, "grad_norm": 0.495198415318367, "learning_rate": 4.998997817698298e-06, "loss": 0.1936, "step": 260 }, { "epoch": 0.05880530599600079, "grad_norm": 0.5540297345897838, "learning_rate": 4.998944380990462e-06, "loss": 0.2295, "step": 261 }, { "epoch": 0.05903061368180922, "grad_norm": 0.5175758124205073, "learning_rate": 4.998889556711958e-06, "loss": 0.2163, "step": 262 }, { "epoch": 0.059255921367617655, "grad_norm": 0.5276038982841817, "learning_rate": 4.998833344893226e-06, "loss": 0.1934, "step": 263 }, { "epoch": 0.059481229053426087, "grad_norm": 0.5411162569824867, "learning_rate": 4.998775745565479e-06, "loss": 0.2199, "step": 264 }, { "epoch": 0.05970653673923452, "grad_norm": 0.5373294969263402, "learning_rate": 4.998716758760701e-06, "loss": 0.2254, "step": 265 }, { "epoch": 0.05993184442504295, "grad_norm": 0.5141799318273902, "learning_rate": 4.998656384511643e-06, "loss": 0.2102, "step": 266 }, { "epoch": 0.06015715211085138, "grad_norm": 0.475281124330064, "learning_rate": 4.998594622851829e-06, "loss": 0.1692, "step": 267 }, { "epoch": 0.060382459796659814, "grad_norm": 0.5380386686045225, "learning_rate": 4.9985314738155545e-06, "loss": 0.2162, "step": 268 }, { "epoch": 0.060607767482468246, "grad_norm": 0.4897137619103139, "learning_rate": 4.9984669374378825e-06, "loss": 0.1958, "step": 269 }, { "epoch": 0.06083307516827668, "grad_norm": 0.534719780931734, "learning_rate": 4.9984010137546475e-06, "loss": 0.236, "step": 270 }, { "epoch": 0.06105838285408511, "grad_norm": 0.5043803991722573, "learning_rate": 4.998333702802457e-06, "loss": 0.2067, "step": 271 }, { "epoch": 0.06128369053989354, "grad_norm": 0.5300708413018712, "learning_rate": 4.998265004618682e-06, "loss": 0.2178, "step": 272 }, { "epoch": 0.06150899822570197, "grad_norm": 0.49714976101530717, "learning_rate": 4.998194919241471e-06, "loss": 0.2019, "step": 273 }, { "epoch": 0.061734305911510405, "grad_norm": 0.5282286817738815, "learning_rate": 4.998123446709739e-06, "loss": 0.2135, "step": 274 }, { "epoch": 0.06195961359731884, "grad_norm": 0.48789888952975713, "learning_rate": 4.998050587063173e-06, "loss": 0.198, "step": 275 }, { "epoch": 0.06218492128312727, "grad_norm": 0.44970788932918077, "learning_rate": 4.997976340342226e-06, "loss": 0.2017, "step": 276 }, { "epoch": 0.0624102289689357, "grad_norm": 0.5583928627478204, "learning_rate": 4.997900706588129e-06, "loss": 0.2158, "step": 277 }, { "epoch": 0.06263553665474414, "grad_norm": 0.5258990624055785, "learning_rate": 4.997823685842875e-06, "loss": 0.2149, "step": 278 }, { "epoch": 0.06286084434055257, "grad_norm": 0.46981935210103615, "learning_rate": 4.997745278149233e-06, "loss": 0.1874, "step": 279 }, { "epoch": 0.063086152026361, "grad_norm": 0.5041562581871978, "learning_rate": 4.997665483550739e-06, "loss": 0.1942, "step": 280 }, { "epoch": 0.06331145971216944, "grad_norm": 0.5324552273484924, "learning_rate": 4.997584302091699e-06, "loss": 0.2202, "step": 281 }, { "epoch": 0.06353676739797787, "grad_norm": 0.5478873025934429, "learning_rate": 4.997501733817191e-06, "loss": 0.21, "step": 282 }, { "epoch": 0.0637620750837863, "grad_norm": 0.46598209735292806, "learning_rate": 4.997417778773064e-06, "loss": 0.1811, "step": 283 }, { "epoch": 0.06398738276959473, "grad_norm": 0.5996968824573213, "learning_rate": 4.997332437005932e-06, "loss": 0.2384, "step": 284 }, { "epoch": 0.06421269045540316, "grad_norm": 0.5051133961780844, "learning_rate": 4.9972457085631825e-06, "loss": 0.2043, "step": 285 }, { "epoch": 0.0644379981412116, "grad_norm": 0.5040193073935666, "learning_rate": 4.997157593492974e-06, "loss": 0.1825, "step": 286 }, { "epoch": 0.06466330582702003, "grad_norm": 0.4888013540021299, "learning_rate": 4.997068091844233e-06, "loss": 0.2068, "step": 287 }, { "epoch": 0.06488861351282846, "grad_norm": 0.49915136047539255, "learning_rate": 4.996977203666657e-06, "loss": 0.1984, "step": 288 }, { "epoch": 0.06511392119863689, "grad_norm": 0.5078177992971098, "learning_rate": 4.99688492901071e-06, "loss": 0.1993, "step": 289 }, { "epoch": 0.06533922888444532, "grad_norm": 0.5236192779491511, "learning_rate": 4.996791267927632e-06, "loss": 0.2027, "step": 290 }, { "epoch": 0.06556453657025375, "grad_norm": 0.5245389871527842, "learning_rate": 4.996696220469429e-06, "loss": 0.1979, "step": 291 }, { "epoch": 0.06578984425606219, "grad_norm": 0.5395585265194126, "learning_rate": 4.996599786688876e-06, "loss": 0.212, "step": 292 }, { "epoch": 0.06601515194187062, "grad_norm": 0.5353267861924169, "learning_rate": 4.996501966639519e-06, "loss": 0.214, "step": 293 }, { "epoch": 0.06624045962767905, "grad_norm": 0.5363756637398909, "learning_rate": 4.996402760375676e-06, "loss": 0.2353, "step": 294 }, { "epoch": 0.06646576731348748, "grad_norm": 0.574596026064565, "learning_rate": 4.99630216795243e-06, "loss": 0.2202, "step": 295 }, { "epoch": 0.06669107499929591, "grad_norm": 0.5410467145659404, "learning_rate": 4.996200189425638e-06, "loss": 0.2077, "step": 296 }, { "epoch": 0.06691638268510434, "grad_norm": 0.4998899935289798, "learning_rate": 4.996096824851923e-06, "loss": 0.2141, "step": 297 }, { "epoch": 0.06714169037091278, "grad_norm": 0.5162735905138214, "learning_rate": 4.9959920742886815e-06, "loss": 0.2006, "step": 298 }, { "epoch": 0.06736699805672121, "grad_norm": 0.5737456586581821, "learning_rate": 4.9958859377940765e-06, "loss": 0.2027, "step": 299 }, { "epoch": 0.06759230574252964, "grad_norm": 0.5413621557656281, "learning_rate": 4.995778415427042e-06, "loss": 0.2098, "step": 300 }, { "epoch": 0.06781761342833807, "grad_norm": 0.577394779919834, "learning_rate": 4.99566950724728e-06, "loss": 0.2269, "step": 301 }, { "epoch": 0.0680429211141465, "grad_norm": 0.515822079782068, "learning_rate": 4.995559213315267e-06, "loss": 0.2261, "step": 302 }, { "epoch": 0.06826822879995494, "grad_norm": 0.4712268338728312, "learning_rate": 4.995447533692239e-06, "loss": 0.2003, "step": 303 }, { "epoch": 0.06849353648576337, "grad_norm": 0.5134784305459534, "learning_rate": 4.995334468440213e-06, "loss": 0.1997, "step": 304 }, { "epoch": 0.0687188441715718, "grad_norm": 0.4907228514504316, "learning_rate": 4.995220017621967e-06, "loss": 0.1896, "step": 305 }, { "epoch": 0.06894415185738023, "grad_norm": 0.4962154271689104, "learning_rate": 4.995104181301052e-06, "loss": 0.2027, "step": 306 }, { "epoch": 0.06916945954318866, "grad_norm": 0.5537049140921233, "learning_rate": 4.994986959541788e-06, "loss": 0.213, "step": 307 }, { "epoch": 0.0693947672289971, "grad_norm": 0.49941255076713437, "learning_rate": 4.994868352409263e-06, "loss": 0.1964, "step": 308 }, { "epoch": 0.06962007491480553, "grad_norm": 0.5054354369750879, "learning_rate": 4.994748359969336e-06, "loss": 0.2071, "step": 309 }, { "epoch": 0.06984538260061396, "grad_norm": 0.5172274273450349, "learning_rate": 4.9946269822886335e-06, "loss": 0.2034, "step": 310 }, { "epoch": 0.07007069028642239, "grad_norm": 0.4931991542019778, "learning_rate": 4.994504219434553e-06, "loss": 0.2136, "step": 311 }, { "epoch": 0.07029599797223082, "grad_norm": 0.5264002662303077, "learning_rate": 4.9943800714752586e-06, "loss": 0.2181, "step": 312 }, { "epoch": 0.07052130565803925, "grad_norm": 0.5062978313923276, "learning_rate": 4.994254538479687e-06, "loss": 0.2001, "step": 313 }, { "epoch": 0.07074661334384769, "grad_norm": 0.5508272404060727, "learning_rate": 4.9941276205175405e-06, "loss": 0.209, "step": 314 }, { "epoch": 0.07097192102965612, "grad_norm": 0.5076274602102792, "learning_rate": 4.993999317659293e-06, "loss": 0.2038, "step": 315 }, { "epoch": 0.07119722871546455, "grad_norm": 0.5122198034692836, "learning_rate": 4.9938696299761856e-06, "loss": 0.1889, "step": 316 }, { "epoch": 0.07142253640127298, "grad_norm": 0.6066990312387696, "learning_rate": 4.9937385575402284e-06, "loss": 0.2387, "step": 317 }, { "epoch": 0.07164784408708141, "grad_norm": 0.5113729441842336, "learning_rate": 4.993606100424202e-06, "loss": 0.1925, "step": 318 }, { "epoch": 0.07187315177288986, "grad_norm": 0.5434767861340162, "learning_rate": 4.9934722587016555e-06, "loss": 0.2083, "step": 319 }, { "epoch": 0.07209845945869829, "grad_norm": 0.5220970910896368, "learning_rate": 4.9933370324469045e-06, "loss": 0.2127, "step": 320 }, { "epoch": 0.07232376714450672, "grad_norm": 0.5133695423087399, "learning_rate": 4.993200421735037e-06, "loss": 0.2072, "step": 321 }, { "epoch": 0.07254907483031516, "grad_norm": 0.5107258940556315, "learning_rate": 4.993062426641906e-06, "loss": 0.2036, "step": 322 }, { "epoch": 0.07277438251612359, "grad_norm": 0.5455096340183896, "learning_rate": 4.992923047244136e-06, "loss": 0.2219, "step": 323 }, { "epoch": 0.07299969020193202, "grad_norm": 0.5218309849291067, "learning_rate": 4.9927822836191185e-06, "loss": 0.2235, "step": 324 }, { "epoch": 0.07322499788774045, "grad_norm": 0.5057153674679081, "learning_rate": 4.992640135845016e-06, "loss": 0.2078, "step": 325 }, { "epoch": 0.07345030557354888, "grad_norm": 0.4632931130727815, "learning_rate": 4.992496604000756e-06, "loss": 0.1917, "step": 326 }, { "epoch": 0.07367561325935731, "grad_norm": 0.5133401143389513, "learning_rate": 4.992351688166038e-06, "loss": 0.1942, "step": 327 }, { "epoch": 0.07390092094516575, "grad_norm": 0.5425225307579951, "learning_rate": 4.992205388421326e-06, "loss": 0.2136, "step": 328 }, { "epoch": 0.07412622863097418, "grad_norm": 0.4938836282672529, "learning_rate": 4.992057704847858e-06, "loss": 0.2071, "step": 329 }, { "epoch": 0.07435153631678261, "grad_norm": 0.5042566721615249, "learning_rate": 4.991908637527634e-06, "loss": 0.2048, "step": 330 }, { "epoch": 0.07457684400259104, "grad_norm": 0.5079880579268699, "learning_rate": 4.9917581865434275e-06, "loss": 0.192, "step": 331 }, { "epoch": 0.07480215168839947, "grad_norm": 0.5111920959241932, "learning_rate": 4.9916063519787775e-06, "loss": 0.2007, "step": 332 }, { "epoch": 0.0750274593742079, "grad_norm": 0.5033017411516871, "learning_rate": 4.991453133917993e-06, "loss": 0.2049, "step": 333 }, { "epoch": 0.07525276706001634, "grad_norm": 0.5129775312572191, "learning_rate": 4.991298532446149e-06, "loss": 0.2069, "step": 334 }, { "epoch": 0.07547807474582477, "grad_norm": 0.548652704267977, "learning_rate": 4.991142547649091e-06, "loss": 0.1932, "step": 335 }, { "epoch": 0.0757033824316332, "grad_norm": 0.4673398399684016, "learning_rate": 4.990985179613431e-06, "loss": 0.1953, "step": 336 }, { "epoch": 0.07592869011744163, "grad_norm": 0.5295523972404997, "learning_rate": 4.990826428426549e-06, "loss": 0.2049, "step": 337 }, { "epoch": 0.07615399780325006, "grad_norm": 0.5189619606719836, "learning_rate": 4.990666294176596e-06, "loss": 0.2045, "step": 338 }, { "epoch": 0.0763793054890585, "grad_norm": 0.5067351401379646, "learning_rate": 4.9905047769524855e-06, "loss": 0.2198, "step": 339 }, { "epoch": 0.07660461317486693, "grad_norm": 0.5121777968541428, "learning_rate": 4.990341876843904e-06, "loss": 0.1985, "step": 340 }, { "epoch": 0.07682992086067536, "grad_norm": 0.5311438794688305, "learning_rate": 4.990177593941303e-06, "loss": 0.2059, "step": 341 }, { "epoch": 0.07705522854648379, "grad_norm": 0.49129677532414406, "learning_rate": 4.9900119283359025e-06, "loss": 0.1895, "step": 342 }, { "epoch": 0.07728053623229222, "grad_norm": 0.4893539747998295, "learning_rate": 4.989844880119692e-06, "loss": 0.1807, "step": 343 }, { "epoch": 0.07750584391810066, "grad_norm": 0.5072252752550346, "learning_rate": 4.989676449385426e-06, "loss": 0.2018, "step": 344 }, { "epoch": 0.07773115160390909, "grad_norm": 0.4940421937088313, "learning_rate": 4.989506636226626e-06, "loss": 0.1967, "step": 345 }, { "epoch": 0.07795645928971752, "grad_norm": 0.4786128739759888, "learning_rate": 4.989335440737587e-06, "loss": 0.1883, "step": 346 }, { "epoch": 0.07818176697552595, "grad_norm": 0.4980537115424014, "learning_rate": 4.989162863013364e-06, "loss": 0.1878, "step": 347 }, { "epoch": 0.07840707466133438, "grad_norm": 0.4874848708154177, "learning_rate": 4.988988903149784e-06, "loss": 0.1909, "step": 348 }, { "epoch": 0.07863238234714282, "grad_norm": 0.5228954597731107, "learning_rate": 4.9888135612434415e-06, "loss": 0.1974, "step": 349 }, { "epoch": 0.07885769003295125, "grad_norm": 0.5121660056443094, "learning_rate": 4.988636837391696e-06, "loss": 0.1882, "step": 350 }, { "epoch": 0.07908299771875968, "grad_norm": 0.5031256954337335, "learning_rate": 4.9884587316926765e-06, "loss": 0.2116, "step": 351 }, { "epoch": 0.07930830540456811, "grad_norm": 0.5144091853279764, "learning_rate": 4.988279244245278e-06, "loss": 0.2033, "step": 352 }, { "epoch": 0.07953361309037654, "grad_norm": 0.531214217524162, "learning_rate": 4.988098375149163e-06, "loss": 0.198, "step": 353 }, { "epoch": 0.07975892077618497, "grad_norm": 0.4889296327628917, "learning_rate": 4.987916124504761e-06, "loss": 0.1997, "step": 354 }, { "epoch": 0.0799842284619934, "grad_norm": 0.49524804048725096, "learning_rate": 4.987732492413271e-06, "loss": 0.1855, "step": 355 }, { "epoch": 0.08020953614780184, "grad_norm": 0.4864717158203817, "learning_rate": 4.987547478976655e-06, "loss": 0.1911, "step": 356 }, { "epoch": 0.08043484383361027, "grad_norm": 0.4936470667020924, "learning_rate": 4.987361084297645e-06, "loss": 0.1968, "step": 357 }, { "epoch": 0.0806601515194187, "grad_norm": 0.5020980156378522, "learning_rate": 4.987173308479738e-06, "loss": 0.1922, "step": 358 }, { "epoch": 0.08088545920522713, "grad_norm": 0.467049169764784, "learning_rate": 4.9869841516272004e-06, "loss": 0.1933, "step": 359 }, { "epoch": 0.08111076689103557, "grad_norm": 0.4926168450953013, "learning_rate": 4.9867936138450635e-06, "loss": 0.1875, "step": 360 }, { "epoch": 0.081336074576844, "grad_norm": 0.5553008487726967, "learning_rate": 4.986601695239125e-06, "loss": 0.2116, "step": 361 }, { "epoch": 0.08156138226265243, "grad_norm": 0.46531354659798435, "learning_rate": 4.98640839591595e-06, "loss": 0.2005, "step": 362 }, { "epoch": 0.08178668994846086, "grad_norm": 0.4975006830434269, "learning_rate": 4.986213715982873e-06, "loss": 0.1997, "step": 363 }, { "epoch": 0.08201199763426929, "grad_norm": 0.5484910187073875, "learning_rate": 4.986017655547989e-06, "loss": 0.2052, "step": 364 }, { "epoch": 0.08223730532007772, "grad_norm": 0.4917366895315546, "learning_rate": 4.985820214720165e-06, "loss": 0.1963, "step": 365 }, { "epoch": 0.08246261300588616, "grad_norm": 0.5079414756556945, "learning_rate": 4.985621393609032e-06, "loss": 0.2171, "step": 366 }, { "epoch": 0.08268792069169459, "grad_norm": 0.5025287311574784, "learning_rate": 4.98542119232499e-06, "loss": 0.1847, "step": 367 }, { "epoch": 0.08291322837750303, "grad_norm": 0.5054672772723833, "learning_rate": 4.9852196109792e-06, "loss": 0.1828, "step": 368 }, { "epoch": 0.08313853606331147, "grad_norm": 0.5119523083386188, "learning_rate": 4.985016649683594e-06, "loss": 0.2057, "step": 369 }, { "epoch": 0.0833638437491199, "grad_norm": 0.4973589633933691, "learning_rate": 4.984812308550869e-06, "loss": 0.2077, "step": 370 }, { "epoch": 0.08358915143492833, "grad_norm": 0.4809212159374925, "learning_rate": 4.984606587694488e-06, "loss": 0.1875, "step": 371 }, { "epoch": 0.08381445912073676, "grad_norm": 0.5694307394026895, "learning_rate": 4.98439948722868e-06, "loss": 0.2076, "step": 372 }, { "epoch": 0.0840397668065452, "grad_norm": 0.49181225929988026, "learning_rate": 4.9841910072684406e-06, "loss": 0.2081, "step": 373 }, { "epoch": 0.08426507449235363, "grad_norm": 0.5163094079454916, "learning_rate": 4.98398114792953e-06, "loss": 0.1999, "step": 374 }, { "epoch": 0.08449038217816206, "grad_norm": 0.48034370172395746, "learning_rate": 4.9837699093284765e-06, "loss": 0.1954, "step": 375 }, { "epoch": 0.08471568986397049, "grad_norm": 0.5167164355540542, "learning_rate": 4.983557291582572e-06, "loss": 0.2055, "step": 376 }, { "epoch": 0.08494099754977892, "grad_norm": 0.5221710536395783, "learning_rate": 4.983343294809875e-06, "loss": 0.187, "step": 377 }, { "epoch": 0.08516630523558735, "grad_norm": 0.5195789727627067, "learning_rate": 4.9831279191292114e-06, "loss": 0.196, "step": 378 }, { "epoch": 0.08539161292139578, "grad_norm": 0.4749341701034613, "learning_rate": 4.98291116466017e-06, "loss": 0.1979, "step": 379 }, { "epoch": 0.08561692060720422, "grad_norm": 0.4940468961257005, "learning_rate": 4.982693031523107e-06, "loss": 0.1725, "step": 380 }, { "epoch": 0.08584222829301265, "grad_norm": 0.49776133813703377, "learning_rate": 4.982473519839144e-06, "loss": 0.1968, "step": 381 }, { "epoch": 0.08606753597882108, "grad_norm": 0.5406461195372502, "learning_rate": 4.982252629730167e-06, "loss": 0.2002, "step": 382 }, { "epoch": 0.08629284366462951, "grad_norm": 0.5102425028271348, "learning_rate": 4.982030361318827e-06, "loss": 0.1943, "step": 383 }, { "epoch": 0.08651815135043794, "grad_norm": 0.4743565750135506, "learning_rate": 4.981806714728543e-06, "loss": 0.1866, "step": 384 }, { "epoch": 0.08674345903624638, "grad_norm": 0.46885833113908765, "learning_rate": 4.981581690083498e-06, "loss": 0.1726, "step": 385 }, { "epoch": 0.08696876672205481, "grad_norm": 0.48609707569076405, "learning_rate": 4.981355287508638e-06, "loss": 0.1757, "step": 386 }, { "epoch": 0.08719407440786324, "grad_norm": 0.4667625682253398, "learning_rate": 4.981127507129677e-06, "loss": 0.1909, "step": 387 }, { "epoch": 0.08741938209367167, "grad_norm": 0.5218958233886627, "learning_rate": 4.980898349073094e-06, "loss": 0.2007, "step": 388 }, { "epoch": 0.0876446897794801, "grad_norm": 0.4673222943348308, "learning_rate": 4.9806678134661295e-06, "loss": 0.1915, "step": 389 }, { "epoch": 0.08786999746528854, "grad_norm": 0.49616638111195194, "learning_rate": 4.980435900436793e-06, "loss": 0.1875, "step": 390 }, { "epoch": 0.08809530515109697, "grad_norm": 0.4817346343815341, "learning_rate": 4.980202610113857e-06, "loss": 0.1905, "step": 391 }, { "epoch": 0.0883206128369054, "grad_norm": 0.5351889949883639, "learning_rate": 4.9799679426268575e-06, "loss": 0.2208, "step": 392 }, { "epoch": 0.08854592052271383, "grad_norm": 0.5047966627283784, "learning_rate": 4.9797318981061e-06, "loss": 0.2039, "step": 393 }, { "epoch": 0.08877122820852226, "grad_norm": 0.4923828536733769, "learning_rate": 4.979494476682647e-06, "loss": 0.2018, "step": 394 }, { "epoch": 0.0889965358943307, "grad_norm": 0.48812777672235563, "learning_rate": 4.979255678488332e-06, "loss": 0.1871, "step": 395 }, { "epoch": 0.08922184358013913, "grad_norm": 0.48533989253285686, "learning_rate": 4.979015503655751e-06, "loss": 0.2001, "step": 396 }, { "epoch": 0.08944715126594756, "grad_norm": 0.500758368828542, "learning_rate": 4.978773952318263e-06, "loss": 0.2083, "step": 397 }, { "epoch": 0.08967245895175599, "grad_norm": 0.4751863671531154, "learning_rate": 4.978531024609994e-06, "loss": 0.2025, "step": 398 }, { "epoch": 0.08989776663756442, "grad_norm": 0.5028504445052441, "learning_rate": 4.978286720665832e-06, "loss": 0.2034, "step": 399 }, { "epoch": 0.09012307432337285, "grad_norm": 0.4840968995820041, "learning_rate": 4.978041040621428e-06, "loss": 0.1908, "step": 400 }, { "epoch": 0.09034838200918129, "grad_norm": 0.49160319980568024, "learning_rate": 4.977793984613202e-06, "loss": 0.1944, "step": 401 }, { "epoch": 0.09057368969498972, "grad_norm": 0.4973629969371634, "learning_rate": 4.977545552778333e-06, "loss": 0.1962, "step": 402 }, { "epoch": 0.09079899738079815, "grad_norm": 0.47691284590905275, "learning_rate": 4.977295745254766e-06, "loss": 0.1914, "step": 403 }, { "epoch": 0.09102430506660658, "grad_norm": 0.47003075599798927, "learning_rate": 4.977044562181212e-06, "loss": 0.1847, "step": 404 }, { "epoch": 0.09124961275241501, "grad_norm": 0.476135794675388, "learning_rate": 4.9767920036971406e-06, "loss": 0.1924, "step": 405 }, { "epoch": 0.09147492043822344, "grad_norm": 0.4801530579110471, "learning_rate": 4.9765380699427905e-06, "loss": 0.1794, "step": 406 }, { "epoch": 0.09170022812403188, "grad_norm": 0.5585643748818848, "learning_rate": 4.97628276105916e-06, "loss": 0.2114, "step": 407 }, { "epoch": 0.09192553580984031, "grad_norm": 0.49406470093699134, "learning_rate": 4.976026077188013e-06, "loss": 0.1964, "step": 408 }, { "epoch": 0.09215084349564874, "grad_norm": 0.48542519989421573, "learning_rate": 4.975768018471877e-06, "loss": 0.1783, "step": 409 }, { "epoch": 0.09237615118145717, "grad_norm": 0.48764410824033644, "learning_rate": 4.9755085850540426e-06, "loss": 0.198, "step": 410 }, { "epoch": 0.0926014588672656, "grad_norm": 0.504358514720316, "learning_rate": 4.9752477770785625e-06, "loss": 0.201, "step": 411 }, { "epoch": 0.09282676655307404, "grad_norm": 0.49485581002695883, "learning_rate": 4.974985594690255e-06, "loss": 0.1972, "step": 412 }, { "epoch": 0.09305207423888247, "grad_norm": 0.4770605027062997, "learning_rate": 4.9747220380346975e-06, "loss": 0.1769, "step": 413 }, { "epoch": 0.0932773819246909, "grad_norm": 0.5054068132119154, "learning_rate": 4.9744571072582365e-06, "loss": 0.2051, "step": 414 }, { "epoch": 0.09350268961049933, "grad_norm": 0.5064939644802016, "learning_rate": 4.974190802507977e-06, "loss": 0.1935, "step": 415 }, { "epoch": 0.09372799729630776, "grad_norm": 0.4889748679569221, "learning_rate": 4.973923123931786e-06, "loss": 0.191, "step": 416 }, { "epoch": 0.09395330498211621, "grad_norm": 0.5089835583101964, "learning_rate": 4.973654071678299e-06, "loss": 0.1927, "step": 417 }, { "epoch": 0.09417861266792464, "grad_norm": 0.5729086031783864, "learning_rate": 4.973383645896908e-06, "loss": 0.1983, "step": 418 }, { "epoch": 0.09440392035373307, "grad_norm": 0.4816122719585705, "learning_rate": 4.973111846737772e-06, "loss": 0.1841, "step": 419 }, { "epoch": 0.0946292280395415, "grad_norm": 0.5084418909549541, "learning_rate": 4.97283867435181e-06, "loss": 0.1868, "step": 420 }, { "epoch": 0.09485453572534994, "grad_norm": 0.5105786107510725, "learning_rate": 4.972564128890704e-06, "loss": 0.2074, "step": 421 }, { "epoch": 0.09507984341115837, "grad_norm": 0.48927251560238677, "learning_rate": 4.972288210506902e-06, "loss": 0.1945, "step": 422 }, { "epoch": 0.0953051510969668, "grad_norm": 0.4939682917412821, "learning_rate": 4.972010919353606e-06, "loss": 0.1901, "step": 423 }, { "epoch": 0.09553045878277523, "grad_norm": 0.49015495094460076, "learning_rate": 4.971732255584789e-06, "loss": 0.1925, "step": 424 }, { "epoch": 0.09575576646858366, "grad_norm": 0.4959299200961116, "learning_rate": 4.971452219355182e-06, "loss": 0.2026, "step": 425 }, { "epoch": 0.0959810741543921, "grad_norm": 0.5064117553090074, "learning_rate": 4.971170810820279e-06, "loss": 0.192, "step": 426 }, { "epoch": 0.09620638184020053, "grad_norm": 0.49162239980799377, "learning_rate": 4.970888030136335e-06, "loss": 0.1896, "step": 427 }, { "epoch": 0.09643168952600896, "grad_norm": 0.5188915261797445, "learning_rate": 4.970603877460367e-06, "loss": 0.2043, "step": 428 }, { "epoch": 0.09665699721181739, "grad_norm": 0.5248312270545016, "learning_rate": 4.970318352950155e-06, "loss": 0.2105, "step": 429 }, { "epoch": 0.09688230489762582, "grad_norm": 0.4947470788962499, "learning_rate": 4.970031456764242e-06, "loss": 0.2016, "step": 430 }, { "epoch": 0.09710761258343426, "grad_norm": 0.4725899497424823, "learning_rate": 4.9697431890619265e-06, "loss": 0.1892, "step": 431 }, { "epoch": 0.09733292026924269, "grad_norm": 0.5106782555738607, "learning_rate": 4.969453550003277e-06, "loss": 0.1916, "step": 432 }, { "epoch": 0.09755822795505112, "grad_norm": 0.5277102908599764, "learning_rate": 4.969162539749117e-06, "loss": 0.1952, "step": 433 }, { "epoch": 0.09778353564085955, "grad_norm": 0.47760908967624405, "learning_rate": 4.9688701584610345e-06, "loss": 0.1831, "step": 434 }, { "epoch": 0.09800884332666798, "grad_norm": 0.44518578891497956, "learning_rate": 4.968576406301377e-06, "loss": 0.1836, "step": 435 }, { "epoch": 0.09823415101247641, "grad_norm": 0.47596718675919764, "learning_rate": 4.968281283433256e-06, "loss": 0.1927, "step": 436 }, { "epoch": 0.09845945869828485, "grad_norm": 0.4776725350497191, "learning_rate": 4.96798479002054e-06, "loss": 0.1812, "step": 437 }, { "epoch": 0.09868476638409328, "grad_norm": 0.5053833290898005, "learning_rate": 4.967686926227862e-06, "loss": 0.1921, "step": 438 }, { "epoch": 0.09891007406990171, "grad_norm": 0.5190307175787031, "learning_rate": 4.967387692220615e-06, "loss": 0.2063, "step": 439 }, { "epoch": 0.09913538175571014, "grad_norm": 0.47988599552415206, "learning_rate": 4.967087088164951e-06, "loss": 0.1887, "step": 440 }, { "epoch": 0.09936068944151857, "grad_norm": 0.4763852121519757, "learning_rate": 4.966785114227785e-06, "loss": 0.1871, "step": 441 }, { "epoch": 0.099585997127327, "grad_norm": 0.5291626373728735, "learning_rate": 4.966481770576793e-06, "loss": 0.1982, "step": 442 }, { "epoch": 0.09981130481313544, "grad_norm": 0.4999379174851594, "learning_rate": 4.966177057380409e-06, "loss": 0.186, "step": 443 }, { "epoch": 0.10003661249894387, "grad_norm": 0.5127885084474579, "learning_rate": 4.965870974807829e-06, "loss": 0.1945, "step": 444 }, { "epoch": 0.1002619201847523, "grad_norm": 0.5302768350911209, "learning_rate": 4.96556352302901e-06, "loss": 0.2085, "step": 445 }, { "epoch": 0.10048722787056073, "grad_norm": 0.5240819112362862, "learning_rate": 4.965254702214668e-06, "loss": 0.1919, "step": 446 }, { "epoch": 0.10071253555636916, "grad_norm": 0.4954501664107016, "learning_rate": 4.96494451253628e-06, "loss": 0.1869, "step": 447 }, { "epoch": 0.1009378432421776, "grad_norm": 0.5242909937666382, "learning_rate": 4.964632954166081e-06, "loss": 0.2166, "step": 448 }, { "epoch": 0.10116315092798603, "grad_norm": 0.4544684683438984, "learning_rate": 4.964320027277071e-06, "loss": 0.1664, "step": 449 }, { "epoch": 0.10138845861379446, "grad_norm": 0.49379164665615777, "learning_rate": 4.964005732043003e-06, "loss": 0.1989, "step": 450 }, { "epoch": 0.10161376629960289, "grad_norm": 0.4942749092145873, "learning_rate": 4.963690068638397e-06, "loss": 0.191, "step": 451 }, { "epoch": 0.10183907398541132, "grad_norm": 0.5246799570633436, "learning_rate": 4.963373037238527e-06, "loss": 0.1971, "step": 452 }, { "epoch": 0.10206438167121976, "grad_norm": 0.5302756608759575, "learning_rate": 4.963054638019429e-06, "loss": 0.1921, "step": 453 }, { "epoch": 0.10228968935702819, "grad_norm": 0.49314602619260695, "learning_rate": 4.9627348711578996e-06, "loss": 0.1813, "step": 454 }, { "epoch": 0.10251499704283662, "grad_norm": 0.5039489074404687, "learning_rate": 4.962413736831491e-06, "loss": 0.2017, "step": 455 }, { "epoch": 0.10274030472864505, "grad_norm": 0.5533083712534903, "learning_rate": 4.962091235218518e-06, "loss": 0.2014, "step": 456 }, { "epoch": 0.10296561241445348, "grad_norm": 0.5200375326910797, "learning_rate": 4.961767366498055e-06, "loss": 0.201, "step": 457 }, { "epoch": 0.10319092010026192, "grad_norm": 0.5145908578521863, "learning_rate": 4.961442130849933e-06, "loss": 0.194, "step": 458 }, { "epoch": 0.10341622778607035, "grad_norm": 0.518272482827854, "learning_rate": 4.961115528454743e-06, "loss": 0.2125, "step": 459 }, { "epoch": 0.10364153547187878, "grad_norm": 0.4877010421344951, "learning_rate": 4.960787559493836e-06, "loss": 0.1833, "step": 460 }, { "epoch": 0.10386684315768721, "grad_norm": 0.5327688445453027, "learning_rate": 4.96045822414932e-06, "loss": 0.1971, "step": 461 }, { "epoch": 0.10409215084349564, "grad_norm": 0.5036310460862861, "learning_rate": 4.960127522604065e-06, "loss": 0.1895, "step": 462 }, { "epoch": 0.10431745852930407, "grad_norm": 0.5025301179355007, "learning_rate": 4.959795455041694e-06, "loss": 0.1883, "step": 463 }, { "epoch": 0.1045427662151125, "grad_norm": 0.5472446648835756, "learning_rate": 4.959462021646593e-06, "loss": 0.201, "step": 464 }, { "epoch": 0.10476807390092094, "grad_norm": 0.4730210205377272, "learning_rate": 4.959127222603905e-06, "loss": 0.1696, "step": 465 }, { "epoch": 0.10499338158672938, "grad_norm": 0.5300199033723536, "learning_rate": 4.958791058099533e-06, "loss": 0.2005, "step": 466 }, { "epoch": 0.10521868927253782, "grad_norm": 0.49556193439744467, "learning_rate": 4.958453528320135e-06, "loss": 0.1804, "step": 467 }, { "epoch": 0.10544399695834625, "grad_norm": 0.4865416664678349, "learning_rate": 4.95811463345313e-06, "loss": 0.1928, "step": 468 }, { "epoch": 0.10566930464415468, "grad_norm": 0.5363619996168147, "learning_rate": 4.957774373686692e-06, "loss": 0.21, "step": 469 }, { "epoch": 0.10589461232996311, "grad_norm": 0.5064762335561315, "learning_rate": 4.957432749209755e-06, "loss": 0.2008, "step": 470 }, { "epoch": 0.10611992001577154, "grad_norm": 0.5144385896465787, "learning_rate": 4.95708976021201e-06, "loss": 0.1943, "step": 471 }, { "epoch": 0.10634522770157998, "grad_norm": 0.4726824280116551, "learning_rate": 4.956745406883909e-06, "loss": 0.1933, "step": 472 }, { "epoch": 0.10657053538738841, "grad_norm": 0.47149073477224396, "learning_rate": 4.956399689416654e-06, "loss": 0.1814, "step": 473 }, { "epoch": 0.10679584307319684, "grad_norm": 0.5120111770818855, "learning_rate": 4.956052608002212e-06, "loss": 0.1997, "step": 474 }, { "epoch": 0.10702115075900527, "grad_norm": 0.5031173640502962, "learning_rate": 4.9557041628333046e-06, "loss": 0.1989, "step": 475 }, { "epoch": 0.1072464584448137, "grad_norm": 0.5063321024933504, "learning_rate": 4.9553543541034086e-06, "loss": 0.1733, "step": 476 }, { "epoch": 0.10747176613062213, "grad_norm": 0.4792989721434702, "learning_rate": 4.955003182006761e-06, "loss": 0.1894, "step": 477 }, { "epoch": 0.10769707381643057, "grad_norm": 0.5184160979761754, "learning_rate": 4.954650646738354e-06, "loss": 0.1981, "step": 478 }, { "epoch": 0.107922381502239, "grad_norm": 0.4994948610360651, "learning_rate": 4.954296748493938e-06, "loss": 0.1903, "step": 479 }, { "epoch": 0.10814768918804743, "grad_norm": 0.49550630583633326, "learning_rate": 4.953941487470017e-06, "loss": 0.1872, "step": 480 }, { "epoch": 0.10837299687385586, "grad_norm": 0.4979423138808952, "learning_rate": 4.9535848638638586e-06, "loss": 0.1916, "step": 481 }, { "epoch": 0.1085983045596643, "grad_norm": 0.5108535093532186, "learning_rate": 4.953226877873479e-06, "loss": 0.2137, "step": 482 }, { "epoch": 0.10882361224547273, "grad_norm": 0.4914789380535188, "learning_rate": 4.952867529697656e-06, "loss": 0.1898, "step": 483 }, { "epoch": 0.10904891993128116, "grad_norm": 0.4996741383080328, "learning_rate": 4.952506819535922e-06, "loss": 0.193, "step": 484 }, { "epoch": 0.10927422761708959, "grad_norm": 0.4756841066506028, "learning_rate": 4.952144747588566e-06, "loss": 0.2049, "step": 485 }, { "epoch": 0.10949953530289802, "grad_norm": 0.5312558416236551, "learning_rate": 4.951781314056633e-06, "loss": 0.196, "step": 486 }, { "epoch": 0.10972484298870645, "grad_norm": 0.47800268646991784, "learning_rate": 4.951416519141923e-06, "loss": 0.1961, "step": 487 }, { "epoch": 0.10995015067451488, "grad_norm": 0.4866331740965868, "learning_rate": 4.951050363046995e-06, "loss": 0.1779, "step": 488 }, { "epoch": 0.11017545836032332, "grad_norm": 0.5149337838452025, "learning_rate": 4.95068284597516e-06, "loss": 0.2117, "step": 489 }, { "epoch": 0.11040076604613175, "grad_norm": 0.5196237692139258, "learning_rate": 4.950313968130488e-06, "loss": 0.2082, "step": 490 }, { "epoch": 0.11062607373194018, "grad_norm": 0.5041250602592621, "learning_rate": 4.949943729717802e-06, "loss": 0.1971, "step": 491 }, { "epoch": 0.11085138141774861, "grad_norm": 0.476324372109053, "learning_rate": 4.949572130942683e-06, "loss": 0.1911, "step": 492 }, { "epoch": 0.11107668910355704, "grad_norm": 0.46691888238357065, "learning_rate": 4.949199172011464e-06, "loss": 0.1881, "step": 493 }, { "epoch": 0.11130199678936548, "grad_norm": 0.5370756908861958, "learning_rate": 4.948824853131237e-06, "loss": 0.2091, "step": 494 }, { "epoch": 0.11152730447517391, "grad_norm": 0.481306717114356, "learning_rate": 4.948449174509846e-06, "loss": 0.1981, "step": 495 }, { "epoch": 0.11175261216098234, "grad_norm": 0.5240621763320067, "learning_rate": 4.948072136355892e-06, "loss": 0.1869, "step": 496 }, { "epoch": 0.11197791984679077, "grad_norm": 0.5353721219133655, "learning_rate": 4.94769373887873e-06, "loss": 0.2026, "step": 497 }, { "epoch": 0.1122032275325992, "grad_norm": 0.48135528686973533, "learning_rate": 4.94731398228847e-06, "loss": 0.2066, "step": 498 }, { "epoch": 0.11242853521840764, "grad_norm": 0.5284129479374139, "learning_rate": 4.946932866795977e-06, "loss": 0.1989, "step": 499 }, { "epoch": 0.11265384290421607, "grad_norm": 0.5209491252410734, "learning_rate": 4.94655039261287e-06, "loss": 0.1991, "step": 500 }, { "epoch": 0.11265384290421607, "eval_loss": 0.19234976172447205, "eval_runtime": 56.8076, "eval_samples_per_second": 50.521, "eval_steps_per_second": 6.32, "step": 500 }, { "epoch": 0.1128791505900245, "grad_norm": 0.4663315177005309, "learning_rate": 4.946166559951523e-06, "loss": 0.1823, "step": 501 }, { "epoch": 0.11310445827583293, "grad_norm": 0.5043217933291648, "learning_rate": 4.9457813690250635e-06, "loss": 0.1964, "step": 502 }, { "epoch": 0.11332976596164136, "grad_norm": 0.5055573412209956, "learning_rate": 4.945394820047373e-06, "loss": 0.1881, "step": 503 }, { "epoch": 0.1135550736474498, "grad_norm": 0.48836811641356404, "learning_rate": 4.94500691323309e-06, "loss": 0.1848, "step": 504 }, { "epoch": 0.11378038133325823, "grad_norm": 0.5455722455051317, "learning_rate": 4.944617648797602e-06, "loss": 0.2136, "step": 505 }, { "epoch": 0.11400568901906666, "grad_norm": 0.4926313400228012, "learning_rate": 4.9442270269570545e-06, "loss": 0.1751, "step": 506 }, { "epoch": 0.11423099670487509, "grad_norm": 0.4915724081406104, "learning_rate": 4.943835047928346e-06, "loss": 0.1686, "step": 507 }, { "epoch": 0.11445630439068352, "grad_norm": 0.5428277431714841, "learning_rate": 4.943441711929126e-06, "loss": 0.2063, "step": 508 }, { "epoch": 0.11468161207649195, "grad_norm": 0.5182829108034406, "learning_rate": 4.9430470191778e-06, "loss": 0.2048, "step": 509 }, { "epoch": 0.11490691976230039, "grad_norm": 0.4919127976132606, "learning_rate": 4.942650969893527e-06, "loss": 0.1903, "step": 510 }, { "epoch": 0.11513222744810882, "grad_norm": 0.5136285948187803, "learning_rate": 4.942253564296217e-06, "loss": 0.2014, "step": 511 }, { "epoch": 0.11535753513391725, "grad_norm": 0.48212990917389625, "learning_rate": 4.941854802606537e-06, "loss": 0.1882, "step": 512 }, { "epoch": 0.11558284281972568, "grad_norm": 0.4586425822677684, "learning_rate": 4.9414546850459014e-06, "loss": 0.175, "step": 513 }, { "epoch": 0.11580815050553411, "grad_norm": 0.47705894227746765, "learning_rate": 4.941053211836482e-06, "loss": 0.1764, "step": 514 }, { "epoch": 0.11603345819134256, "grad_norm": 0.5167055192881297, "learning_rate": 4.940650383201202e-06, "loss": 0.1909, "step": 515 }, { "epoch": 0.11625876587715099, "grad_norm": 0.5060498826515504, "learning_rate": 4.940246199363737e-06, "loss": 0.1867, "step": 516 }, { "epoch": 0.11648407356295942, "grad_norm": 0.5104025646241316, "learning_rate": 4.939840660548515e-06, "loss": 0.1956, "step": 517 }, { "epoch": 0.11670938124876785, "grad_norm": 0.505292042282106, "learning_rate": 4.939433766980717e-06, "loss": 0.1794, "step": 518 }, { "epoch": 0.11693468893457629, "grad_norm": 0.5223802995592578, "learning_rate": 4.939025518886276e-06, "loss": 0.1986, "step": 519 }, { "epoch": 0.11715999662038472, "grad_norm": 0.47521330675412077, "learning_rate": 4.9386159164918764e-06, "loss": 0.183, "step": 520 }, { "epoch": 0.11738530430619315, "grad_norm": 0.5750076163542128, "learning_rate": 4.938204960024955e-06, "loss": 0.2033, "step": 521 }, { "epoch": 0.11761061199200158, "grad_norm": 0.5233779578600689, "learning_rate": 4.937792649713701e-06, "loss": 0.1864, "step": 522 }, { "epoch": 0.11783591967781001, "grad_norm": 0.46217847479401275, "learning_rate": 4.937378985787055e-06, "loss": 0.1753, "step": 523 }, { "epoch": 0.11806122736361845, "grad_norm": 0.4974554874716037, "learning_rate": 4.9369639684747095e-06, "loss": 0.1836, "step": 524 }, { "epoch": 0.11828653504942688, "grad_norm": 0.4923126383015336, "learning_rate": 4.936547598007107e-06, "loss": 0.193, "step": 525 }, { "epoch": 0.11851184273523531, "grad_norm": 0.5398798000068692, "learning_rate": 4.936129874615443e-06, "loss": 0.1755, "step": 526 }, { "epoch": 0.11873715042104374, "grad_norm": 0.47146590078183515, "learning_rate": 4.935710798531664e-06, "loss": 0.1971, "step": 527 }, { "epoch": 0.11896245810685217, "grad_norm": 0.5491858771837667, "learning_rate": 4.935290369988468e-06, "loss": 0.2056, "step": 528 }, { "epoch": 0.1191877657926606, "grad_norm": 0.5388147308207761, "learning_rate": 4.934868589219302e-06, "loss": 0.2001, "step": 529 }, { "epoch": 0.11941307347846904, "grad_norm": 0.5083952861725305, "learning_rate": 4.934445456458366e-06, "loss": 0.1888, "step": 530 }, { "epoch": 0.11963838116427747, "grad_norm": 0.5169297785302316, "learning_rate": 4.934020971940609e-06, "loss": 0.2023, "step": 531 }, { "epoch": 0.1198636888500859, "grad_norm": 0.5095162486691678, "learning_rate": 4.933595135901733e-06, "loss": 0.1906, "step": 532 }, { "epoch": 0.12008899653589433, "grad_norm": 0.48223521419999554, "learning_rate": 4.933167948578187e-06, "loss": 0.1838, "step": 533 }, { "epoch": 0.12031430422170276, "grad_norm": 0.5444055294135087, "learning_rate": 4.932739410207172e-06, "loss": 0.2002, "step": 534 }, { "epoch": 0.1205396119075112, "grad_norm": 0.5109454104003555, "learning_rate": 4.932309521026643e-06, "loss": 0.1927, "step": 535 }, { "epoch": 0.12076491959331963, "grad_norm": 0.48724028911776623, "learning_rate": 4.931878281275296e-06, "loss": 0.1844, "step": 536 }, { "epoch": 0.12099022727912806, "grad_norm": 0.4777988104796984, "learning_rate": 4.931445691192587e-06, "loss": 0.1785, "step": 537 }, { "epoch": 0.12121553496493649, "grad_norm": 0.5021184008826821, "learning_rate": 4.931011751018715e-06, "loss": 0.1883, "step": 538 }, { "epoch": 0.12144084265074492, "grad_norm": 0.45767088453705446, "learning_rate": 4.930576460994631e-06, "loss": 0.1744, "step": 539 }, { "epoch": 0.12166615033655336, "grad_norm": 0.5156528768504517, "learning_rate": 4.930139821362036e-06, "loss": 0.1957, "step": 540 }, { "epoch": 0.12189145802236179, "grad_norm": 0.49466412609385285, "learning_rate": 4.929701832363379e-06, "loss": 0.1932, "step": 541 }, { "epoch": 0.12211676570817022, "grad_norm": 0.454600799745477, "learning_rate": 4.929262494241859e-06, "loss": 0.173, "step": 542 }, { "epoch": 0.12234207339397865, "grad_norm": 0.48395076267227993, "learning_rate": 4.928821807241425e-06, "loss": 0.1864, "step": 543 }, { "epoch": 0.12256738107978708, "grad_norm": 0.472630796102665, "learning_rate": 4.928379771606773e-06, "loss": 0.1827, "step": 544 }, { "epoch": 0.12279268876559551, "grad_norm": 0.5131023372757877, "learning_rate": 4.927936387583348e-06, "loss": 0.2072, "step": 545 }, { "epoch": 0.12301799645140395, "grad_norm": 0.4599619189881397, "learning_rate": 4.927491655417347e-06, "loss": 0.1772, "step": 546 }, { "epoch": 0.12324330413721238, "grad_norm": 0.4936735496643357, "learning_rate": 4.927045575355712e-06, "loss": 0.2063, "step": 547 }, { "epoch": 0.12346861182302081, "grad_norm": 0.5223093419687126, "learning_rate": 4.926598147646134e-06, "loss": 0.1932, "step": 548 }, { "epoch": 0.12369391950882924, "grad_norm": 0.458455620184728, "learning_rate": 4.9261493725370546e-06, "loss": 0.1677, "step": 549 }, { "epoch": 0.12391922719463767, "grad_norm": 0.49240690720846386, "learning_rate": 4.9256992502776605e-06, "loss": 0.1839, "step": 550 }, { "epoch": 0.1241445348804461, "grad_norm": 0.47757607892838266, "learning_rate": 4.925247781117888e-06, "loss": 0.1733, "step": 551 }, { "epoch": 0.12436984256625454, "grad_norm": 0.4838343579732949, "learning_rate": 4.924794965308421e-06, "loss": 0.1763, "step": 552 }, { "epoch": 0.12459515025206297, "grad_norm": 0.5265568200531762, "learning_rate": 4.924340803100692e-06, "loss": 0.2014, "step": 553 }, { "epoch": 0.1248204579378714, "grad_norm": 0.48214538271531815, "learning_rate": 4.9238852947468796e-06, "loss": 0.1857, "step": 554 }, { "epoch": 0.12504576562367983, "grad_norm": 0.5343120785647222, "learning_rate": 4.923428440499912e-06, "loss": 0.1906, "step": 555 }, { "epoch": 0.12527107330948828, "grad_norm": 0.5235638112156517, "learning_rate": 4.922970240613461e-06, "loss": 0.191, "step": 556 }, { "epoch": 0.1254963809952967, "grad_norm": 0.4889226147171265, "learning_rate": 4.92251069534195e-06, "loss": 0.1949, "step": 557 }, { "epoch": 0.12572168868110514, "grad_norm": 0.5040294823792546, "learning_rate": 4.922049804940546e-06, "loss": 0.194, "step": 558 }, { "epoch": 0.12594699636691356, "grad_norm": 0.5391772652484352, "learning_rate": 4.9215875696651645e-06, "loss": 0.1944, "step": 559 }, { "epoch": 0.126172304052722, "grad_norm": 0.5025810534944168, "learning_rate": 4.9211239897724685e-06, "loss": 0.1982, "step": 560 }, { "epoch": 0.12639761173853042, "grad_norm": 0.4798927968328912, "learning_rate": 4.920659065519866e-06, "loss": 0.1968, "step": 561 }, { "epoch": 0.12662291942433887, "grad_norm": 0.5166664453113821, "learning_rate": 4.920192797165511e-06, "loss": 0.1843, "step": 562 }, { "epoch": 0.1268482271101473, "grad_norm": 0.567222736069852, "learning_rate": 4.919725184968307e-06, "loss": 0.1982, "step": 563 }, { "epoch": 0.12707353479595573, "grad_norm": 0.48741393189065063, "learning_rate": 4.9192562291879e-06, "loss": 0.184, "step": 564 }, { "epoch": 0.12729884248176415, "grad_norm": 0.5093830078546595, "learning_rate": 4.9187859300846845e-06, "loss": 0.1852, "step": 565 }, { "epoch": 0.1275241501675726, "grad_norm": 0.4947704192884275, "learning_rate": 4.9183142879198e-06, "loss": 0.1991, "step": 566 }, { "epoch": 0.12774945785338102, "grad_norm": 0.48047644785859844, "learning_rate": 4.917841302955132e-06, "loss": 0.189, "step": 567 }, { "epoch": 0.12797476553918946, "grad_norm": 0.49149253370292334, "learning_rate": 4.917366975453311e-06, "loss": 0.1846, "step": 568 }, { "epoch": 0.12820007322499788, "grad_norm": 0.4772556842972461, "learning_rate": 4.916891305677712e-06, "loss": 0.1812, "step": 569 }, { "epoch": 0.12842538091080632, "grad_norm": 0.4967161370998577, "learning_rate": 4.9164142938924595e-06, "loss": 0.1943, "step": 570 }, { "epoch": 0.12865068859661474, "grad_norm": 0.4966530996832579, "learning_rate": 4.9159359403624185e-06, "loss": 0.1963, "step": 571 }, { "epoch": 0.1288759962824232, "grad_norm": 0.49264929970143695, "learning_rate": 4.915456245353202e-06, "loss": 0.1752, "step": 572 }, { "epoch": 0.1291013039682316, "grad_norm": 0.4859626874941514, "learning_rate": 4.914975209131165e-06, "loss": 0.1937, "step": 573 }, { "epoch": 0.12932661165404005, "grad_norm": 0.5043817672029466, "learning_rate": 4.914492831963411e-06, "loss": 0.1937, "step": 574 }, { "epoch": 0.12955191933984847, "grad_norm": 0.5015582050349247, "learning_rate": 4.9140091141177856e-06, "loss": 0.201, "step": 575 }, { "epoch": 0.12977722702565692, "grad_norm": 0.5146518830297795, "learning_rate": 4.9135240558628786e-06, "loss": 0.2031, "step": 576 }, { "epoch": 0.13000253471146533, "grad_norm": 0.4854388066659389, "learning_rate": 4.913037657468025e-06, "loss": 0.1913, "step": 577 }, { "epoch": 0.13022784239727378, "grad_norm": 0.48652005999572784, "learning_rate": 4.9125499192033035e-06, "loss": 0.1823, "step": 578 }, { "epoch": 0.1304531500830822, "grad_norm": 0.46886496437160863, "learning_rate": 4.912060841339536e-06, "loss": 0.1681, "step": 579 }, { "epoch": 0.13067845776889064, "grad_norm": 0.4733097967338571, "learning_rate": 4.911570424148293e-06, "loss": 0.1781, "step": 580 }, { "epoch": 0.13090376545469906, "grad_norm": 0.4966002182337164, "learning_rate": 4.911078667901881e-06, "loss": 0.2003, "step": 581 }, { "epoch": 0.1311290731405075, "grad_norm": 0.46547777291654663, "learning_rate": 4.910585572873355e-06, "loss": 0.1814, "step": 582 }, { "epoch": 0.13135438082631593, "grad_norm": 0.500486589164084, "learning_rate": 4.9100911393365134e-06, "loss": 0.1627, "step": 583 }, { "epoch": 0.13157968851212437, "grad_norm": 0.4955517290997085, "learning_rate": 4.9095953675658945e-06, "loss": 0.1972, "step": 584 }, { "epoch": 0.1318049961979328, "grad_norm": 0.4988068045683488, "learning_rate": 4.909098257836784e-06, "loss": 0.1696, "step": 585 }, { "epoch": 0.13203030388374123, "grad_norm": 0.5068279577450977, "learning_rate": 4.908599810425208e-06, "loss": 0.1959, "step": 586 }, { "epoch": 0.13225561156954965, "grad_norm": 0.4847712941015659, "learning_rate": 4.908100025607935e-06, "loss": 0.1839, "step": 587 }, { "epoch": 0.1324809192553581, "grad_norm": 0.4745230551209773, "learning_rate": 4.907598903662477e-06, "loss": 0.1774, "step": 588 }, { "epoch": 0.13270622694116654, "grad_norm": 0.5141740196225114, "learning_rate": 4.90709644486709e-06, "loss": 0.2046, "step": 589 }, { "epoch": 0.13293153462697496, "grad_norm": 0.5221614658472497, "learning_rate": 4.906592649500767e-06, "loss": 0.1908, "step": 590 }, { "epoch": 0.1331568423127834, "grad_norm": 0.4940345638894506, "learning_rate": 4.906087517843251e-06, "loss": 0.1838, "step": 591 }, { "epoch": 0.13338214999859183, "grad_norm": 0.4688033247262004, "learning_rate": 4.9055810501750205e-06, "loss": 0.1766, "step": 592 }, { "epoch": 0.13360745768440027, "grad_norm": 0.48052014823475525, "learning_rate": 4.905073246777298e-06, "loss": 0.1848, "step": 593 }, { "epoch": 0.1338327653702087, "grad_norm": 0.48977849616782837, "learning_rate": 4.904564107932048e-06, "loss": 0.1737, "step": 594 }, { "epoch": 0.13405807305601714, "grad_norm": 0.49857557136653785, "learning_rate": 4.904053633921977e-06, "loss": 0.1812, "step": 595 }, { "epoch": 0.13428338074182555, "grad_norm": 0.5195821127291367, "learning_rate": 4.9035418250305314e-06, "loss": 0.1994, "step": 596 }, { "epoch": 0.134508688427634, "grad_norm": 0.4848753612469132, "learning_rate": 4.9030286815419e-06, "loss": 0.1746, "step": 597 }, { "epoch": 0.13473399611344242, "grad_norm": 0.4651188815320874, "learning_rate": 4.902514203741013e-06, "loss": 0.1717, "step": 598 }, { "epoch": 0.13495930379925086, "grad_norm": 0.49092504275491067, "learning_rate": 4.901998391913539e-06, "loss": 0.1742, "step": 599 }, { "epoch": 0.13518461148505928, "grad_norm": 0.48578109490095983, "learning_rate": 4.9014812463458905e-06, "loss": 0.1768, "step": 600 }, { "epoch": 0.13540991917086773, "grad_norm": 0.5097924287582863, "learning_rate": 4.9009627673252195e-06, "loss": 0.1925, "step": 601 }, { "epoch": 0.13563522685667614, "grad_norm": 0.49925568640506196, "learning_rate": 4.9004429551394155e-06, "loss": 0.1901, "step": 602 }, { "epoch": 0.1358605345424846, "grad_norm": 0.5160976182113821, "learning_rate": 4.899921810077114e-06, "loss": 0.194, "step": 603 }, { "epoch": 0.136085842228293, "grad_norm": 0.5130419470354679, "learning_rate": 4.899399332427685e-06, "loss": 0.181, "step": 604 }, { "epoch": 0.13631114991410145, "grad_norm": 0.49486082839466233, "learning_rate": 4.898875522481242e-06, "loss": 0.1853, "step": 605 }, { "epoch": 0.13653645759990987, "grad_norm": 0.4648043819729389, "learning_rate": 4.898350380528638e-06, "loss": 0.1893, "step": 606 }, { "epoch": 0.13676176528571832, "grad_norm": 0.4743562941887421, "learning_rate": 4.897823906861463e-06, "loss": 0.1919, "step": 607 }, { "epoch": 0.13698707297152674, "grad_norm": 0.49175914667017384, "learning_rate": 4.89729610177205e-06, "loss": 0.1876, "step": 608 }, { "epoch": 0.13721238065733518, "grad_norm": 0.5060333088563503, "learning_rate": 4.896766965553467e-06, "loss": 0.208, "step": 609 }, { "epoch": 0.1374376883431436, "grad_norm": 0.47294188413884825, "learning_rate": 4.896236498499526e-06, "loss": 0.1988, "step": 610 }, { "epoch": 0.13766299602895204, "grad_norm": 0.4606626602858298, "learning_rate": 4.8957047009047744e-06, "loss": 0.1752, "step": 611 }, { "epoch": 0.13788830371476046, "grad_norm": 0.47273517792676306, "learning_rate": 4.8951715730645e-06, "loss": 0.1884, "step": 612 }, { "epoch": 0.1381136114005689, "grad_norm": 0.46148619697171755, "learning_rate": 4.894637115274728e-06, "loss": 0.1885, "step": 613 }, { "epoch": 0.13833891908637733, "grad_norm": 0.5018754666434547, "learning_rate": 4.894101327832225e-06, "loss": 0.2106, "step": 614 }, { "epoch": 0.13856422677218577, "grad_norm": 0.4651687639877327, "learning_rate": 4.893564211034492e-06, "loss": 0.1792, "step": 615 }, { "epoch": 0.1387895344579942, "grad_norm": 0.47670934711892976, "learning_rate": 4.89302576517977e-06, "loss": 0.1896, "step": 616 }, { "epoch": 0.13901484214380264, "grad_norm": 0.4545326790236107, "learning_rate": 4.892485990567037e-06, "loss": 0.1833, "step": 617 }, { "epoch": 0.13924014982961105, "grad_norm": 0.5051419942175441, "learning_rate": 4.891944887496013e-06, "loss": 0.2025, "step": 618 }, { "epoch": 0.1394654575154195, "grad_norm": 0.511578758034126, "learning_rate": 4.891402456267149e-06, "loss": 0.1922, "step": 619 }, { "epoch": 0.13969076520122792, "grad_norm": 0.4935796436295554, "learning_rate": 4.890858697181638e-06, "loss": 0.1915, "step": 620 }, { "epoch": 0.13991607288703636, "grad_norm": 0.502114052015495, "learning_rate": 4.89031361054141e-06, "loss": 0.1922, "step": 621 }, { "epoch": 0.14014138057284478, "grad_norm": 0.510065283262986, "learning_rate": 4.8897671966491315e-06, "loss": 0.1826, "step": 622 }, { "epoch": 0.14036668825865323, "grad_norm": 0.4952918551445192, "learning_rate": 4.889219455808204e-06, "loss": 0.1854, "step": 623 }, { "epoch": 0.14059199594446165, "grad_norm": 0.5052374937308469, "learning_rate": 4.888670388322768e-06, "loss": 0.1985, "step": 624 }, { "epoch": 0.1408173036302701, "grad_norm": 0.49988160987866054, "learning_rate": 4.888119994497701e-06, "loss": 0.1749, "step": 625 }, { "epoch": 0.1410426113160785, "grad_norm": 0.5172442914773804, "learning_rate": 4.887568274638616e-06, "loss": 0.2019, "step": 626 }, { "epoch": 0.14126791900188695, "grad_norm": 0.5087009835749207, "learning_rate": 4.887015229051861e-06, "loss": 0.191, "step": 627 }, { "epoch": 0.14149322668769537, "grad_norm": 0.4959246585025635, "learning_rate": 4.886460858044524e-06, "loss": 0.1857, "step": 628 }, { "epoch": 0.14171853437350382, "grad_norm": 0.4723135163607331, "learning_rate": 4.885905161924426e-06, "loss": 0.1822, "step": 629 }, { "epoch": 0.14194384205931224, "grad_norm": 0.502793877350106, "learning_rate": 4.8853481410001225e-06, "loss": 0.1806, "step": 630 }, { "epoch": 0.14216914974512068, "grad_norm": 0.5082071597376282, "learning_rate": 4.8847897955809085e-06, "loss": 0.1923, "step": 631 }, { "epoch": 0.1423944574309291, "grad_norm": 0.46157106301262735, "learning_rate": 4.884230125976812e-06, "loss": 0.1673, "step": 632 }, { "epoch": 0.14261976511673755, "grad_norm": 0.4917291509510748, "learning_rate": 4.8836691324985955e-06, "loss": 0.1935, "step": 633 }, { "epoch": 0.14284507280254596, "grad_norm": 0.4936790877793207, "learning_rate": 4.883106815457758e-06, "loss": 0.193, "step": 634 }, { "epoch": 0.1430703804883544, "grad_norm": 0.5003405509113616, "learning_rate": 4.882543175166535e-06, "loss": 0.1792, "step": 635 }, { "epoch": 0.14329568817416283, "grad_norm": 0.5285886608390846, "learning_rate": 4.881978211937895e-06, "loss": 0.1984, "step": 636 }, { "epoch": 0.14352099585997127, "grad_norm": 0.49050879740196224, "learning_rate": 4.8814119260855374e-06, "loss": 0.1872, "step": 637 }, { "epoch": 0.14374630354577972, "grad_norm": 0.48682747731833537, "learning_rate": 4.8808443179239025e-06, "loss": 0.1755, "step": 638 }, { "epoch": 0.14397161123158814, "grad_norm": 0.4676177116452303, "learning_rate": 4.880275387768162e-06, "loss": 0.1748, "step": 639 }, { "epoch": 0.14419691891739658, "grad_norm": 0.4951558991868934, "learning_rate": 4.87970513593422e-06, "loss": 0.1843, "step": 640 }, { "epoch": 0.144422226603205, "grad_norm": 0.5213133487107826, "learning_rate": 4.879133562738719e-06, "loss": 0.2134, "step": 641 }, { "epoch": 0.14464753428901345, "grad_norm": 0.46810947829384675, "learning_rate": 4.878560668499029e-06, "loss": 0.1927, "step": 642 }, { "epoch": 0.14487284197482186, "grad_norm": 0.47590414235063444, "learning_rate": 4.8779864535332585e-06, "loss": 0.1743, "step": 643 }, { "epoch": 0.1450981496606303, "grad_norm": 0.4995226249566914, "learning_rate": 4.877410918160247e-06, "loss": 0.1815, "step": 644 }, { "epoch": 0.14532345734643873, "grad_norm": 0.5278580391531498, "learning_rate": 4.876834062699569e-06, "loss": 0.2039, "step": 645 }, { "epoch": 0.14554876503224717, "grad_norm": 0.4843992433234831, "learning_rate": 4.87625588747153e-06, "loss": 0.1935, "step": 646 }, { "epoch": 0.1457740727180556, "grad_norm": 0.4854790733499744, "learning_rate": 4.875676392797169e-06, "loss": 0.1816, "step": 647 }, { "epoch": 0.14599938040386404, "grad_norm": 0.5496275694036449, "learning_rate": 4.875095578998258e-06, "loss": 0.2017, "step": 648 }, { "epoch": 0.14622468808967246, "grad_norm": 0.47840998998543804, "learning_rate": 4.874513446397301e-06, "loss": 0.1718, "step": 649 }, { "epoch": 0.1464499957754809, "grad_norm": 0.472469437986709, "learning_rate": 4.873929995317535e-06, "loss": 0.1813, "step": 650 }, { "epoch": 0.14667530346128932, "grad_norm": 0.49186188067027603, "learning_rate": 4.873345226082929e-06, "loss": 0.1874, "step": 651 }, { "epoch": 0.14690061114709776, "grad_norm": 0.47817285908812057, "learning_rate": 4.872759139018183e-06, "loss": 0.1849, "step": 652 }, { "epoch": 0.14712591883290618, "grad_norm": 0.5128159653562537, "learning_rate": 4.872171734448728e-06, "loss": 0.2035, "step": 653 }, { "epoch": 0.14735122651871463, "grad_norm": 0.4996885022639693, "learning_rate": 4.87158301270073e-06, "loss": 0.1796, "step": 654 }, { "epoch": 0.14757653420452305, "grad_norm": 0.4504567590267657, "learning_rate": 4.870992974101084e-06, "loss": 0.1778, "step": 655 }, { "epoch": 0.1478018418903315, "grad_norm": 0.5086172528062239, "learning_rate": 4.870401618977415e-06, "loss": 0.2074, "step": 656 }, { "epoch": 0.1480271495761399, "grad_norm": 0.49588512436516297, "learning_rate": 4.869808947658082e-06, "loss": 0.1972, "step": 657 }, { "epoch": 0.14825245726194836, "grad_norm": 0.4917383983984287, "learning_rate": 4.869214960472172e-06, "loss": 0.1919, "step": 658 }, { "epoch": 0.14847776494775677, "grad_norm": 0.510112771596669, "learning_rate": 4.868619657749505e-06, "loss": 0.1939, "step": 659 }, { "epoch": 0.14870307263356522, "grad_norm": 0.5446853659905414, "learning_rate": 4.868023039820629e-06, "loss": 0.1979, "step": 660 }, { "epoch": 0.14892838031937364, "grad_norm": 0.47307794313317136, "learning_rate": 4.867425107016826e-06, "loss": 0.1736, "step": 661 }, { "epoch": 0.14915368800518208, "grad_norm": 0.4845079148956392, "learning_rate": 4.8668258596701035e-06, "loss": 0.166, "step": 662 }, { "epoch": 0.1493789956909905, "grad_norm": 0.47209894690292553, "learning_rate": 4.866225298113203e-06, "loss": 0.175, "step": 663 }, { "epoch": 0.14960430337679895, "grad_norm": 0.5525456803515458, "learning_rate": 4.865623422679593e-06, "loss": 0.1965, "step": 664 }, { "epoch": 0.14982961106260737, "grad_norm": 0.5276283864774272, "learning_rate": 4.865020233703472e-06, "loss": 0.1967, "step": 665 }, { "epoch": 0.1500549187484158, "grad_norm": 0.5237611032703069, "learning_rate": 4.864415731519769e-06, "loss": 0.1847, "step": 666 }, { "epoch": 0.15028022643422423, "grad_norm": 0.49191733964287326, "learning_rate": 4.863809916464142e-06, "loss": 0.1843, "step": 667 }, { "epoch": 0.15050553412003267, "grad_norm": 0.5046503834610336, "learning_rate": 4.8632027888729765e-06, "loss": 0.1873, "step": 668 }, { "epoch": 0.1507308418058411, "grad_norm": 0.4761077464697414, "learning_rate": 4.862594349083389e-06, "loss": 0.1659, "step": 669 }, { "epoch": 0.15095614949164954, "grad_norm": 0.49773885573902543, "learning_rate": 4.861984597433223e-06, "loss": 0.1857, "step": 670 }, { "epoch": 0.15118145717745796, "grad_norm": 0.4657062167693691, "learning_rate": 4.861373534261049e-06, "loss": 0.171, "step": 671 }, { "epoch": 0.1514067648632664, "grad_norm": 0.525891884598973, "learning_rate": 4.860761159906171e-06, "loss": 0.1975, "step": 672 }, { "epoch": 0.15163207254907482, "grad_norm": 0.4543247451924693, "learning_rate": 4.8601474747086145e-06, "loss": 0.1705, "step": 673 }, { "epoch": 0.15185738023488327, "grad_norm": 0.49799202938449977, "learning_rate": 4.859532479009138e-06, "loss": 0.192, "step": 674 }, { "epoch": 0.15208268792069168, "grad_norm": 0.48331962531733624, "learning_rate": 4.8589161731492255e-06, "loss": 0.1753, "step": 675 }, { "epoch": 0.15230799560650013, "grad_norm": 0.5156829087990366, "learning_rate": 4.858298557471089e-06, "loss": 0.1893, "step": 676 }, { "epoch": 0.15253330329230855, "grad_norm": 0.5170730850666883, "learning_rate": 4.857679632317664e-06, "loss": 0.1965, "step": 677 }, { "epoch": 0.152758610978117, "grad_norm": 0.5072378427728541, "learning_rate": 4.857059398032622e-06, "loss": 0.1816, "step": 678 }, { "epoch": 0.1529839186639254, "grad_norm": 0.4898983755820026, "learning_rate": 4.856437854960352e-06, "loss": 0.1849, "step": 679 }, { "epoch": 0.15320922634973386, "grad_norm": 0.5264472680458602, "learning_rate": 4.855815003445975e-06, "loss": 0.1971, "step": 680 }, { "epoch": 0.15343453403554227, "grad_norm": 0.5152650134807022, "learning_rate": 4.855190843835338e-06, "loss": 0.1893, "step": 681 }, { "epoch": 0.15365984172135072, "grad_norm": 0.5578264199187081, "learning_rate": 4.8545653764750125e-06, "loss": 0.2006, "step": 682 }, { "epoch": 0.15388514940715914, "grad_norm": 0.46453655362572005, "learning_rate": 4.853938601712297e-06, "loss": 0.1805, "step": 683 }, { "epoch": 0.15411045709296758, "grad_norm": 0.5213029966256437, "learning_rate": 4.853310519895217e-06, "loss": 0.1858, "step": 684 }, { "epoch": 0.154335764778776, "grad_norm": 0.4915514653572081, "learning_rate": 4.852681131372522e-06, "loss": 0.1969, "step": 685 }, { "epoch": 0.15456107246458445, "grad_norm": 0.5281210443216867, "learning_rate": 4.85205043649369e-06, "loss": 0.1912, "step": 686 }, { "epoch": 0.1547863801503929, "grad_norm": 0.46846923934248047, "learning_rate": 4.851418435608919e-06, "loss": 0.1802, "step": 687 }, { "epoch": 0.1550116878362013, "grad_norm": 0.4798823473223289, "learning_rate": 4.850785129069139e-06, "loss": 0.1799, "step": 688 }, { "epoch": 0.15523699552200976, "grad_norm": 0.47986210011175, "learning_rate": 4.850150517225999e-06, "loss": 0.1797, "step": 689 }, { "epoch": 0.15546230320781818, "grad_norm": 0.487039403659413, "learning_rate": 4.849514600431877e-06, "loss": 0.1751, "step": 690 }, { "epoch": 0.15568761089362662, "grad_norm": 0.5225794971495527, "learning_rate": 4.848877379039874e-06, "loss": 0.197, "step": 691 }, { "epoch": 0.15591291857943504, "grad_norm": 0.48848225992863903, "learning_rate": 4.848238853403813e-06, "loss": 0.1862, "step": 692 }, { "epoch": 0.15613822626524348, "grad_norm": 0.512252643675161, "learning_rate": 4.847599023878245e-06, "loss": 0.1908, "step": 693 }, { "epoch": 0.1563635339510519, "grad_norm": 0.47856894671061945, "learning_rate": 4.846957890818444e-06, "loss": 0.1895, "step": 694 }, { "epoch": 0.15658884163686035, "grad_norm": 0.46385857470598735, "learning_rate": 4.846315454580406e-06, "loss": 0.1796, "step": 695 }, { "epoch": 0.15681414932266877, "grad_norm": 0.4768661328540527, "learning_rate": 4.845671715520853e-06, "loss": 0.181, "step": 696 }, { "epoch": 0.1570394570084772, "grad_norm": 0.4603458107546864, "learning_rate": 4.845026673997229e-06, "loss": 0.1729, "step": 697 }, { "epoch": 0.15726476469428563, "grad_norm": 0.4829231227643716, "learning_rate": 4.844380330367701e-06, "loss": 0.1846, "step": 698 }, { "epoch": 0.15749007238009408, "grad_norm": 0.5013746953211577, "learning_rate": 4.843732684991161e-06, "loss": 0.1892, "step": 699 }, { "epoch": 0.1577153800659025, "grad_norm": 0.5322417230269074, "learning_rate": 4.84308373822722e-06, "loss": 0.2024, "step": 700 }, { "epoch": 0.15794068775171094, "grad_norm": 0.4811274861286996, "learning_rate": 4.842433490436217e-06, "loss": 0.178, "step": 701 }, { "epoch": 0.15816599543751936, "grad_norm": 0.48964366249033114, "learning_rate": 4.841781941979207e-06, "loss": 0.1952, "step": 702 }, { "epoch": 0.1583913031233278, "grad_norm": 0.49770700891913777, "learning_rate": 4.8411290932179734e-06, "loss": 0.1836, "step": 703 }, { "epoch": 0.15861661080913622, "grad_norm": 0.4773017770591324, "learning_rate": 4.840474944515017e-06, "loss": 0.1874, "step": 704 }, { "epoch": 0.15884191849494467, "grad_norm": 0.519710604598844, "learning_rate": 4.839819496233562e-06, "loss": 0.1914, "step": 705 }, { "epoch": 0.15906722618075309, "grad_norm": 0.4842922130946162, "learning_rate": 4.839162748737556e-06, "loss": 0.1822, "step": 706 }, { "epoch": 0.15929253386656153, "grad_norm": 0.5118687486652845, "learning_rate": 4.838504702391665e-06, "loss": 0.1861, "step": 707 }, { "epoch": 0.15951784155236995, "grad_norm": 0.4944719362975502, "learning_rate": 4.8378453575612785e-06, "loss": 0.1975, "step": 708 }, { "epoch": 0.1597431492381784, "grad_norm": 0.4645568912744376, "learning_rate": 4.837184714612506e-06, "loss": 0.1745, "step": 709 }, { "epoch": 0.1599684569239868, "grad_norm": 0.47279405579710293, "learning_rate": 4.836522773912178e-06, "loss": 0.1779, "step": 710 }, { "epoch": 0.16019376460979526, "grad_norm": 0.5222576281593992, "learning_rate": 4.835859535827844e-06, "loss": 0.1875, "step": 711 }, { "epoch": 0.16041907229560368, "grad_norm": 0.542641983491453, "learning_rate": 4.835195000727778e-06, "loss": 0.1942, "step": 712 }, { "epoch": 0.16064437998141212, "grad_norm": 0.5250273856847852, "learning_rate": 4.834529168980969e-06, "loss": 0.1933, "step": 713 }, { "epoch": 0.16086968766722054, "grad_norm": 0.48171719680672287, "learning_rate": 4.83386204095713e-06, "loss": 0.1853, "step": 714 }, { "epoch": 0.16109499535302899, "grad_norm": 0.4851873781747077, "learning_rate": 4.833193617026692e-06, "loss": 0.1847, "step": 715 }, { "epoch": 0.1613203030388374, "grad_norm": 0.5027455855127103, "learning_rate": 4.832523897560806e-06, "loss": 0.1756, "step": 716 }, { "epoch": 0.16154561072464585, "grad_norm": 0.4754560532712098, "learning_rate": 4.831852882931342e-06, "loss": 0.1824, "step": 717 }, { "epoch": 0.16177091841045427, "grad_norm": 0.49960393612057585, "learning_rate": 4.83118057351089e-06, "loss": 0.1909, "step": 718 }, { "epoch": 0.1619962260962627, "grad_norm": 0.4715628869762656, "learning_rate": 4.830506969672758e-06, "loss": 0.1803, "step": 719 }, { "epoch": 0.16222153378207113, "grad_norm": 0.5079866257652638, "learning_rate": 4.829832071790972e-06, "loss": 0.1946, "step": 720 }, { "epoch": 0.16244684146787958, "grad_norm": 0.4964010052003955, "learning_rate": 4.829155880240279e-06, "loss": 0.1897, "step": 721 }, { "epoch": 0.162672149153688, "grad_norm": 0.488861953463059, "learning_rate": 4.828478395396143e-06, "loss": 0.1789, "step": 722 }, { "epoch": 0.16289745683949644, "grad_norm": 0.47540675122642734, "learning_rate": 4.8277996176347465e-06, "loss": 0.1727, "step": 723 }, { "epoch": 0.16312276452530486, "grad_norm": 0.4492582480994961, "learning_rate": 4.827119547332988e-06, "loss": 0.1736, "step": 724 }, { "epoch": 0.1633480722111133, "grad_norm": 0.493647328406061, "learning_rate": 4.826438184868486e-06, "loss": 0.1814, "step": 725 }, { "epoch": 0.16357337989692172, "grad_norm": 0.4902518098783279, "learning_rate": 4.825755530619576e-06, "loss": 0.1799, "step": 726 }, { "epoch": 0.16379868758273017, "grad_norm": 0.46936919297661067, "learning_rate": 4.825071584965308e-06, "loss": 0.1826, "step": 727 }, { "epoch": 0.16402399526853859, "grad_norm": 0.5080991373094106, "learning_rate": 4.824386348285456e-06, "loss": 0.1945, "step": 728 }, { "epoch": 0.16424930295434703, "grad_norm": 0.5139713054072338, "learning_rate": 4.823699820960502e-06, "loss": 0.1903, "step": 729 }, { "epoch": 0.16447461064015545, "grad_norm": 0.47537280678853283, "learning_rate": 4.8230120033716525e-06, "loss": 0.1824, "step": 730 }, { "epoch": 0.1646999183259639, "grad_norm": 0.4648039961115506, "learning_rate": 4.822322895900825e-06, "loss": 0.1739, "step": 731 }, { "epoch": 0.1649252260117723, "grad_norm": 0.47135193169677986, "learning_rate": 4.821632498930656e-06, "loss": 0.1713, "step": 732 }, { "epoch": 0.16515053369758076, "grad_norm": 0.4664425826746682, "learning_rate": 4.820940812844496e-06, "loss": 0.1737, "step": 733 }, { "epoch": 0.16537584138338918, "grad_norm": 0.4884241535946506, "learning_rate": 4.820247838026414e-06, "loss": 0.1822, "step": 734 }, { "epoch": 0.16560114906919762, "grad_norm": 0.4570363476774046, "learning_rate": 4.819553574861192e-06, "loss": 0.1656, "step": 735 }, { "epoch": 0.16582645675500607, "grad_norm": 0.5376549865137639, "learning_rate": 4.81885802373433e-06, "loss": 0.1727, "step": 736 }, { "epoch": 0.1660517644408145, "grad_norm": 0.5108995349206131, "learning_rate": 4.818161185032039e-06, "loss": 0.1998, "step": 737 }, { "epoch": 0.16627707212662293, "grad_norm": 0.5079851929625643, "learning_rate": 4.8174630591412495e-06, "loss": 0.1982, "step": 738 }, { "epoch": 0.16650237981243135, "grad_norm": 0.46933454329148294, "learning_rate": 4.816763646449605e-06, "loss": 0.1876, "step": 739 }, { "epoch": 0.1667276874982398, "grad_norm": 0.47786743858752834, "learning_rate": 4.816062947345462e-06, "loss": 0.1806, "step": 740 }, { "epoch": 0.16695299518404821, "grad_norm": 0.5149184082938212, "learning_rate": 4.815360962217894e-06, "loss": 0.2055, "step": 741 }, { "epoch": 0.16717830286985666, "grad_norm": 0.4839519889099994, "learning_rate": 4.814657691456685e-06, "loss": 0.1807, "step": 742 }, { "epoch": 0.16740361055566508, "grad_norm": 0.5097448035834458, "learning_rate": 4.813953135452338e-06, "loss": 0.1883, "step": 743 }, { "epoch": 0.16762891824147352, "grad_norm": 0.49981444304743583, "learning_rate": 4.813247294596065e-06, "loss": 0.1873, "step": 744 }, { "epoch": 0.16785422592728194, "grad_norm": 0.49909466388052154, "learning_rate": 4.812540169279793e-06, "loss": 0.184, "step": 745 }, { "epoch": 0.1680795336130904, "grad_norm": 0.4853088099017004, "learning_rate": 4.8118317598961625e-06, "loss": 0.1828, "step": 746 }, { "epoch": 0.1683048412988988, "grad_norm": 0.512848849396602, "learning_rate": 4.811122066838527e-06, "loss": 0.1743, "step": 747 }, { "epoch": 0.16853014898470725, "grad_norm": 0.4945737090434059, "learning_rate": 4.810411090500952e-06, "loss": 0.1841, "step": 748 }, { "epoch": 0.16875545667051567, "grad_norm": 0.484533971919501, "learning_rate": 4.809698831278217e-06, "loss": 0.1681, "step": 749 }, { "epoch": 0.16898076435632411, "grad_norm": 0.5274434401849899, "learning_rate": 4.808985289565813e-06, "loss": 0.2138, "step": 750 }, { "epoch": 0.16920607204213253, "grad_norm": 0.5429817492746462, "learning_rate": 4.808270465759943e-06, "loss": 0.1932, "step": 751 }, { "epoch": 0.16943137972794098, "grad_norm": 0.4898706441354639, "learning_rate": 4.807554360257522e-06, "loss": 0.2004, "step": 752 }, { "epoch": 0.1696566874137494, "grad_norm": 0.49637968065289056, "learning_rate": 4.806836973456175e-06, "loss": 0.1822, "step": 753 }, { "epoch": 0.16988199509955784, "grad_norm": 0.5084117755419836, "learning_rate": 4.8061183057542424e-06, "loss": 0.1715, "step": 754 }, { "epoch": 0.17010730278536626, "grad_norm": 0.5109495087867058, "learning_rate": 4.8053983575507735e-06, "loss": 0.1765, "step": 755 }, { "epoch": 0.1703326104711747, "grad_norm": 0.4928451254071443, "learning_rate": 4.804677129245527e-06, "loss": 0.1859, "step": 756 }, { "epoch": 0.17055791815698312, "grad_norm": 0.46419293537472645, "learning_rate": 4.8039546212389765e-06, "loss": 0.1706, "step": 757 }, { "epoch": 0.17078322584279157, "grad_norm": 0.4830003007409314, "learning_rate": 4.803230833932302e-06, "loss": 0.1832, "step": 758 }, { "epoch": 0.1710085335286, "grad_norm": 0.46911559787991525, "learning_rate": 4.802505767727395e-06, "loss": 0.1775, "step": 759 }, { "epoch": 0.17123384121440843, "grad_norm": 0.5119510412459805, "learning_rate": 4.80177942302686e-06, "loss": 0.1921, "step": 760 }, { "epoch": 0.17145914890021685, "grad_norm": 0.4866339023029943, "learning_rate": 4.8010518002340065e-06, "loss": 0.1753, "step": 761 }, { "epoch": 0.1716844565860253, "grad_norm": 0.4615836386800967, "learning_rate": 4.800322899752859e-06, "loss": 0.1692, "step": 762 }, { "epoch": 0.17190976427183371, "grad_norm": 0.5165845262276799, "learning_rate": 4.799592721988147e-06, "loss": 0.1963, "step": 763 }, { "epoch": 0.17213507195764216, "grad_norm": 0.45977940689859065, "learning_rate": 4.798861267345312e-06, "loss": 0.1699, "step": 764 }, { "epoch": 0.17236037964345058, "grad_norm": 0.4787043266174051, "learning_rate": 4.798128536230502e-06, "loss": 0.1871, "step": 765 }, { "epoch": 0.17258568732925902, "grad_norm": 0.4888347653144028, "learning_rate": 4.797394529050577e-06, "loss": 0.1896, "step": 766 }, { "epoch": 0.17281099501506744, "grad_norm": 0.4914002260078396, "learning_rate": 4.796659246213103e-06, "loss": 0.1819, "step": 767 }, { "epoch": 0.1730363027008759, "grad_norm": 0.4731805055670691, "learning_rate": 4.795922688126355e-06, "loss": 0.1655, "step": 768 }, { "epoch": 0.1732616103866843, "grad_norm": 0.5111009862183389, "learning_rate": 4.795184855199316e-06, "loss": 0.2014, "step": 769 }, { "epoch": 0.17348691807249275, "grad_norm": 0.5137505817476531, "learning_rate": 4.794445747841679e-06, "loss": 0.1966, "step": 770 }, { "epoch": 0.17371222575830117, "grad_norm": 0.50296528263997, "learning_rate": 4.79370536646384e-06, "loss": 0.1944, "step": 771 }, { "epoch": 0.17393753344410962, "grad_norm": 0.48127568016573596, "learning_rate": 4.792963711476908e-06, "loss": 0.1809, "step": 772 }, { "epoch": 0.17416284112991803, "grad_norm": 0.4572815368441324, "learning_rate": 4.792220783292694e-06, "loss": 0.1683, "step": 773 }, { "epoch": 0.17438814881572648, "grad_norm": 0.48725892749917493, "learning_rate": 4.791476582323719e-06, "loss": 0.179, "step": 774 }, { "epoch": 0.1746134565015349, "grad_norm": 0.49300456728333963, "learning_rate": 4.790731108983211e-06, "loss": 0.184, "step": 775 }, { "epoch": 0.17483876418734334, "grad_norm": 0.4787596622664503, "learning_rate": 4.7899843636851014e-06, "loss": 0.1717, "step": 776 }, { "epoch": 0.17506407187315176, "grad_norm": 0.48381195789429615, "learning_rate": 4.789236346844034e-06, "loss": 0.1835, "step": 777 }, { "epoch": 0.1752893795589602, "grad_norm": 0.4933319234608218, "learning_rate": 4.78848705887535e-06, "loss": 0.184, "step": 778 }, { "epoch": 0.17551468724476862, "grad_norm": 0.5095035300039361, "learning_rate": 4.7877365001951045e-06, "loss": 0.1809, "step": 779 }, { "epoch": 0.17573999493057707, "grad_norm": 0.4940075922499647, "learning_rate": 4.786984671220053e-06, "loss": 0.1647, "step": 780 }, { "epoch": 0.1759653026163855, "grad_norm": 0.5158596908756363, "learning_rate": 4.786231572367659e-06, "loss": 0.1945, "step": 781 }, { "epoch": 0.17619061030219393, "grad_norm": 0.462830581703408, "learning_rate": 4.785477204056089e-06, "loss": 0.1778, "step": 782 }, { "epoch": 0.17641591798800235, "grad_norm": 0.5339185435557559, "learning_rate": 4.784721566704217e-06, "loss": 0.187, "step": 783 }, { "epoch": 0.1766412256738108, "grad_norm": 0.5257175194638931, "learning_rate": 4.78396466073162e-06, "loss": 0.1816, "step": 784 }, { "epoch": 0.17686653335961924, "grad_norm": 0.5028416895616926, "learning_rate": 4.7832064865585795e-06, "loss": 0.1794, "step": 785 }, { "epoch": 0.17709184104542766, "grad_norm": 0.47469854396033484, "learning_rate": 4.78244704460608e-06, "loss": 0.1762, "step": 786 }, { "epoch": 0.1773171487312361, "grad_norm": 0.5164293026713752, "learning_rate": 4.781686335295813e-06, "loss": 0.1823, "step": 787 }, { "epoch": 0.17754245641704453, "grad_norm": 0.5341407689751599, "learning_rate": 4.7809243590501725e-06, "loss": 0.1819, "step": 788 }, { "epoch": 0.17776776410285297, "grad_norm": 0.5041897167760184, "learning_rate": 4.780161116292254e-06, "loss": 0.1838, "step": 789 }, { "epoch": 0.1779930717886614, "grad_norm": 0.4777200934140291, "learning_rate": 4.779396607445858e-06, "loss": 0.1859, "step": 790 }, { "epoch": 0.17821837947446983, "grad_norm": 0.5105523444886348, "learning_rate": 4.778630832935489e-06, "loss": 0.18, "step": 791 }, { "epoch": 0.17844368716027825, "grad_norm": 0.5049092473527552, "learning_rate": 4.777863793186351e-06, "loss": 0.1772, "step": 792 }, { "epoch": 0.1786689948460867, "grad_norm": 0.47907869755384375, "learning_rate": 4.777095488624355e-06, "loss": 0.1684, "step": 793 }, { "epoch": 0.17889430253189512, "grad_norm": 0.5311995304246812, "learning_rate": 4.776325919676109e-06, "loss": 0.1981, "step": 794 }, { "epoch": 0.17911961021770356, "grad_norm": 0.48060992323771873, "learning_rate": 4.775555086768929e-06, "loss": 0.1708, "step": 795 }, { "epoch": 0.17934491790351198, "grad_norm": 0.4675293098501905, "learning_rate": 4.774782990330828e-06, "loss": 0.1703, "step": 796 }, { "epoch": 0.17957022558932043, "grad_norm": 0.4930964788341703, "learning_rate": 4.774009630790522e-06, "loss": 0.1921, "step": 797 }, { "epoch": 0.17979553327512884, "grad_norm": 0.46495600709912344, "learning_rate": 4.77323500857743e-06, "loss": 0.1599, "step": 798 }, { "epoch": 0.1800208409609373, "grad_norm": 0.47676582472244317, "learning_rate": 4.77245912412167e-06, "loss": 0.1834, "step": 799 }, { "epoch": 0.1802461486467457, "grad_norm": 0.4895979712857663, "learning_rate": 4.771681977854062e-06, "loss": 0.1761, "step": 800 }, { "epoch": 0.18047145633255415, "grad_norm": 0.4932387130749087, "learning_rate": 4.7709035702061275e-06, "loss": 0.1781, "step": 801 }, { "epoch": 0.18069676401836257, "grad_norm": 0.4779913850319623, "learning_rate": 4.770123901610085e-06, "loss": 0.1824, "step": 802 }, { "epoch": 0.18092207170417102, "grad_norm": 0.48771377451063314, "learning_rate": 4.7693429724988565e-06, "loss": 0.1819, "step": 803 }, { "epoch": 0.18114737938997943, "grad_norm": 0.48970848842547954, "learning_rate": 4.768560783306064e-06, "loss": 0.1903, "step": 804 }, { "epoch": 0.18137268707578788, "grad_norm": 0.45068406552687224, "learning_rate": 4.767777334466025e-06, "loss": 0.1704, "step": 805 }, { "epoch": 0.1815979947615963, "grad_norm": 0.5051420899177393, "learning_rate": 4.7669926264137625e-06, "loss": 0.1837, "step": 806 }, { "epoch": 0.18182330244740474, "grad_norm": 0.48811515671301603, "learning_rate": 4.766206659584994e-06, "loss": 0.1641, "step": 807 }, { "epoch": 0.18204861013321316, "grad_norm": 0.4851157603919634, "learning_rate": 4.765419434416138e-06, "loss": 0.1825, "step": 808 }, { "epoch": 0.1822739178190216, "grad_norm": 0.5082718059063132, "learning_rate": 4.7646309513443115e-06, "loss": 0.1985, "step": 809 }, { "epoch": 0.18249922550483003, "grad_norm": 0.4791172574916238, "learning_rate": 4.763841210807329e-06, "loss": 0.1722, "step": 810 }, { "epoch": 0.18272453319063847, "grad_norm": 0.4901673009180037, "learning_rate": 4.763050213243705e-06, "loss": 0.1835, "step": 811 }, { "epoch": 0.1829498408764469, "grad_norm": 0.49925031809883547, "learning_rate": 4.762257959092651e-06, "loss": 0.1934, "step": 812 }, { "epoch": 0.18317514856225534, "grad_norm": 0.46501080227779795, "learning_rate": 4.7614644487940755e-06, "loss": 0.1817, "step": 813 }, { "epoch": 0.18340045624806375, "grad_norm": 0.483217674981779, "learning_rate": 4.760669682788584e-06, "loss": 0.1872, "step": 814 }, { "epoch": 0.1836257639338722, "grad_norm": 0.5447110598554856, "learning_rate": 4.759873661517484e-06, "loss": 0.1946, "step": 815 }, { "epoch": 0.18385107161968062, "grad_norm": 0.4963607265926464, "learning_rate": 4.759076385422773e-06, "loss": 0.1673, "step": 816 }, { "epoch": 0.18407637930548906, "grad_norm": 0.49715295354011035, "learning_rate": 4.75827785494715e-06, "loss": 0.1822, "step": 817 }, { "epoch": 0.18430168699129748, "grad_norm": 0.4468255135593684, "learning_rate": 4.7574780705340094e-06, "loss": 0.1664, "step": 818 }, { "epoch": 0.18452699467710593, "grad_norm": 0.5020265233870463, "learning_rate": 4.756677032627442e-06, "loss": 0.189, "step": 819 }, { "epoch": 0.18475230236291434, "grad_norm": 0.49870372725644924, "learning_rate": 4.755874741672233e-06, "loss": 0.1778, "step": 820 }, { "epoch": 0.1849776100487228, "grad_norm": 0.457171013269724, "learning_rate": 4.755071198113865e-06, "loss": 0.1725, "step": 821 }, { "epoch": 0.1852029177345312, "grad_norm": 0.5092879280720725, "learning_rate": 4.754266402398517e-06, "loss": 0.1813, "step": 822 }, { "epoch": 0.18542822542033965, "grad_norm": 0.5166784139586021, "learning_rate": 4.753460354973061e-06, "loss": 0.1866, "step": 823 }, { "epoch": 0.18565353310614807, "grad_norm": 0.4861158761653109, "learning_rate": 4.752653056285066e-06, "loss": 0.1717, "step": 824 }, { "epoch": 0.18587884079195652, "grad_norm": 0.46905214327282685, "learning_rate": 4.751844506782793e-06, "loss": 0.1862, "step": 825 }, { "epoch": 0.18610414847776494, "grad_norm": 0.44862810839169803, "learning_rate": 4.7510347069152015e-06, "loss": 0.1644, "step": 826 }, { "epoch": 0.18632945616357338, "grad_norm": 0.4975474972517688, "learning_rate": 4.750223657131942e-06, "loss": 0.1822, "step": 827 }, { "epoch": 0.1865547638493818, "grad_norm": 0.4765117099823277, "learning_rate": 4.74941135788336e-06, "loss": 0.1779, "step": 828 }, { "epoch": 0.18678007153519025, "grad_norm": 0.45414080502387294, "learning_rate": 4.748597809620496e-06, "loss": 0.1569, "step": 829 }, { "epoch": 0.18700537922099866, "grad_norm": 0.5041355859519643, "learning_rate": 4.747783012795083e-06, "loss": 0.1986, "step": 830 }, { "epoch": 0.1872306869068071, "grad_norm": 0.5051258971770674, "learning_rate": 4.746966967859547e-06, "loss": 0.1951, "step": 831 }, { "epoch": 0.18745599459261553, "grad_norm": 0.49081074092309795, "learning_rate": 4.746149675267005e-06, "loss": 0.1804, "step": 832 }, { "epoch": 0.18768130227842397, "grad_norm": 0.4441868113966922, "learning_rate": 4.745331135471274e-06, "loss": 0.1585, "step": 833 }, { "epoch": 0.18790660996423242, "grad_norm": 0.46703122083044185, "learning_rate": 4.744511348926855e-06, "loss": 0.176, "step": 834 }, { "epoch": 0.18813191765004084, "grad_norm": 0.5208426656798611, "learning_rate": 4.743690316088945e-06, "loss": 0.1815, "step": 835 }, { "epoch": 0.18835722533584928, "grad_norm": 0.5164756735393626, "learning_rate": 4.742868037413435e-06, "loss": 0.1972, "step": 836 }, { "epoch": 0.1885825330216577, "grad_norm": 0.49649766308699994, "learning_rate": 4.742044513356904e-06, "loss": 0.1754, "step": 837 }, { "epoch": 0.18880784070746615, "grad_norm": 0.49273564586087437, "learning_rate": 4.741219744376624e-06, "loss": 0.1712, "step": 838 }, { "epoch": 0.18903314839327456, "grad_norm": 0.5486282244513546, "learning_rate": 4.74039373093056e-06, "loss": 0.2004, "step": 839 }, { "epoch": 0.189258456079083, "grad_norm": 0.4748075709655306, "learning_rate": 4.739566473477365e-06, "loss": 0.1686, "step": 840 }, { "epoch": 0.18948376376489143, "grad_norm": 0.48522772612578585, "learning_rate": 4.738737972476385e-06, "loss": 0.1665, "step": 841 }, { "epoch": 0.18970907145069987, "grad_norm": 0.5329091938352456, "learning_rate": 4.737908228387656e-06, "loss": 0.1864, "step": 842 }, { "epoch": 0.1899343791365083, "grad_norm": 0.5047847276742794, "learning_rate": 4.737077241671904e-06, "loss": 0.1929, "step": 843 }, { "epoch": 0.19015968682231674, "grad_norm": 0.46222373907325087, "learning_rate": 4.736245012790543e-06, "loss": 0.1741, "step": 844 }, { "epoch": 0.19038499450812515, "grad_norm": 0.5185005587102809, "learning_rate": 4.735411542205681e-06, "loss": 0.1912, "step": 845 }, { "epoch": 0.1906103021939336, "grad_norm": 0.5116459064135198, "learning_rate": 4.734576830380113e-06, "loss": 0.176, "step": 846 }, { "epoch": 0.19083560987974202, "grad_norm": 0.4666206667887288, "learning_rate": 4.733740877777322e-06, "loss": 0.1706, "step": 847 }, { "epoch": 0.19106091756555046, "grad_norm": 0.4967074435816717, "learning_rate": 4.732903684861482e-06, "loss": 0.1873, "step": 848 }, { "epoch": 0.19128622525135888, "grad_norm": 0.5365788421064641, "learning_rate": 4.732065252097455e-06, "loss": 0.1847, "step": 849 }, { "epoch": 0.19151153293716733, "grad_norm": 0.4977595692595492, "learning_rate": 4.731225579950791e-06, "loss": 0.1698, "step": 850 }, { "epoch": 0.19173684062297575, "grad_norm": 0.5220653425157609, "learning_rate": 4.730384668887731e-06, "loss": 0.1858, "step": 851 }, { "epoch": 0.1919621483087842, "grad_norm": 0.48911620839265174, "learning_rate": 4.7295425193751974e-06, "loss": 0.1704, "step": 852 }, { "epoch": 0.1921874559945926, "grad_norm": 0.5340840544161292, "learning_rate": 4.728699131880808e-06, "loss": 0.1962, "step": 853 }, { "epoch": 0.19241276368040106, "grad_norm": 0.4885150418015092, "learning_rate": 4.727854506872863e-06, "loss": 0.1618, "step": 854 }, { "epoch": 0.19263807136620947, "grad_norm": 0.549170399664639, "learning_rate": 4.727008644820351e-06, "loss": 0.1914, "step": 855 }, { "epoch": 0.19286337905201792, "grad_norm": 0.4526719253769709, "learning_rate": 4.726161546192949e-06, "loss": 0.169, "step": 856 }, { "epoch": 0.19308868673782634, "grad_norm": 0.5149778393372377, "learning_rate": 4.725313211461018e-06, "loss": 0.1922, "step": 857 }, { "epoch": 0.19331399442363478, "grad_norm": 0.5379703316294756, "learning_rate": 4.724463641095606e-06, "loss": 0.1789, "step": 858 }, { "epoch": 0.1935393021094432, "grad_norm": 0.4923803644073318, "learning_rate": 4.72361283556845e-06, "loss": 0.1811, "step": 859 }, { "epoch": 0.19376460979525165, "grad_norm": 0.48133842551153155, "learning_rate": 4.7227607953519686e-06, "loss": 0.1743, "step": 860 }, { "epoch": 0.19398991748106006, "grad_norm": 0.5108863471576068, "learning_rate": 4.7219075209192686e-06, "loss": 0.1844, "step": 861 }, { "epoch": 0.1942152251668685, "grad_norm": 0.472380424434701, "learning_rate": 4.721053012744142e-06, "loss": 0.169, "step": 862 }, { "epoch": 0.19444053285267693, "grad_norm": 0.4919449085715017, "learning_rate": 4.720197271301064e-06, "loss": 0.1816, "step": 863 }, { "epoch": 0.19466584053848537, "grad_norm": 0.4749560870382914, "learning_rate": 4.719340297065198e-06, "loss": 0.1737, "step": 864 }, { "epoch": 0.1948911482242938, "grad_norm": 0.5267170058766248, "learning_rate": 4.718482090512389e-06, "loss": 0.187, "step": 865 }, { "epoch": 0.19511645591010224, "grad_norm": 0.4582779313083357, "learning_rate": 4.717622652119166e-06, "loss": 0.1596, "step": 866 }, { "epoch": 0.19534176359591066, "grad_norm": 0.4770738218995198, "learning_rate": 4.716761982362744e-06, "loss": 0.1872, "step": 867 }, { "epoch": 0.1955670712817191, "grad_norm": 0.4935886966800019, "learning_rate": 4.715900081721021e-06, "loss": 0.1841, "step": 868 }, { "epoch": 0.19579237896752752, "grad_norm": 0.5217854576232532, "learning_rate": 4.715036950672578e-06, "loss": 0.175, "step": 869 }, { "epoch": 0.19601768665333597, "grad_norm": 0.4785621830298838, "learning_rate": 4.71417258969668e-06, "loss": 0.1786, "step": 870 }, { "epoch": 0.19624299433914438, "grad_norm": 0.46485530569258604, "learning_rate": 4.713306999273273e-06, "loss": 0.1761, "step": 871 }, { "epoch": 0.19646830202495283, "grad_norm": 0.44349726343989515, "learning_rate": 4.712440179882989e-06, "loss": 0.1579, "step": 872 }, { "epoch": 0.19669360971076125, "grad_norm": 0.5228158669046902, "learning_rate": 4.711572132007139e-06, "loss": 0.184, "step": 873 }, { "epoch": 0.1969189173965697, "grad_norm": 0.4669706169996906, "learning_rate": 4.710702856127718e-06, "loss": 0.1706, "step": 874 }, { "epoch": 0.1971442250823781, "grad_norm": 0.47828996129737333, "learning_rate": 4.709832352727404e-06, "loss": 0.1643, "step": 875 }, { "epoch": 0.19736953276818656, "grad_norm": 0.49478309332034437, "learning_rate": 4.708960622289552e-06, "loss": 0.1975, "step": 876 }, { "epoch": 0.19759484045399497, "grad_norm": 0.5005389974519691, "learning_rate": 4.708087665298204e-06, "loss": 0.1914, "step": 877 }, { "epoch": 0.19782014813980342, "grad_norm": 0.4596868416283332, "learning_rate": 4.70721348223808e-06, "loss": 0.1817, "step": 878 }, { "epoch": 0.19804545582561184, "grad_norm": 0.45406849764461193, "learning_rate": 4.706338073594581e-06, "loss": 0.171, "step": 879 }, { "epoch": 0.19827076351142028, "grad_norm": 0.4994377731966007, "learning_rate": 4.705461439853789e-06, "loss": 0.1893, "step": 880 }, { "epoch": 0.1984960711972287, "grad_norm": 0.471170874746787, "learning_rate": 4.704583581502465e-06, "loss": 0.1765, "step": 881 }, { "epoch": 0.19872137888303715, "grad_norm": 0.5304815833710694, "learning_rate": 4.703704499028052e-06, "loss": 0.2032, "step": 882 }, { "epoch": 0.1989466865688456, "grad_norm": 0.47495896919317876, "learning_rate": 4.702824192918672e-06, "loss": 0.1711, "step": 883 }, { "epoch": 0.199171994254654, "grad_norm": 0.48490606250503726, "learning_rate": 4.701942663663126e-06, "loss": 0.1733, "step": 884 }, { "epoch": 0.19939730194046246, "grad_norm": 0.48968905810940944, "learning_rate": 4.7010599117508936e-06, "loss": 0.1752, "step": 885 }, { "epoch": 0.19962260962627087, "grad_norm": 0.47967576032175524, "learning_rate": 4.700175937672134e-06, "loss": 0.1873, "step": 886 }, { "epoch": 0.19984791731207932, "grad_norm": 0.5026032086154385, "learning_rate": 4.699290741917686e-06, "loss": 0.173, "step": 887 }, { "epoch": 0.20007322499788774, "grad_norm": 0.4879917529677437, "learning_rate": 4.698404324979066e-06, "loss": 0.1778, "step": 888 }, { "epoch": 0.20029853268369618, "grad_norm": 0.476247466553313, "learning_rate": 4.697516687348466e-06, "loss": 0.1812, "step": 889 }, { "epoch": 0.2005238403695046, "grad_norm": 0.5052080033610629, "learning_rate": 4.696627829518761e-06, "loss": 0.1901, "step": 890 }, { "epoch": 0.20074914805531305, "grad_norm": 0.48885147290925035, "learning_rate": 4.695737751983499e-06, "loss": 0.1822, "step": 891 }, { "epoch": 0.20097445574112147, "grad_norm": 0.49482722093377984, "learning_rate": 4.6948464552369075e-06, "loss": 0.1757, "step": 892 }, { "epoch": 0.2011997634269299, "grad_norm": 0.4734093310501187, "learning_rate": 4.69395393977389e-06, "loss": 0.1755, "step": 893 }, { "epoch": 0.20142507111273833, "grad_norm": 0.4708301078264215, "learning_rate": 4.693060206090028e-06, "loss": 0.1754, "step": 894 }, { "epoch": 0.20165037879854678, "grad_norm": 0.4783063902662343, "learning_rate": 4.692165254681576e-06, "loss": 0.1833, "step": 895 }, { "epoch": 0.2018756864843552, "grad_norm": 0.4884538782786743, "learning_rate": 4.69126908604547e-06, "loss": 0.1793, "step": 896 }, { "epoch": 0.20210099417016364, "grad_norm": 0.5113830943043448, "learning_rate": 4.690371700679317e-06, "loss": 0.1871, "step": 897 }, { "epoch": 0.20232630185597206, "grad_norm": 0.48752649520933233, "learning_rate": 4.689473099081403e-06, "loss": 0.1882, "step": 898 }, { "epoch": 0.2025516095417805, "grad_norm": 0.47456621778022884, "learning_rate": 4.688573281750688e-06, "loss": 0.1612, "step": 899 }, { "epoch": 0.20277691722758892, "grad_norm": 0.5307489124130947, "learning_rate": 4.687672249186805e-06, "loss": 0.1729, "step": 900 }, { "epoch": 0.20300222491339737, "grad_norm": 0.4664993822416109, "learning_rate": 4.686770001890067e-06, "loss": 0.163, "step": 901 }, { "epoch": 0.20322753259920578, "grad_norm": 0.5132544106040071, "learning_rate": 4.685866540361456e-06, "loss": 0.1893, "step": 902 }, { "epoch": 0.20345284028501423, "grad_norm": 0.5141748664372958, "learning_rate": 4.684961865102631e-06, "loss": 0.1643, "step": 903 }, { "epoch": 0.20367814797082265, "grad_norm": 0.4955830704478595, "learning_rate": 4.684055976615924e-06, "loss": 0.1639, "step": 904 }, { "epoch": 0.2039034556566311, "grad_norm": 0.48236349752009133, "learning_rate": 4.683148875404343e-06, "loss": 0.1797, "step": 905 }, { "epoch": 0.2041287633424395, "grad_norm": 0.5427867503596624, "learning_rate": 4.682240561971565e-06, "loss": 0.1779, "step": 906 }, { "epoch": 0.20435407102824796, "grad_norm": 0.5251684159747709, "learning_rate": 4.681331036821945e-06, "loss": 0.1873, "step": 907 }, { "epoch": 0.20457937871405638, "grad_norm": 0.5375306958337258, "learning_rate": 4.680420300460505e-06, "loss": 0.1949, "step": 908 }, { "epoch": 0.20480468639986482, "grad_norm": 0.46761640654985387, "learning_rate": 4.679508353392946e-06, "loss": 0.1653, "step": 909 }, { "epoch": 0.20502999408567324, "grad_norm": 0.46505944697352847, "learning_rate": 4.678595196125638e-06, "loss": 0.166, "step": 910 }, { "epoch": 0.20525530177148169, "grad_norm": 0.47542350967948827, "learning_rate": 4.677680829165623e-06, "loss": 0.1734, "step": 911 }, { "epoch": 0.2054806094572901, "grad_norm": 0.5038195396548182, "learning_rate": 4.676765253020613e-06, "loss": 0.1685, "step": 912 }, { "epoch": 0.20570591714309855, "grad_norm": 0.5456800961270842, "learning_rate": 4.675848468198995e-06, "loss": 0.1952, "step": 913 }, { "epoch": 0.20593122482890697, "grad_norm": 0.47717326012790684, "learning_rate": 4.674930475209827e-06, "loss": 0.1717, "step": 914 }, { "epoch": 0.2061565325147154, "grad_norm": 0.48833151208415465, "learning_rate": 4.674011274562833e-06, "loss": 0.1624, "step": 915 }, { "epoch": 0.20638184020052383, "grad_norm": 0.5108677291834829, "learning_rate": 4.673090866768412e-06, "loss": 0.1791, "step": 916 }, { "epoch": 0.20660714788633228, "grad_norm": 0.4859524561550817, "learning_rate": 4.672169252337633e-06, "loss": 0.165, "step": 917 }, { "epoch": 0.2068324555721407, "grad_norm": 0.47506801577016344, "learning_rate": 4.671246431782234e-06, "loss": 0.1787, "step": 918 }, { "epoch": 0.20705776325794914, "grad_norm": 0.4716251215451779, "learning_rate": 4.670322405614621e-06, "loss": 0.1676, "step": 919 }, { "epoch": 0.20728307094375756, "grad_norm": 0.4888915064606123, "learning_rate": 4.669397174347874e-06, "loss": 0.1956, "step": 920 }, { "epoch": 0.207508378629566, "grad_norm": 0.5008862880596165, "learning_rate": 4.668470738495738e-06, "loss": 0.1841, "step": 921 }, { "epoch": 0.20773368631537442, "grad_norm": 0.4634482781420155, "learning_rate": 4.667543098572627e-06, "loss": 0.1839, "step": 922 }, { "epoch": 0.20795899400118287, "grad_norm": 0.47429225006707093, "learning_rate": 4.6666142550936286e-06, "loss": 0.1634, "step": 923 }, { "epoch": 0.20818430168699129, "grad_norm": 0.4661754900600514, "learning_rate": 4.665684208574492e-06, "loss": 0.1853, "step": 924 }, { "epoch": 0.20840960937279973, "grad_norm": 0.46752723808840096, "learning_rate": 4.664752959531638e-06, "loss": 0.1731, "step": 925 }, { "epoch": 0.20863491705860815, "grad_norm": 0.46142532792339114, "learning_rate": 4.6638205084821544e-06, "loss": 0.1707, "step": 926 }, { "epoch": 0.2088602247444166, "grad_norm": 0.4891328446858097, "learning_rate": 4.6628868559437964e-06, "loss": 0.1763, "step": 927 }, { "epoch": 0.209085532430225, "grad_norm": 0.5031128198962433, "learning_rate": 4.661952002434988e-06, "loss": 0.2023, "step": 928 }, { "epoch": 0.20931084011603346, "grad_norm": 0.48714514401150427, "learning_rate": 4.661015948474815e-06, "loss": 0.1839, "step": 929 }, { "epoch": 0.20953614780184188, "grad_norm": 0.4746123845118674, "learning_rate": 4.660078694583037e-06, "loss": 0.1782, "step": 930 }, { "epoch": 0.20976145548765032, "grad_norm": 0.4664763084833289, "learning_rate": 4.659140241280075e-06, "loss": 0.1757, "step": 931 }, { "epoch": 0.20998676317345877, "grad_norm": 0.4746618538561004, "learning_rate": 4.658200589087016e-06, "loss": 0.1776, "step": 932 }, { "epoch": 0.21021207085926719, "grad_norm": 0.49077310248388495, "learning_rate": 4.657259738525615e-06, "loss": 0.1899, "step": 933 }, { "epoch": 0.21043737854507563, "grad_norm": 0.4451674520587689, "learning_rate": 4.656317690118291e-06, "loss": 0.1683, "step": 934 }, { "epoch": 0.21066268623088405, "grad_norm": 0.44999286347626516, "learning_rate": 4.655374444388127e-06, "loss": 0.1715, "step": 935 }, { "epoch": 0.2108879939166925, "grad_norm": 0.5160001217026832, "learning_rate": 4.654430001858874e-06, "loss": 0.1885, "step": 936 }, { "epoch": 0.2111133016025009, "grad_norm": 0.4842525872624328, "learning_rate": 4.653484363054947e-06, "loss": 0.1876, "step": 937 }, { "epoch": 0.21133860928830936, "grad_norm": 0.49958988628695056, "learning_rate": 4.6525375285014195e-06, "loss": 0.1904, "step": 938 }, { "epoch": 0.21156391697411778, "grad_norm": 0.4504957704587616, "learning_rate": 4.651589498724037e-06, "loss": 0.1727, "step": 939 }, { "epoch": 0.21178922465992622, "grad_norm": 0.5015059809260454, "learning_rate": 4.650640274249205e-06, "loss": 0.1941, "step": 940 }, { "epoch": 0.21201453234573464, "grad_norm": 0.476504641954251, "learning_rate": 4.649689855603992e-06, "loss": 0.175, "step": 941 }, { "epoch": 0.2122398400315431, "grad_norm": 0.46840211368639195, "learning_rate": 4.648738243316128e-06, "loss": 0.1796, "step": 942 }, { "epoch": 0.2124651477173515, "grad_norm": 0.4652139525522428, "learning_rate": 4.647785437914011e-06, "loss": 0.1748, "step": 943 }, { "epoch": 0.21269045540315995, "grad_norm": 0.45886845605683524, "learning_rate": 4.646831439926696e-06, "loss": 0.1675, "step": 944 }, { "epoch": 0.21291576308896837, "grad_norm": 0.47682867214434344, "learning_rate": 4.645876249883903e-06, "loss": 0.1786, "step": 945 }, { "epoch": 0.21314107077477681, "grad_norm": 0.4452826345255325, "learning_rate": 4.644919868316014e-06, "loss": 0.1556, "step": 946 }, { "epoch": 0.21336637846058523, "grad_norm": 0.5027037628671592, "learning_rate": 4.643962295754073e-06, "loss": 0.2023, "step": 947 }, { "epoch": 0.21359168614639368, "grad_norm": 0.4838475880709884, "learning_rate": 4.643003532729783e-06, "loss": 0.1697, "step": 948 }, { "epoch": 0.2138169938322021, "grad_norm": 0.4792563042333324, "learning_rate": 4.642043579775509e-06, "loss": 0.1774, "step": 949 }, { "epoch": 0.21404230151801054, "grad_norm": 0.48434516547216877, "learning_rate": 4.641082437424277e-06, "loss": 0.1788, "step": 950 }, { "epoch": 0.21426760920381896, "grad_norm": 0.4661730340891147, "learning_rate": 4.640120106209776e-06, "loss": 0.1753, "step": 951 }, { "epoch": 0.2144929168896274, "grad_norm": 0.5180843258347066, "learning_rate": 4.639156586666349e-06, "loss": 0.1857, "step": 952 }, { "epoch": 0.21471822457543582, "grad_norm": 0.514047193034028, "learning_rate": 4.638191879329005e-06, "loss": 0.1944, "step": 953 }, { "epoch": 0.21494353226124427, "grad_norm": 0.4694837100451456, "learning_rate": 4.63722598473341e-06, "loss": 0.1686, "step": 954 }, { "epoch": 0.2151688399470527, "grad_norm": 0.49442445420102776, "learning_rate": 4.636258903415888e-06, "loss": 0.1755, "step": 955 }, { "epoch": 0.21539414763286113, "grad_norm": 0.48350627421275655, "learning_rate": 4.635290635913425e-06, "loss": 0.1772, "step": 956 }, { "epoch": 0.21561945531866955, "grad_norm": 0.4694859909948316, "learning_rate": 4.63432118276366e-06, "loss": 0.1923, "step": 957 }, { "epoch": 0.215844763004478, "grad_norm": 0.5416110255621837, "learning_rate": 4.633350544504899e-06, "loss": 0.179, "step": 958 }, { "epoch": 0.21607007069028641, "grad_norm": 0.4465924442555772, "learning_rate": 4.632378721676098e-06, "loss": 0.1742, "step": 959 }, { "epoch": 0.21629537837609486, "grad_norm": 0.458481081744707, "learning_rate": 4.6314057148168765e-06, "loss": 0.1745, "step": 960 }, { "epoch": 0.21652068606190328, "grad_norm": 0.4547664974315061, "learning_rate": 4.6304315244675065e-06, "loss": 0.1774, "step": 961 }, { "epoch": 0.21674599374771172, "grad_norm": 0.5000337392694555, "learning_rate": 4.629456151168921e-06, "loss": 0.1848, "step": 962 }, { "epoch": 0.21697130143352014, "grad_norm": 0.45255086182737697, "learning_rate": 4.628479595462708e-06, "loss": 0.1767, "step": 963 }, { "epoch": 0.2171966091193286, "grad_norm": 0.5050866405244006, "learning_rate": 4.627501857891113e-06, "loss": 0.1794, "step": 964 }, { "epoch": 0.217421916805137, "grad_norm": 0.5046057153343321, "learning_rate": 4.626522938997037e-06, "loss": 0.1901, "step": 965 }, { "epoch": 0.21764722449094545, "grad_norm": 0.49654858884976455, "learning_rate": 4.625542839324036e-06, "loss": 0.1896, "step": 966 }, { "epoch": 0.21787253217675387, "grad_norm": 0.4619603328007907, "learning_rate": 4.624561559416324e-06, "loss": 0.1722, "step": 967 }, { "epoch": 0.21809783986256231, "grad_norm": 0.4714237924999947, "learning_rate": 4.623579099818769e-06, "loss": 0.1811, "step": 968 }, { "epoch": 0.21832314754837073, "grad_norm": 0.4727906799742656, "learning_rate": 4.6225954610768945e-06, "loss": 0.1641, "step": 969 }, { "epoch": 0.21854845523417918, "grad_norm": 0.4988859251827525, "learning_rate": 4.621610643736878e-06, "loss": 0.1807, "step": 970 }, { "epoch": 0.2187737629199876, "grad_norm": 0.5060976705176674, "learning_rate": 4.620624648345552e-06, "loss": 0.1749, "step": 971 }, { "epoch": 0.21899907060579604, "grad_norm": 0.47793522949791656, "learning_rate": 4.6196374754504024e-06, "loss": 0.1826, "step": 972 }, { "epoch": 0.21922437829160446, "grad_norm": 0.5033809177523444, "learning_rate": 4.61864912559957e-06, "loss": 0.1944, "step": 973 }, { "epoch": 0.2194496859774129, "grad_norm": 0.45111023806215533, "learning_rate": 4.617659599341849e-06, "loss": 0.1621, "step": 974 }, { "epoch": 0.21967499366322132, "grad_norm": 0.548417164765913, "learning_rate": 4.616668897226686e-06, "loss": 0.1767, "step": 975 }, { "epoch": 0.21990030134902977, "grad_norm": 0.5226621764784928, "learning_rate": 4.615677019804182e-06, "loss": 0.1796, "step": 976 }, { "epoch": 0.2201256090348382, "grad_norm": 0.46896809226111524, "learning_rate": 4.6146839676250875e-06, "loss": 0.1679, "step": 977 }, { "epoch": 0.22035091672064663, "grad_norm": 0.4889116397075663, "learning_rate": 4.6136897412408084e-06, "loss": 0.1894, "step": 978 }, { "epoch": 0.22057622440645505, "grad_norm": 0.45531634545026695, "learning_rate": 4.612694341203403e-06, "loss": 0.1757, "step": 979 }, { "epoch": 0.2208015320922635, "grad_norm": 0.5139339558980113, "learning_rate": 4.611697768065577e-06, "loss": 0.1963, "step": 980 }, { "epoch": 0.22102683977807194, "grad_norm": 0.461006150958274, "learning_rate": 4.610700022380692e-06, "loss": 0.174, "step": 981 }, { "epoch": 0.22125214746388036, "grad_norm": 0.4468605106113933, "learning_rate": 4.609701104702759e-06, "loss": 0.1587, "step": 982 }, { "epoch": 0.2214774551496888, "grad_norm": 0.43419083405156583, "learning_rate": 4.6087010155864394e-06, "loss": 0.1551, "step": 983 }, { "epoch": 0.22170276283549722, "grad_norm": 0.48688751814505576, "learning_rate": 4.607699755587046e-06, "loss": 0.1872, "step": 984 }, { "epoch": 0.22192807052130567, "grad_norm": 0.49965055488304555, "learning_rate": 4.60669732526054e-06, "loss": 0.1955, "step": 985 }, { "epoch": 0.2221533782071141, "grad_norm": 0.4834893714431329, "learning_rate": 4.605693725163536e-06, "loss": 0.1856, "step": 986 }, { "epoch": 0.22237868589292253, "grad_norm": 0.4990375863342495, "learning_rate": 4.6046889558532925e-06, "loss": 0.18, "step": 987 }, { "epoch": 0.22260399357873095, "grad_norm": 0.484244316222459, "learning_rate": 4.603683017887722e-06, "loss": 0.1725, "step": 988 }, { "epoch": 0.2228293012645394, "grad_norm": 0.467849362229871, "learning_rate": 4.602675911825386e-06, "loss": 0.172, "step": 989 }, { "epoch": 0.22305460895034782, "grad_norm": 0.46897024187918585, "learning_rate": 4.6016676382254895e-06, "loss": 0.1688, "step": 990 }, { "epoch": 0.22327991663615626, "grad_norm": 0.4773396813265902, "learning_rate": 4.600658197647892e-06, "loss": 0.1685, "step": 991 }, { "epoch": 0.22350522432196468, "grad_norm": 0.5064121299266291, "learning_rate": 4.5996475906530955e-06, "loss": 0.1806, "step": 992 }, { "epoch": 0.22373053200777313, "grad_norm": 0.47765732789338444, "learning_rate": 4.598635817802256e-06, "loss": 0.1718, "step": 993 }, { "epoch": 0.22395583969358154, "grad_norm": 0.5291926414513833, "learning_rate": 4.597622879657171e-06, "loss": 0.1725, "step": 994 }, { "epoch": 0.22418114737939, "grad_norm": 0.4797571989600491, "learning_rate": 4.596608776780287e-06, "loss": 0.1706, "step": 995 }, { "epoch": 0.2244064550651984, "grad_norm": 0.4864708362280325, "learning_rate": 4.595593509734699e-06, "loss": 0.1766, "step": 996 }, { "epoch": 0.22463176275100685, "grad_norm": 0.45907950569959, "learning_rate": 4.594577079084146e-06, "loss": 0.1674, "step": 997 }, { "epoch": 0.22485707043681527, "grad_norm": 0.5055242979755723, "learning_rate": 4.593559485393015e-06, "loss": 0.1752, "step": 998 }, { "epoch": 0.22508237812262372, "grad_norm": 0.5001209119984309, "learning_rate": 4.592540729226336e-06, "loss": 0.1821, "step": 999 }, { "epoch": 0.22530768580843213, "grad_norm": 0.5040109793881857, "learning_rate": 4.591520811149787e-06, "loss": 0.1785, "step": 1000 }, { "epoch": 0.22530768580843213, "eval_loss": 0.1766696572303772, "eval_runtime": 57.3568, "eval_samples_per_second": 50.038, "eval_steps_per_second": 6.259, "step": 1000 }, { "epoch": 0.22553299349424058, "grad_norm": 0.48462064594502124, "learning_rate": 4.590499731729692e-06, "loss": 0.1686, "step": 1001 }, { "epoch": 0.225758301180049, "grad_norm": 0.4601104950424043, "learning_rate": 4.589477491533016e-06, "loss": 0.1556, "step": 1002 }, { "epoch": 0.22598360886585744, "grad_norm": 0.49464098877745066, "learning_rate": 4.588454091127373e-06, "loss": 0.1796, "step": 1003 }, { "epoch": 0.22620891655166586, "grad_norm": 0.5148232425969892, "learning_rate": 4.587429531081019e-06, "loss": 0.1914, "step": 1004 }, { "epoch": 0.2264342242374743, "grad_norm": 0.47731591236053894, "learning_rate": 4.586403811962852e-06, "loss": 0.1604, "step": 1005 }, { "epoch": 0.22665953192328273, "grad_norm": 0.4551856162700306, "learning_rate": 4.585376934342418e-06, "loss": 0.1641, "step": 1006 }, { "epoch": 0.22688483960909117, "grad_norm": 0.4637388238429355, "learning_rate": 4.584348898789901e-06, "loss": 0.1695, "step": 1007 }, { "epoch": 0.2271101472948996, "grad_norm": 0.46640928542419985, "learning_rate": 4.583319705876133e-06, "loss": 0.1778, "step": 1008 }, { "epoch": 0.22733545498070803, "grad_norm": 0.4910752180840778, "learning_rate": 4.5822893561725864e-06, "loss": 0.1709, "step": 1009 }, { "epoch": 0.22756076266651645, "grad_norm": 0.4975885196146549, "learning_rate": 4.581257850251376e-06, "loss": 0.1906, "step": 1010 }, { "epoch": 0.2277860703523249, "grad_norm": 0.48897736900788225, "learning_rate": 4.580225188685257e-06, "loss": 0.1707, "step": 1011 }, { "epoch": 0.22801137803813332, "grad_norm": 0.43130461495166755, "learning_rate": 4.579191372047631e-06, "loss": 0.1618, "step": 1012 }, { "epoch": 0.22823668572394176, "grad_norm": 0.5346355301760246, "learning_rate": 4.578156400912535e-06, "loss": 0.2052, "step": 1013 }, { "epoch": 0.22846199340975018, "grad_norm": 0.5120793734577033, "learning_rate": 4.577120275854649e-06, "loss": 0.1786, "step": 1014 }, { "epoch": 0.22868730109555863, "grad_norm": 0.47045662878311806, "learning_rate": 4.576082997449298e-06, "loss": 0.1801, "step": 1015 }, { "epoch": 0.22891260878136704, "grad_norm": 0.4897903682435324, "learning_rate": 4.5750445662724426e-06, "loss": 0.182, "step": 1016 }, { "epoch": 0.2291379164671755, "grad_norm": 0.4496213951131232, "learning_rate": 4.574004982900684e-06, "loss": 0.1726, "step": 1017 }, { "epoch": 0.2293632241529839, "grad_norm": 0.4796218909713557, "learning_rate": 4.572964247911265e-06, "loss": 0.1806, "step": 1018 }, { "epoch": 0.22958853183879235, "grad_norm": 0.48130542448321, "learning_rate": 4.5719223618820666e-06, "loss": 0.1884, "step": 1019 }, { "epoch": 0.22981383952460077, "grad_norm": 0.46705890705030956, "learning_rate": 4.5708793253916104e-06, "loss": 0.171, "step": 1020 }, { "epoch": 0.23003914721040922, "grad_norm": 0.4598433200960336, "learning_rate": 4.569835139019054e-06, "loss": 0.1625, "step": 1021 }, { "epoch": 0.23026445489621764, "grad_norm": 0.4798183615941154, "learning_rate": 4.568789803344196e-06, "loss": 0.1854, "step": 1022 }, { "epoch": 0.23048976258202608, "grad_norm": 0.5049857914668311, "learning_rate": 4.567743318947472e-06, "loss": 0.1723, "step": 1023 }, { "epoch": 0.2307150702678345, "grad_norm": 0.5133377849430174, "learning_rate": 4.566695686409957e-06, "loss": 0.1969, "step": 1024 }, { "epoch": 0.23094037795364294, "grad_norm": 0.4700007173190981, "learning_rate": 4.56564690631336e-06, "loss": 0.175, "step": 1025 }, { "epoch": 0.23116568563945136, "grad_norm": 0.47259284661188733, "learning_rate": 4.564596979240031e-06, "loss": 0.1648, "step": 1026 }, { "epoch": 0.2313909933252598, "grad_norm": 0.46806795756718333, "learning_rate": 4.563545905772956e-06, "loss": 0.1684, "step": 1027 }, { "epoch": 0.23161630101106823, "grad_norm": 0.45862602476089215, "learning_rate": 4.562493686495756e-06, "loss": 0.1684, "step": 1028 }, { "epoch": 0.23184160869687667, "grad_norm": 0.4633999365729002, "learning_rate": 4.56144032199269e-06, "loss": 0.1586, "step": 1029 }, { "epoch": 0.23206691638268512, "grad_norm": 0.46864023859664566, "learning_rate": 4.56038581284865e-06, "loss": 0.167, "step": 1030 }, { "epoch": 0.23229222406849354, "grad_norm": 0.4565233615702026, "learning_rate": 4.559330159649166e-06, "loss": 0.1701, "step": 1031 }, { "epoch": 0.23251753175430198, "grad_norm": 0.46694580167435407, "learning_rate": 4.558273362980406e-06, "loss": 0.1682, "step": 1032 }, { "epoch": 0.2327428394401104, "grad_norm": 0.448802180941964, "learning_rate": 4.557215423429167e-06, "loss": 0.1692, "step": 1033 }, { "epoch": 0.23296814712591885, "grad_norm": 0.46242148368245545, "learning_rate": 4.556156341582884e-06, "loss": 0.1757, "step": 1034 }, { "epoch": 0.23319345481172726, "grad_norm": 0.4702960441016916, "learning_rate": 4.555096118029625e-06, "loss": 0.1649, "step": 1035 }, { "epoch": 0.2334187624975357, "grad_norm": 0.4603866097849357, "learning_rate": 4.5540347533580935e-06, "loss": 0.1743, "step": 1036 }, { "epoch": 0.23364407018334413, "grad_norm": 0.5016444270737936, "learning_rate": 4.5529722481576265e-06, "loss": 0.1659, "step": 1037 }, { "epoch": 0.23386937786915257, "grad_norm": 0.46197661033483206, "learning_rate": 4.551908603018191e-06, "loss": 0.1718, "step": 1038 }, { "epoch": 0.234094685554961, "grad_norm": 0.4576051232314433, "learning_rate": 4.550843818530392e-06, "loss": 0.1754, "step": 1039 }, { "epoch": 0.23431999324076944, "grad_norm": 0.48921900894809295, "learning_rate": 4.549777895285464e-06, "loss": 0.1681, "step": 1040 }, { "epoch": 0.23454530092657785, "grad_norm": 0.4969985437197727, "learning_rate": 4.548710833875273e-06, "loss": 0.1776, "step": 1041 }, { "epoch": 0.2347706086123863, "grad_norm": 0.4926942037575357, "learning_rate": 4.547642634892321e-06, "loss": 0.1711, "step": 1042 }, { "epoch": 0.23499591629819472, "grad_norm": 0.5099757228479024, "learning_rate": 4.5465732989297365e-06, "loss": 0.169, "step": 1043 }, { "epoch": 0.23522122398400316, "grad_norm": 0.4941141847733085, "learning_rate": 4.545502826581284e-06, "loss": 0.1758, "step": 1044 }, { "epoch": 0.23544653166981158, "grad_norm": 0.4992853772276655, "learning_rate": 4.5444312184413554e-06, "loss": 0.1896, "step": 1045 }, { "epoch": 0.23567183935562003, "grad_norm": 0.5077593719343416, "learning_rate": 4.543358475104975e-06, "loss": 0.1731, "step": 1046 }, { "epoch": 0.23589714704142845, "grad_norm": 0.432534135028472, "learning_rate": 4.5422845971677985e-06, "loss": 0.1606, "step": 1047 }, { "epoch": 0.2361224547272369, "grad_norm": 0.46569122347590924, "learning_rate": 4.541209585226109e-06, "loss": 0.1645, "step": 1048 }, { "epoch": 0.2363477624130453, "grad_norm": 0.4851233704311572, "learning_rate": 4.5401334398768195e-06, "loss": 0.164, "step": 1049 }, { "epoch": 0.23657307009885375, "grad_norm": 0.49161973544017507, "learning_rate": 4.539056161717477e-06, "loss": 0.1704, "step": 1050 }, { "epoch": 0.23679837778466217, "grad_norm": 0.4705002761219879, "learning_rate": 4.53797775134625e-06, "loss": 0.1572, "step": 1051 }, { "epoch": 0.23702368547047062, "grad_norm": 0.5074699444832521, "learning_rate": 4.536898209361942e-06, "loss": 0.1817, "step": 1052 }, { "epoch": 0.23724899315627904, "grad_norm": 0.4753176300220157, "learning_rate": 4.535817536363981e-06, "loss": 0.1679, "step": 1053 }, { "epoch": 0.23747430084208748, "grad_norm": 0.45572087945610384, "learning_rate": 4.5347357329524254e-06, "loss": 0.1557, "step": 1054 }, { "epoch": 0.2376996085278959, "grad_norm": 0.47954801189962154, "learning_rate": 4.53365279972796e-06, "loss": 0.1739, "step": 1055 }, { "epoch": 0.23792491621370435, "grad_norm": 0.4936922163351292, "learning_rate": 4.532568737291898e-06, "loss": 0.1708, "step": 1056 }, { "epoch": 0.23815022389951276, "grad_norm": 0.487882288980118, "learning_rate": 4.531483546246177e-06, "loss": 0.1867, "step": 1057 }, { "epoch": 0.2383755315853212, "grad_norm": 0.5022570562698242, "learning_rate": 4.530397227193365e-06, "loss": 0.1763, "step": 1058 }, { "epoch": 0.23860083927112963, "grad_norm": 0.5098565893687929, "learning_rate": 4.529309780736654e-06, "loss": 0.1821, "step": 1059 }, { "epoch": 0.23882614695693807, "grad_norm": 0.5471270750459719, "learning_rate": 4.528221207479862e-06, "loss": 0.1897, "step": 1060 }, { "epoch": 0.2390514546427465, "grad_norm": 0.46289499595120953, "learning_rate": 4.527131508027433e-06, "loss": 0.1579, "step": 1061 }, { "epoch": 0.23927676232855494, "grad_norm": 0.49338984383186363, "learning_rate": 4.5260406829844364e-06, "loss": 0.1807, "step": 1062 }, { "epoch": 0.23950207001436336, "grad_norm": 0.5029016610935466, "learning_rate": 4.524948732956568e-06, "loss": 0.1948, "step": 1063 }, { "epoch": 0.2397273777001718, "grad_norm": 0.49997268202495476, "learning_rate": 4.523855658550146e-06, "loss": 0.1792, "step": 1064 }, { "epoch": 0.23995268538598022, "grad_norm": 0.48933115982084735, "learning_rate": 4.522761460372114e-06, "loss": 0.178, "step": 1065 }, { "epoch": 0.24017799307178866, "grad_norm": 0.47946740350317474, "learning_rate": 4.521666139030039e-06, "loss": 0.1677, "step": 1066 }, { "epoch": 0.24040330075759708, "grad_norm": 0.4184658820157611, "learning_rate": 4.520569695132113e-06, "loss": 0.15, "step": 1067 }, { "epoch": 0.24062860844340553, "grad_norm": 0.5197049444095906, "learning_rate": 4.51947212928715e-06, "loss": 0.1794, "step": 1068 }, { "epoch": 0.24085391612921395, "grad_norm": 0.5201746421812771, "learning_rate": 4.518373442104587e-06, "loss": 0.1807, "step": 1069 }, { "epoch": 0.2410792238150224, "grad_norm": 0.4459522405235029, "learning_rate": 4.5172736341944845e-06, "loss": 0.1659, "step": 1070 }, { "epoch": 0.2413045315008308, "grad_norm": 0.4914213660575991, "learning_rate": 4.516172706167525e-06, "loss": 0.1759, "step": 1071 }, { "epoch": 0.24152983918663926, "grad_norm": 0.47911999094848545, "learning_rate": 4.515070658635013e-06, "loss": 0.1768, "step": 1072 }, { "epoch": 0.24175514687244767, "grad_norm": 0.4892612077155755, "learning_rate": 4.513967492208874e-06, "loss": 0.1715, "step": 1073 }, { "epoch": 0.24198045455825612, "grad_norm": 0.5259185134683907, "learning_rate": 4.512863207501654e-06, "loss": 0.1916, "step": 1074 }, { "epoch": 0.24220576224406454, "grad_norm": 0.459465389299004, "learning_rate": 4.511757805126523e-06, "loss": 0.172, "step": 1075 }, { "epoch": 0.24243106992987298, "grad_norm": 0.47910539989958506, "learning_rate": 4.510651285697269e-06, "loss": 0.1726, "step": 1076 }, { "epoch": 0.2426563776156814, "grad_norm": 0.4701014911994522, "learning_rate": 4.509543649828302e-06, "loss": 0.1681, "step": 1077 }, { "epoch": 0.24288168530148985, "grad_norm": 0.46213489937923835, "learning_rate": 4.5084348981346495e-06, "loss": 0.1719, "step": 1078 }, { "epoch": 0.2431069929872983, "grad_norm": 0.4935132048768077, "learning_rate": 4.507325031231959e-06, "loss": 0.1825, "step": 1079 }, { "epoch": 0.2433323006731067, "grad_norm": 0.48392629751946686, "learning_rate": 4.506214049736502e-06, "loss": 0.1713, "step": 1080 }, { "epoch": 0.24355760835891516, "grad_norm": 0.5080595176990748, "learning_rate": 4.505101954265161e-06, "loss": 0.1903, "step": 1081 }, { "epoch": 0.24378291604472357, "grad_norm": 0.4673535025076435, "learning_rate": 4.503988745435443e-06, "loss": 0.1608, "step": 1082 }, { "epoch": 0.24400822373053202, "grad_norm": 0.4821960030125238, "learning_rate": 4.502874423865473e-06, "loss": 0.1817, "step": 1083 }, { "epoch": 0.24423353141634044, "grad_norm": 0.4825223552549021, "learning_rate": 4.5017589901739885e-06, "loss": 0.1703, "step": 1084 }, { "epoch": 0.24445883910214888, "grad_norm": 0.49338394902001637, "learning_rate": 4.500642444980352e-06, "loss": 0.1737, "step": 1085 }, { "epoch": 0.2446841467879573, "grad_norm": 0.4545645277155261, "learning_rate": 4.499524788904537e-06, "loss": 0.1636, "step": 1086 }, { "epoch": 0.24490945447376575, "grad_norm": 0.48306624324737196, "learning_rate": 4.498406022567137e-06, "loss": 0.1765, "step": 1087 }, { "epoch": 0.24513476215957417, "grad_norm": 0.4749043885580964, "learning_rate": 4.497286146589361e-06, "loss": 0.1693, "step": 1088 }, { "epoch": 0.2453600698453826, "grad_norm": 0.4712090941375913, "learning_rate": 4.4961651615930344e-06, "loss": 0.1647, "step": 1089 }, { "epoch": 0.24558537753119103, "grad_norm": 0.48622341817676057, "learning_rate": 4.4950430682005995e-06, "loss": 0.1672, "step": 1090 }, { "epoch": 0.24581068521699947, "grad_norm": 0.46188635009124457, "learning_rate": 4.493919867035112e-06, "loss": 0.1477, "step": 1091 }, { "epoch": 0.2460359929028079, "grad_norm": 0.4548856414210293, "learning_rate": 4.492795558720242e-06, "loss": 0.1578, "step": 1092 }, { "epoch": 0.24626130058861634, "grad_norm": 0.46865916685520176, "learning_rate": 4.491670143880279e-06, "loss": 0.1577, "step": 1093 }, { "epoch": 0.24648660827442476, "grad_norm": 0.47334725360934393, "learning_rate": 4.490543623140123e-06, "loss": 0.1785, "step": 1094 }, { "epoch": 0.2467119159602332, "grad_norm": 0.4903085986965832, "learning_rate": 4.489415997125288e-06, "loss": 0.1787, "step": 1095 }, { "epoch": 0.24693722364604162, "grad_norm": 0.4987257674352878, "learning_rate": 4.488287266461904e-06, "loss": 0.1822, "step": 1096 }, { "epoch": 0.24716253133185007, "grad_norm": 0.49514915506271256, "learning_rate": 4.487157431776712e-06, "loss": 0.1786, "step": 1097 }, { "epoch": 0.24738783901765848, "grad_norm": 0.4895397147505216, "learning_rate": 4.486026493697067e-06, "loss": 0.176, "step": 1098 }, { "epoch": 0.24761314670346693, "grad_norm": 0.4750865000533062, "learning_rate": 4.484894452850937e-06, "loss": 0.1662, "step": 1099 }, { "epoch": 0.24783845438927535, "grad_norm": 0.5231586359914755, "learning_rate": 4.483761309866902e-06, "loss": 0.1932, "step": 1100 }, { "epoch": 0.2480637620750838, "grad_norm": 0.49524527920469363, "learning_rate": 4.482627065374155e-06, "loss": 0.1854, "step": 1101 }, { "epoch": 0.2482890697608922, "grad_norm": 0.4674761617417515, "learning_rate": 4.481491720002499e-06, "loss": 0.162, "step": 1102 }, { "epoch": 0.24851437744670066, "grad_norm": 0.49289841749272967, "learning_rate": 4.4803552743823495e-06, "loss": 0.1754, "step": 1103 }, { "epoch": 0.24873968513250908, "grad_norm": 0.5096228946461686, "learning_rate": 4.479217729144731e-06, "loss": 0.1813, "step": 1104 }, { "epoch": 0.24896499281831752, "grad_norm": 0.4709558119876889, "learning_rate": 4.478079084921282e-06, "loss": 0.159, "step": 1105 }, { "epoch": 0.24919030050412594, "grad_norm": 0.46290162539634433, "learning_rate": 4.476939342344246e-06, "loss": 0.163, "step": 1106 }, { "epoch": 0.24941560818993438, "grad_norm": 0.47236146938099277, "learning_rate": 4.475798502046484e-06, "loss": 0.1705, "step": 1107 }, { "epoch": 0.2496409158757428, "grad_norm": 0.4759949192730458, "learning_rate": 4.474656564661458e-06, "loss": 0.1652, "step": 1108 }, { "epoch": 0.24986622356155125, "grad_norm": 0.49013257055563464, "learning_rate": 4.473513530823246e-06, "loss": 0.1851, "step": 1109 }, { "epoch": 0.25009153124735967, "grad_norm": 0.46874665727501796, "learning_rate": 4.472369401166531e-06, "loss": 0.1724, "step": 1110 }, { "epoch": 0.2503168389331681, "grad_norm": 0.4621201673920498, "learning_rate": 4.471224176326605e-06, "loss": 0.1675, "step": 1111 }, { "epoch": 0.25054214661897656, "grad_norm": 0.45656340706343534, "learning_rate": 4.47007785693937e-06, "loss": 0.1751, "step": 1112 }, { "epoch": 0.250767454304785, "grad_norm": 0.48141602205933126, "learning_rate": 4.468930443641333e-06, "loss": 0.1751, "step": 1113 }, { "epoch": 0.2509927619905934, "grad_norm": 0.4858128449295977, "learning_rate": 4.467781937069611e-06, "loss": 0.1775, "step": 1114 }, { "epoch": 0.2512180696764018, "grad_norm": 0.49393776518442545, "learning_rate": 4.466632337861926e-06, "loss": 0.1693, "step": 1115 }, { "epoch": 0.2514433773622103, "grad_norm": 0.5125458797038791, "learning_rate": 4.465481646656608e-06, "loss": 0.1867, "step": 1116 }, { "epoch": 0.2516686850480187, "grad_norm": 0.475803038903292, "learning_rate": 4.464329864092593e-06, "loss": 0.1716, "step": 1117 }, { "epoch": 0.2518939927338271, "grad_norm": 0.5419619115552078, "learning_rate": 4.463176990809423e-06, "loss": 0.1828, "step": 1118 }, { "epoch": 0.25211930041963554, "grad_norm": 0.4639433198703533, "learning_rate": 4.462023027447246e-06, "loss": 0.1723, "step": 1119 }, { "epoch": 0.252344608105444, "grad_norm": 0.46793145441851886, "learning_rate": 4.460867974646814e-06, "loss": 0.1629, "step": 1120 }, { "epoch": 0.25256991579125243, "grad_norm": 0.5411894292034929, "learning_rate": 4.459711833049485e-06, "loss": 0.1961, "step": 1121 }, { "epoch": 0.25279522347706085, "grad_norm": 0.4989548751939043, "learning_rate": 4.45855460329722e-06, "loss": 0.1731, "step": 1122 }, { "epoch": 0.25302053116286927, "grad_norm": 0.5243782674199724, "learning_rate": 4.457396286032589e-06, "loss": 0.1776, "step": 1123 }, { "epoch": 0.25324583884867774, "grad_norm": 0.4886069442128809, "learning_rate": 4.45623688189876e-06, "loss": 0.1707, "step": 1124 }, { "epoch": 0.25347114653448616, "grad_norm": 0.5012040772255312, "learning_rate": 4.455076391539507e-06, "loss": 0.1812, "step": 1125 }, { "epoch": 0.2536964542202946, "grad_norm": 0.571500324082683, "learning_rate": 4.453914815599206e-06, "loss": 0.1659, "step": 1126 }, { "epoch": 0.25392176190610305, "grad_norm": 0.4948654591758799, "learning_rate": 4.45275215472284e-06, "loss": 0.1757, "step": 1127 }, { "epoch": 0.25414706959191147, "grad_norm": 0.4544375354575435, "learning_rate": 4.451588409555988e-06, "loss": 0.162, "step": 1128 }, { "epoch": 0.2543723772777199, "grad_norm": 0.46357726308267455, "learning_rate": 4.450423580744837e-06, "loss": 0.1668, "step": 1129 }, { "epoch": 0.2545976849635283, "grad_norm": 0.4820056825601605, "learning_rate": 4.4492576689361705e-06, "loss": 0.1599, "step": 1130 }, { "epoch": 0.2548229926493368, "grad_norm": 0.5079568505038684, "learning_rate": 4.448090674777377e-06, "loss": 0.1823, "step": 1131 }, { "epoch": 0.2550483003351452, "grad_norm": 0.49772082318847466, "learning_rate": 4.446922598916445e-06, "loss": 0.1695, "step": 1132 }, { "epoch": 0.2552736080209536, "grad_norm": 0.4698431427484417, "learning_rate": 4.4457534420019644e-06, "loss": 0.1637, "step": 1133 }, { "epoch": 0.25549891570676203, "grad_norm": 0.46380041243007897, "learning_rate": 4.444583204683123e-06, "loss": 0.1797, "step": 1134 }, { "epoch": 0.2557242233925705, "grad_norm": 0.4697980613946506, "learning_rate": 4.44341188760971e-06, "loss": 0.1632, "step": 1135 }, { "epoch": 0.2559495310783789, "grad_norm": 0.4872747387611847, "learning_rate": 4.4422394914321145e-06, "loss": 0.1778, "step": 1136 }, { "epoch": 0.25617483876418734, "grad_norm": 0.485343327730141, "learning_rate": 4.4410660168013255e-06, "loss": 0.171, "step": 1137 }, { "epoch": 0.25640014644999576, "grad_norm": 0.4391103312259274, "learning_rate": 4.439891464368927e-06, "loss": 0.1644, "step": 1138 }, { "epoch": 0.25662545413580423, "grad_norm": 0.46132359473885676, "learning_rate": 4.438715834787107e-06, "loss": 0.1701, "step": 1139 }, { "epoch": 0.25685076182161265, "grad_norm": 0.4947273838629531, "learning_rate": 4.437539128708647e-06, "loss": 0.1766, "step": 1140 }, { "epoch": 0.25707606950742107, "grad_norm": 0.44906983895296854, "learning_rate": 4.436361346786929e-06, "loss": 0.1631, "step": 1141 }, { "epoch": 0.2573013771932295, "grad_norm": 0.4603708623495546, "learning_rate": 4.435182489675931e-06, "loss": 0.1599, "step": 1142 }, { "epoch": 0.25752668487903796, "grad_norm": 0.45802671302164594, "learning_rate": 4.4340025580302285e-06, "loss": 0.1729, "step": 1143 }, { "epoch": 0.2577519925648464, "grad_norm": 0.4650391611868575, "learning_rate": 4.432821552504994e-06, "loss": 0.1609, "step": 1144 }, { "epoch": 0.2579773002506548, "grad_norm": 0.47776899229826364, "learning_rate": 4.431639473755994e-06, "loss": 0.1692, "step": 1145 }, { "epoch": 0.2582026079364632, "grad_norm": 0.4748238799376714, "learning_rate": 4.430456322439596e-06, "loss": 0.1711, "step": 1146 }, { "epoch": 0.2584279156222717, "grad_norm": 0.4965122101302463, "learning_rate": 4.429272099212757e-06, "loss": 0.1768, "step": 1147 }, { "epoch": 0.2586532233080801, "grad_norm": 0.47030228526971696, "learning_rate": 4.4280868047330325e-06, "loss": 0.163, "step": 1148 }, { "epoch": 0.2588785309938885, "grad_norm": 0.5011109705302049, "learning_rate": 4.4269004396585735e-06, "loss": 0.1799, "step": 1149 }, { "epoch": 0.25910383867969694, "grad_norm": 0.4918342025169798, "learning_rate": 4.425713004648123e-06, "loss": 0.1645, "step": 1150 }, { "epoch": 0.2593291463655054, "grad_norm": 0.4592219591074875, "learning_rate": 4.424524500361021e-06, "loss": 0.1554, "step": 1151 }, { "epoch": 0.25955445405131383, "grad_norm": 0.4759222622101723, "learning_rate": 4.423334927457198e-06, "loss": 0.1596, "step": 1152 }, { "epoch": 0.25977976173712225, "grad_norm": 0.49223761810181915, "learning_rate": 4.42214428659718e-06, "loss": 0.1767, "step": 1153 }, { "epoch": 0.26000506942293067, "grad_norm": 0.4728211158131068, "learning_rate": 4.420952578442086e-06, "loss": 0.1693, "step": 1154 }, { "epoch": 0.26023037710873914, "grad_norm": 0.4899834125737949, "learning_rate": 4.419759803653627e-06, "loss": 0.1756, "step": 1155 }, { "epoch": 0.26045568479454756, "grad_norm": 0.5042200638482935, "learning_rate": 4.4185659628941054e-06, "loss": 0.172, "step": 1156 }, { "epoch": 0.260680992480356, "grad_norm": 0.4381712609704442, "learning_rate": 4.417371056826417e-06, "loss": 0.1512, "step": 1157 }, { "epoch": 0.2609063001661644, "grad_norm": 0.49604617909817605, "learning_rate": 4.416175086114049e-06, "loss": 0.1821, "step": 1158 }, { "epoch": 0.26113160785197287, "grad_norm": 0.4805931915635769, "learning_rate": 4.414978051421081e-06, "loss": 0.1751, "step": 1159 }, { "epoch": 0.2613569155377813, "grad_norm": 0.5108089694144428, "learning_rate": 4.4137799534121785e-06, "loss": 0.178, "step": 1160 }, { "epoch": 0.2615822232235897, "grad_norm": 0.47149666287470404, "learning_rate": 4.412580792752601e-06, "loss": 0.1698, "step": 1161 }, { "epoch": 0.2618075309093981, "grad_norm": 0.46335594977891076, "learning_rate": 4.4113805701082e-06, "loss": 0.1702, "step": 1162 }, { "epoch": 0.2620328385952066, "grad_norm": 0.48331517281476644, "learning_rate": 4.410179286145414e-06, "loss": 0.1643, "step": 1163 }, { "epoch": 0.262258146281015, "grad_norm": 0.45649384365543133, "learning_rate": 4.408976941531269e-06, "loss": 0.1731, "step": 1164 }, { "epoch": 0.26248345396682343, "grad_norm": 0.48853109585960214, "learning_rate": 4.407773536933384e-06, "loss": 0.1872, "step": 1165 }, { "epoch": 0.26270876165263185, "grad_norm": 0.47250454754886273, "learning_rate": 4.406569073019965e-06, "loss": 0.1739, "step": 1166 }, { "epoch": 0.2629340693384403, "grad_norm": 0.4868051127801619, "learning_rate": 4.4053635504598045e-06, "loss": 0.1725, "step": 1167 }, { "epoch": 0.26315937702424874, "grad_norm": 0.46804005017622746, "learning_rate": 4.404156969922284e-06, "loss": 0.1702, "step": 1168 }, { "epoch": 0.26338468471005716, "grad_norm": 0.4953399325232565, "learning_rate": 4.402949332077375e-06, "loss": 0.1727, "step": 1169 }, { "epoch": 0.2636099923958656, "grad_norm": 0.5050035534953599, "learning_rate": 4.401740637595633e-06, "loss": 0.1638, "step": 1170 }, { "epoch": 0.26383530008167405, "grad_norm": 0.5061533696389707, "learning_rate": 4.400530887148199e-06, "loss": 0.1678, "step": 1171 }, { "epoch": 0.26406060776748247, "grad_norm": 0.48214334053632324, "learning_rate": 4.3993200814068035e-06, "loss": 0.1592, "step": 1172 }, { "epoch": 0.2642859154532909, "grad_norm": 0.47051364612550667, "learning_rate": 4.398108221043764e-06, "loss": 0.1843, "step": 1173 }, { "epoch": 0.2645112231390993, "grad_norm": 0.47107601682684574, "learning_rate": 4.396895306731978e-06, "loss": 0.173, "step": 1174 }, { "epoch": 0.2647365308249078, "grad_norm": 0.5005898656550045, "learning_rate": 4.395681339144933e-06, "loss": 0.1902, "step": 1175 }, { "epoch": 0.2649618385107162, "grad_norm": 0.4676621595113707, "learning_rate": 4.394466318956701e-06, "loss": 0.1656, "step": 1176 }, { "epoch": 0.2651871461965246, "grad_norm": 0.49204892601984884, "learning_rate": 4.393250246841935e-06, "loss": 0.1693, "step": 1177 }, { "epoch": 0.2654124538823331, "grad_norm": 0.46310285597788936, "learning_rate": 4.392033123475876e-06, "loss": 0.1681, "step": 1178 }, { "epoch": 0.2656377615681415, "grad_norm": 0.47878735086717533, "learning_rate": 4.390814949534348e-06, "loss": 0.168, "step": 1179 }, { "epoch": 0.2658630692539499, "grad_norm": 0.49825319716859656, "learning_rate": 4.389595725693756e-06, "loss": 0.1696, "step": 1180 }, { "epoch": 0.26608837693975834, "grad_norm": 0.4455782342121541, "learning_rate": 4.388375452631091e-06, "loss": 0.1615, "step": 1181 }, { "epoch": 0.2663136846255668, "grad_norm": 0.43357859168921514, "learning_rate": 4.387154131023924e-06, "loss": 0.1551, "step": 1182 }, { "epoch": 0.26653899231137523, "grad_norm": 0.5089345638479391, "learning_rate": 4.385931761550411e-06, "loss": 0.176, "step": 1183 }, { "epoch": 0.26676429999718365, "grad_norm": 0.5266595479693508, "learning_rate": 4.384708344889285e-06, "loss": 0.1695, "step": 1184 }, { "epoch": 0.26698960768299207, "grad_norm": 0.475099637405834, "learning_rate": 4.383483881719867e-06, "loss": 0.1779, "step": 1185 }, { "epoch": 0.26721491536880054, "grad_norm": 0.4787739839641768, "learning_rate": 4.382258372722054e-06, "loss": 0.1725, "step": 1186 }, { "epoch": 0.26744022305460896, "grad_norm": 0.49494278847012774, "learning_rate": 4.381031818576326e-06, "loss": 0.165, "step": 1187 }, { "epoch": 0.2676655307404174, "grad_norm": 0.46344787955776795, "learning_rate": 4.379804219963742e-06, "loss": 0.1673, "step": 1188 }, { "epoch": 0.2678908384262258, "grad_norm": 0.46819728716710135, "learning_rate": 4.378575577565945e-06, "loss": 0.1685, "step": 1189 }, { "epoch": 0.26811614611203427, "grad_norm": 0.4634414488798126, "learning_rate": 4.377345892065149e-06, "loss": 0.1572, "step": 1190 }, { "epoch": 0.2683414537978427, "grad_norm": 0.47565010962702786, "learning_rate": 4.376115164144157e-06, "loss": 0.1718, "step": 1191 }, { "epoch": 0.2685667614836511, "grad_norm": 0.47429751678202337, "learning_rate": 4.374883394486343e-06, "loss": 0.1706, "step": 1192 }, { "epoch": 0.2687920691694595, "grad_norm": 0.5085699899597834, "learning_rate": 4.373650583775666e-06, "loss": 0.1648, "step": 1193 }, { "epoch": 0.269017376855268, "grad_norm": 0.5146572632392027, "learning_rate": 4.3724167326966575e-06, "loss": 0.1878, "step": 1194 }, { "epoch": 0.2692426845410764, "grad_norm": 0.4627246459636679, "learning_rate": 4.37118184193443e-06, "loss": 0.1572, "step": 1195 }, { "epoch": 0.26946799222688483, "grad_norm": 0.5042679214482405, "learning_rate": 4.3699459121746726e-06, "loss": 0.1789, "step": 1196 }, { "epoch": 0.26969329991269325, "grad_norm": 0.4715362603580668, "learning_rate": 4.368708944103649e-06, "loss": 0.1565, "step": 1197 }, { "epoch": 0.2699186075985017, "grad_norm": 0.46224782361941275, "learning_rate": 4.367470938408204e-06, "loss": 0.1638, "step": 1198 }, { "epoch": 0.27014391528431014, "grad_norm": 0.48457161847040103, "learning_rate": 4.366231895775755e-06, "loss": 0.1697, "step": 1199 }, { "epoch": 0.27036922297011856, "grad_norm": 0.45322439544830784, "learning_rate": 4.364991816894296e-06, "loss": 0.1492, "step": 1200 }, { "epoch": 0.270594530655927, "grad_norm": 0.5475044658907094, "learning_rate": 4.3637507024523975e-06, "loss": 0.1988, "step": 1201 }, { "epoch": 0.27081983834173545, "grad_norm": 0.4898131791403852, "learning_rate": 4.362508553139203e-06, "loss": 0.1777, "step": 1202 }, { "epoch": 0.27104514602754387, "grad_norm": 0.4540719396050229, "learning_rate": 4.361265369644432e-06, "loss": 0.1652, "step": 1203 }, { "epoch": 0.2712704537133523, "grad_norm": 0.4827482306962776, "learning_rate": 4.360021152658378e-06, "loss": 0.1797, "step": 1204 }, { "epoch": 0.2714957613991607, "grad_norm": 0.465499233492623, "learning_rate": 4.3587759028719075e-06, "loss": 0.1607, "step": 1205 }, { "epoch": 0.2717210690849692, "grad_norm": 0.45932842008835173, "learning_rate": 4.357529620976463e-06, "loss": 0.1658, "step": 1206 }, { "epoch": 0.2719463767707776, "grad_norm": 0.49105618480613644, "learning_rate": 4.356282307664057e-06, "loss": 0.1708, "step": 1207 }, { "epoch": 0.272171684456586, "grad_norm": 0.4699501932404702, "learning_rate": 4.355033963627277e-06, "loss": 0.1623, "step": 1208 }, { "epoch": 0.27239699214239443, "grad_norm": 0.4432815322488692, "learning_rate": 4.353784589559282e-06, "loss": 0.1523, "step": 1209 }, { "epoch": 0.2726222998282029, "grad_norm": 0.4999895546959578, "learning_rate": 4.352534186153802e-06, "loss": 0.1714, "step": 1210 }, { "epoch": 0.2728476075140113, "grad_norm": 0.5376208111714382, "learning_rate": 4.35128275410514e-06, "loss": 0.1684, "step": 1211 }, { "epoch": 0.27307291519981974, "grad_norm": 0.5005445898532112, "learning_rate": 4.3500302941081685e-06, "loss": 0.1766, "step": 1212 }, { "epoch": 0.27329822288562816, "grad_norm": 0.4916858385322702, "learning_rate": 4.348776806858334e-06, "loss": 0.1746, "step": 1213 }, { "epoch": 0.27352353057143663, "grad_norm": 0.5038191015625039, "learning_rate": 4.3475222930516484e-06, "loss": 0.1761, "step": 1214 }, { "epoch": 0.27374883825724505, "grad_norm": 0.482264627560053, "learning_rate": 4.346266753384699e-06, "loss": 0.166, "step": 1215 }, { "epoch": 0.27397414594305347, "grad_norm": 0.4782417492804113, "learning_rate": 4.345010188554638e-06, "loss": 0.1682, "step": 1216 }, { "epoch": 0.2741994536288619, "grad_norm": 0.5225392937950303, "learning_rate": 4.343752599259192e-06, "loss": 0.1775, "step": 1217 }, { "epoch": 0.27442476131467036, "grad_norm": 0.46930424506116875, "learning_rate": 4.34249398619665e-06, "loss": 0.1685, "step": 1218 }, { "epoch": 0.2746500690004788, "grad_norm": 0.4810395604573289, "learning_rate": 4.341234350065876e-06, "loss": 0.1804, "step": 1219 }, { "epoch": 0.2748753766862872, "grad_norm": 0.457383636891631, "learning_rate": 4.339973691566297e-06, "loss": 0.1616, "step": 1220 }, { "epoch": 0.2751006843720956, "grad_norm": 0.4711529416779021, "learning_rate": 4.33871201139791e-06, "loss": 0.1617, "step": 1221 }, { "epoch": 0.2753259920579041, "grad_norm": 0.5160260790743614, "learning_rate": 4.337449310261279e-06, "loss": 0.1684, "step": 1222 }, { "epoch": 0.2755512997437125, "grad_norm": 0.517349396391902, "learning_rate": 4.336185588857535e-06, "loss": 0.1765, "step": 1223 }, { "epoch": 0.2757766074295209, "grad_norm": 0.49587416173024024, "learning_rate": 4.334920847888376e-06, "loss": 0.1709, "step": 1224 }, { "epoch": 0.2760019151153294, "grad_norm": 0.48410326600056597, "learning_rate": 4.333655088056065e-06, "loss": 0.1767, "step": 1225 }, { "epoch": 0.2762272228011378, "grad_norm": 0.5468232901412815, "learning_rate": 4.332388310063431e-06, "loss": 0.1876, "step": 1226 }, { "epoch": 0.27645253048694624, "grad_norm": 0.5317933634678655, "learning_rate": 4.331120514613869e-06, "loss": 0.1727, "step": 1227 }, { "epoch": 0.27667783817275465, "grad_norm": 0.5022448146956019, "learning_rate": 4.329851702411339e-06, "loss": 0.1721, "step": 1228 }, { "epoch": 0.2769031458585631, "grad_norm": 0.46481136517383537, "learning_rate": 4.328581874160363e-06, "loss": 0.1558, "step": 1229 }, { "epoch": 0.27712845354437154, "grad_norm": 0.5284031800079505, "learning_rate": 4.327311030566033e-06, "loss": 0.1578, "step": 1230 }, { "epoch": 0.27735376123017996, "grad_norm": 0.5138307368684882, "learning_rate": 4.326039172333997e-06, "loss": 0.1783, "step": 1231 }, { "epoch": 0.2775790689159884, "grad_norm": 0.5042489855251965, "learning_rate": 4.324766300170473e-06, "loss": 0.1771, "step": 1232 }, { "epoch": 0.27780437660179685, "grad_norm": 0.5161807185989821, "learning_rate": 4.323492414782239e-06, "loss": 0.1695, "step": 1233 }, { "epoch": 0.27802968428760527, "grad_norm": 0.49481011856084106, "learning_rate": 4.322217516876635e-06, "loss": 0.1584, "step": 1234 }, { "epoch": 0.2782549919734137, "grad_norm": 0.5297983704670794, "learning_rate": 4.320941607161567e-06, "loss": 0.1819, "step": 1235 }, { "epoch": 0.2784802996592221, "grad_norm": 0.504028057895518, "learning_rate": 4.3196646863454975e-06, "loss": 0.1692, "step": 1236 }, { "epoch": 0.2787056073450306, "grad_norm": 0.48754978064018856, "learning_rate": 4.3183867551374535e-06, "loss": 0.1696, "step": 1237 }, { "epoch": 0.278930915030839, "grad_norm": 0.5186780873159246, "learning_rate": 4.317107814247022e-06, "loss": 0.1887, "step": 1238 }, { "epoch": 0.2791562227166474, "grad_norm": 0.49208775545274297, "learning_rate": 4.3158278643843544e-06, "loss": 0.1772, "step": 1239 }, { "epoch": 0.27938153040245584, "grad_norm": 0.4834696449410449, "learning_rate": 4.314546906260156e-06, "loss": 0.1743, "step": 1240 }, { "epoch": 0.2796068380882643, "grad_norm": 0.4940735421374972, "learning_rate": 4.313264940585695e-06, "loss": 0.1657, "step": 1241 }, { "epoch": 0.2798321457740727, "grad_norm": 0.48222847794101337, "learning_rate": 4.3119819680728e-06, "loss": 0.1571, "step": 1242 }, { "epoch": 0.28005745345988114, "grad_norm": 0.504598014321156, "learning_rate": 4.310697989433858e-06, "loss": 0.1766, "step": 1243 }, { "epoch": 0.28028276114568956, "grad_norm": 0.4661881543796616, "learning_rate": 4.3094130053818164e-06, "loss": 0.1629, "step": 1244 }, { "epoch": 0.28050806883149804, "grad_norm": 0.48427367373674574, "learning_rate": 4.308127016630176e-06, "loss": 0.1695, "step": 1245 }, { "epoch": 0.28073337651730645, "grad_norm": 0.5031029441970696, "learning_rate": 4.306840023892998e-06, "loss": 0.1697, "step": 1246 }, { "epoch": 0.2809586842031149, "grad_norm": 0.4682614760551946, "learning_rate": 4.305552027884904e-06, "loss": 0.1615, "step": 1247 }, { "epoch": 0.2811839918889233, "grad_norm": 0.46375682682480657, "learning_rate": 4.304263029321069e-06, "loss": 0.1721, "step": 1248 }, { "epoch": 0.28140929957473176, "grad_norm": 0.46393825922775633, "learning_rate": 4.302973028917226e-06, "loss": 0.1762, "step": 1249 }, { "epoch": 0.2816346072605402, "grad_norm": 0.46920503962002086, "learning_rate": 4.301682027389663e-06, "loss": 0.1721, "step": 1250 }, { "epoch": 0.2818599149463486, "grad_norm": 0.48383966890417285, "learning_rate": 4.300390025455227e-06, "loss": 0.1679, "step": 1251 }, { "epoch": 0.282085222632157, "grad_norm": 0.4950523538046429, "learning_rate": 4.299097023831318e-06, "loss": 0.1671, "step": 1252 }, { "epoch": 0.2823105303179655, "grad_norm": 0.45179000541068043, "learning_rate": 4.2978030232358904e-06, "loss": 0.1684, "step": 1253 }, { "epoch": 0.2825358380037739, "grad_norm": 0.4569613433988033, "learning_rate": 4.2965080243874555e-06, "loss": 0.1561, "step": 1254 }, { "epoch": 0.2827611456895823, "grad_norm": 0.49036731780666165, "learning_rate": 4.295212028005078e-06, "loss": 0.1771, "step": 1255 }, { "epoch": 0.28298645337539075, "grad_norm": 0.4504370205410457, "learning_rate": 4.293915034808376e-06, "loss": 0.1557, "step": 1256 }, { "epoch": 0.2832117610611992, "grad_norm": 0.47106233823900273, "learning_rate": 4.292617045517521e-06, "loss": 0.1606, "step": 1257 }, { "epoch": 0.28343706874700764, "grad_norm": 0.4313797520186185, "learning_rate": 4.29131806085324e-06, "loss": 0.1452, "step": 1258 }, { "epoch": 0.28366237643281605, "grad_norm": 0.4568166083233466, "learning_rate": 4.290018081536807e-06, "loss": 0.1629, "step": 1259 }, { "epoch": 0.2838876841186245, "grad_norm": 0.47667485262652165, "learning_rate": 4.288717108290056e-06, "loss": 0.1626, "step": 1260 }, { "epoch": 0.28411299180443295, "grad_norm": 0.4807412272846728, "learning_rate": 4.287415141835368e-06, "loss": 0.1846, "step": 1261 }, { "epoch": 0.28433829949024136, "grad_norm": 0.4681601937744111, "learning_rate": 4.2861121828956745e-06, "loss": 0.1701, "step": 1262 }, { "epoch": 0.2845636071760498, "grad_norm": 0.48879201661900573, "learning_rate": 4.284808232194462e-06, "loss": 0.1765, "step": 1263 }, { "epoch": 0.2847889148618582, "grad_norm": 0.4676260132902741, "learning_rate": 4.283503290455765e-06, "loss": 0.1676, "step": 1264 }, { "epoch": 0.2850142225476667, "grad_norm": 0.49374539412848295, "learning_rate": 4.28219735840417e-06, "loss": 0.1711, "step": 1265 }, { "epoch": 0.2852395302334751, "grad_norm": 0.5133291385107003, "learning_rate": 4.28089043676481e-06, "loss": 0.1751, "step": 1266 }, { "epoch": 0.2854648379192835, "grad_norm": 0.5015169344912714, "learning_rate": 4.279582526263371e-06, "loss": 0.1799, "step": 1267 }, { "epoch": 0.2856901456050919, "grad_norm": 0.49564604641852966, "learning_rate": 4.27827362762609e-06, "loss": 0.1868, "step": 1268 }, { "epoch": 0.2859154532909004, "grad_norm": 0.4826518697709936, "learning_rate": 4.276963741579745e-06, "loss": 0.1724, "step": 1269 }, { "epoch": 0.2861407609767088, "grad_norm": 0.47198024006927375, "learning_rate": 4.275652868851669e-06, "loss": 0.1626, "step": 1270 }, { "epoch": 0.28636606866251724, "grad_norm": 0.4887317789419866, "learning_rate": 4.2743410101697405e-06, "loss": 0.1759, "step": 1271 }, { "epoch": 0.28659137634832565, "grad_norm": 0.48415900082739516, "learning_rate": 4.2730281662623866e-06, "loss": 0.1731, "step": 1272 }, { "epoch": 0.28681668403413413, "grad_norm": 0.48162186592846623, "learning_rate": 4.271714337858579e-06, "loss": 0.1681, "step": 1273 }, { "epoch": 0.28704199171994255, "grad_norm": 0.4555532662421832, "learning_rate": 4.270399525687839e-06, "loss": 0.1653, "step": 1274 }, { "epoch": 0.28726729940575096, "grad_norm": 0.49380043518937744, "learning_rate": 4.269083730480232e-06, "loss": 0.1841, "step": 1275 }, { "epoch": 0.28749260709155944, "grad_norm": 0.470663539027122, "learning_rate": 4.267766952966369e-06, "loss": 0.1682, "step": 1276 }, { "epoch": 0.28771791477736786, "grad_norm": 0.4861182617285624, "learning_rate": 4.26644919387741e-06, "loss": 0.1688, "step": 1277 }, { "epoch": 0.2879432224631763, "grad_norm": 0.45164008956634005, "learning_rate": 4.265130453945056e-06, "loss": 0.1592, "step": 1278 }, { "epoch": 0.2881685301489847, "grad_norm": 0.4813784687003862, "learning_rate": 4.263810733901554e-06, "loss": 0.1819, "step": 1279 }, { "epoch": 0.28839383783479317, "grad_norm": 0.4737265609854097, "learning_rate": 4.262490034479697e-06, "loss": 0.1662, "step": 1280 }, { "epoch": 0.2886191455206016, "grad_norm": 0.44082202687660105, "learning_rate": 4.261168356412818e-06, "loss": 0.1631, "step": 1281 }, { "epoch": 0.28884445320641, "grad_norm": 0.43808182321709044, "learning_rate": 4.259845700434797e-06, "loss": 0.1473, "step": 1282 }, { "epoch": 0.2890697608922184, "grad_norm": 0.5104052734846787, "learning_rate": 4.258522067280055e-06, "loss": 0.171, "step": 1283 }, { "epoch": 0.2892950685780269, "grad_norm": 0.48126358259683144, "learning_rate": 4.257197457683556e-06, "loss": 0.1602, "step": 1284 }, { "epoch": 0.2895203762638353, "grad_norm": 0.4860327782364687, "learning_rate": 4.2558718723808055e-06, "loss": 0.1777, "step": 1285 }, { "epoch": 0.28974568394964373, "grad_norm": 0.4909158607880032, "learning_rate": 4.254545312107854e-06, "loss": 0.1749, "step": 1286 }, { "epoch": 0.28997099163545215, "grad_norm": 0.515698974564528, "learning_rate": 4.253217777601289e-06, "loss": 0.1832, "step": 1287 }, { "epoch": 0.2901962993212606, "grad_norm": 0.49937903200840367, "learning_rate": 4.251889269598241e-06, "loss": 0.1877, "step": 1288 }, { "epoch": 0.29042160700706904, "grad_norm": 0.4583839347106229, "learning_rate": 4.250559788836382e-06, "loss": 0.1636, "step": 1289 }, { "epoch": 0.29064691469287746, "grad_norm": 0.4891595487665644, "learning_rate": 4.249229336053924e-06, "loss": 0.1742, "step": 1290 }, { "epoch": 0.2908722223786859, "grad_norm": 0.46454308268647515, "learning_rate": 4.247897911989615e-06, "loss": 0.1672, "step": 1291 }, { "epoch": 0.29109753006449435, "grad_norm": 0.45860463509021276, "learning_rate": 4.2465655173827465e-06, "loss": 0.1687, "step": 1292 }, { "epoch": 0.29132283775030277, "grad_norm": 0.46340134333777605, "learning_rate": 4.245232152973148e-06, "loss": 0.1575, "step": 1293 }, { "epoch": 0.2915481454361112, "grad_norm": 0.4709196024735463, "learning_rate": 4.243897819501187e-06, "loss": 0.156, "step": 1294 }, { "epoch": 0.2917734531219196, "grad_norm": 0.4720281820205427, "learning_rate": 4.242562517707768e-06, "loss": 0.1607, "step": 1295 }, { "epoch": 0.2919987608077281, "grad_norm": 0.4950495777319756, "learning_rate": 4.241226248334335e-06, "loss": 0.1771, "step": 1296 }, { "epoch": 0.2922240684935365, "grad_norm": 0.44321757258749245, "learning_rate": 4.23988901212287e-06, "loss": 0.1501, "step": 1297 }, { "epoch": 0.2924493761793449, "grad_norm": 0.46641258630583354, "learning_rate": 4.238550809815889e-06, "loss": 0.1577, "step": 1298 }, { "epoch": 0.29267468386515333, "grad_norm": 0.4355684264835874, "learning_rate": 4.237211642156446e-06, "loss": 0.1539, "step": 1299 }, { "epoch": 0.2928999915509618, "grad_norm": 0.46022050786059787, "learning_rate": 4.23587150988813e-06, "loss": 0.1657, "step": 1300 }, { "epoch": 0.2931252992367702, "grad_norm": 0.50555996380803, "learning_rate": 4.234530413755069e-06, "loss": 0.1861, "step": 1301 }, { "epoch": 0.29335060692257864, "grad_norm": 0.45770806516472096, "learning_rate": 4.233188354501921e-06, "loss": 0.1658, "step": 1302 }, { "epoch": 0.29357591460838706, "grad_norm": 0.4822249847330653, "learning_rate": 4.231845332873883e-06, "loss": 0.1675, "step": 1303 }, { "epoch": 0.29380122229419553, "grad_norm": 0.47560738464013713, "learning_rate": 4.230501349616683e-06, "loss": 0.1639, "step": 1304 }, { "epoch": 0.29402652998000395, "grad_norm": 0.486640763683581, "learning_rate": 4.2291564054765876e-06, "loss": 0.1708, "step": 1305 }, { "epoch": 0.29425183766581237, "grad_norm": 0.4772344480015269, "learning_rate": 4.227810501200393e-06, "loss": 0.1705, "step": 1306 }, { "epoch": 0.2944771453516208, "grad_norm": 0.49761222589037885, "learning_rate": 4.226463637535429e-06, "loss": 0.1856, "step": 1307 }, { "epoch": 0.29470245303742926, "grad_norm": 0.48453435723802374, "learning_rate": 4.225115815229559e-06, "loss": 0.1709, "step": 1308 }, { "epoch": 0.2949277607232377, "grad_norm": 0.46816646410647905, "learning_rate": 4.22376703503118e-06, "loss": 0.1676, "step": 1309 }, { "epoch": 0.2951530684090461, "grad_norm": 0.44980063168003603, "learning_rate": 4.222417297689217e-06, "loss": 0.1492, "step": 1310 }, { "epoch": 0.2953783760948545, "grad_norm": 0.475968174132624, "learning_rate": 4.22106660395313e-06, "loss": 0.174, "step": 1311 }, { "epoch": 0.295603683780663, "grad_norm": 0.4966748118697132, "learning_rate": 4.219714954572909e-06, "loss": 0.1704, "step": 1312 }, { "epoch": 0.2958289914664714, "grad_norm": 0.4864155888357422, "learning_rate": 4.218362350299075e-06, "loss": 0.1754, "step": 1313 }, { "epoch": 0.2960542991522798, "grad_norm": 0.5153435414046714, "learning_rate": 4.217008791882678e-06, "loss": 0.186, "step": 1314 }, { "epoch": 0.29627960683808824, "grad_norm": 0.4542256078427076, "learning_rate": 4.215654280075297e-06, "loss": 0.1723, "step": 1315 }, { "epoch": 0.2965049145238967, "grad_norm": 0.4805611576250163, "learning_rate": 4.214298815629046e-06, "loss": 0.1767, "step": 1316 }, { "epoch": 0.29673022220970513, "grad_norm": 0.47706024349233667, "learning_rate": 4.212942399296559e-06, "loss": 0.1555, "step": 1317 }, { "epoch": 0.29695552989551355, "grad_norm": 0.48748152898974967, "learning_rate": 4.211585031831007e-06, "loss": 0.1846, "step": 1318 }, { "epoch": 0.29718083758132197, "grad_norm": 0.46300199198371683, "learning_rate": 4.210226713986085e-06, "loss": 0.168, "step": 1319 }, { "epoch": 0.29740614526713044, "grad_norm": 0.4687496463200866, "learning_rate": 4.208867446516015e-06, "loss": 0.1666, "step": 1320 }, { "epoch": 0.29763145295293886, "grad_norm": 0.46195114020388633, "learning_rate": 4.2075072301755486e-06, "loss": 0.1525, "step": 1321 }, { "epoch": 0.2978567606387473, "grad_norm": 0.4655949755044446, "learning_rate": 4.206146065719963e-06, "loss": 0.165, "step": 1322 }, { "epoch": 0.29808206832455575, "grad_norm": 0.476253279421997, "learning_rate": 4.204783953905062e-06, "loss": 0.1787, "step": 1323 }, { "epoch": 0.29830737601036417, "grad_norm": 0.4483571352663244, "learning_rate": 4.203420895487175e-06, "loss": 0.1661, "step": 1324 }, { "epoch": 0.2985326836961726, "grad_norm": 0.47256546371837377, "learning_rate": 4.202056891223159e-06, "loss": 0.1659, "step": 1325 }, { "epoch": 0.298757991381981, "grad_norm": 0.4891184537367563, "learning_rate": 4.200691941870392e-06, "loss": 0.1632, "step": 1326 }, { "epoch": 0.2989832990677895, "grad_norm": 0.44451431166176403, "learning_rate": 4.199326048186783e-06, "loss": 0.1548, "step": 1327 }, { "epoch": 0.2992086067535979, "grad_norm": 0.4822388959051435, "learning_rate": 4.197959210930759e-06, "loss": 0.1783, "step": 1328 }, { "epoch": 0.2994339144394063, "grad_norm": 0.4859493329890237, "learning_rate": 4.196591430861275e-06, "loss": 0.1687, "step": 1329 }, { "epoch": 0.29965922212521473, "grad_norm": 0.45993151866349613, "learning_rate": 4.195222708737809e-06, "loss": 0.1656, "step": 1330 }, { "epoch": 0.2998845298110232, "grad_norm": 0.5027414331804484, "learning_rate": 4.193853045320359e-06, "loss": 0.1896, "step": 1331 }, { "epoch": 0.3001098374968316, "grad_norm": 0.4741251417191081, "learning_rate": 4.192482441369451e-06, "loss": 0.1576, "step": 1332 }, { "epoch": 0.30033514518264004, "grad_norm": 0.47610034599935547, "learning_rate": 4.191110897646129e-06, "loss": 0.1618, "step": 1333 }, { "epoch": 0.30056045286844846, "grad_norm": 0.4884119475840316, "learning_rate": 4.189738414911959e-06, "loss": 0.1707, "step": 1334 }, { "epoch": 0.30078576055425693, "grad_norm": 0.487269387089356, "learning_rate": 4.188364993929032e-06, "loss": 0.1846, "step": 1335 }, { "epoch": 0.30101106824006535, "grad_norm": 0.4695057148911935, "learning_rate": 4.186990635459954e-06, "loss": 0.1675, "step": 1336 }, { "epoch": 0.30123637592587377, "grad_norm": 0.4681903788160959, "learning_rate": 4.185615340267858e-06, "loss": 0.1578, "step": 1337 }, { "epoch": 0.3014616836116822, "grad_norm": 0.4872111988165553, "learning_rate": 4.184239109116393e-06, "loss": 0.1809, "step": 1338 }, { "epoch": 0.30168699129749066, "grad_norm": 0.44572910840509217, "learning_rate": 4.182861942769729e-06, "loss": 0.1624, "step": 1339 }, { "epoch": 0.3019122989832991, "grad_norm": 0.46711380169986927, "learning_rate": 4.181483841992556e-06, "loss": 0.1608, "step": 1340 }, { "epoch": 0.3021376066691075, "grad_norm": 0.4704023219402658, "learning_rate": 4.18010480755008e-06, "loss": 0.1654, "step": 1341 }, { "epoch": 0.3023629143549159, "grad_norm": 0.47212377938185984, "learning_rate": 4.178724840208029e-06, "loss": 0.1655, "step": 1342 }, { "epoch": 0.3025882220407244, "grad_norm": 0.45808277304699035, "learning_rate": 4.1773439407326474e-06, "loss": 0.1606, "step": 1343 }, { "epoch": 0.3028135297265328, "grad_norm": 0.4450063295423093, "learning_rate": 4.175962109890697e-06, "loss": 0.153, "step": 1344 }, { "epoch": 0.3030388374123412, "grad_norm": 0.5067429640735599, "learning_rate": 4.174579348449456e-06, "loss": 0.1782, "step": 1345 }, { "epoch": 0.30326414509814964, "grad_norm": 0.4744420430173419, "learning_rate": 4.1731956571767215e-06, "loss": 0.1586, "step": 1346 }, { "epoch": 0.3034894527839581, "grad_norm": 0.4826024239192088, "learning_rate": 4.171811036840805e-06, "loss": 0.1614, "step": 1347 }, { "epoch": 0.30371476046976653, "grad_norm": 0.47082954610521216, "learning_rate": 4.170425488210534e-06, "loss": 0.1706, "step": 1348 }, { "epoch": 0.30394006815557495, "grad_norm": 0.4599812085595551, "learning_rate": 4.169039012055255e-06, "loss": 0.1622, "step": 1349 }, { "epoch": 0.30416537584138337, "grad_norm": 0.4658111624747646, "learning_rate": 4.167651609144822e-06, "loss": 0.1649, "step": 1350 }, { "epoch": 0.30439068352719184, "grad_norm": 0.48755223540020975, "learning_rate": 4.166263280249613e-06, "loss": 0.1669, "step": 1351 }, { "epoch": 0.30461599121300026, "grad_norm": 0.45303825299908984, "learning_rate": 4.164874026140511e-06, "loss": 0.1643, "step": 1352 }, { "epoch": 0.3048412988988087, "grad_norm": 0.445021146008286, "learning_rate": 4.163483847588919e-06, "loss": 0.1558, "step": 1353 }, { "epoch": 0.3050666065846171, "grad_norm": 0.47655572636339527, "learning_rate": 4.1620927453667515e-06, "loss": 0.1687, "step": 1354 }, { "epoch": 0.30529191427042557, "grad_norm": 0.47234459669526374, "learning_rate": 4.160700720246435e-06, "loss": 0.1628, "step": 1355 }, { "epoch": 0.305517221956234, "grad_norm": 0.4531327995219235, "learning_rate": 4.159307773000909e-06, "loss": 0.1516, "step": 1356 }, { "epoch": 0.3057425296420424, "grad_norm": 0.4992459885733053, "learning_rate": 4.1579139044036265e-06, "loss": 0.1661, "step": 1357 }, { "epoch": 0.3059678373278508, "grad_norm": 0.45234523767743423, "learning_rate": 4.15651911522855e-06, "loss": 0.1591, "step": 1358 }, { "epoch": 0.3061931450136593, "grad_norm": 0.5144568617456705, "learning_rate": 4.155123406250153e-06, "loss": 0.1842, "step": 1359 }, { "epoch": 0.3064184526994677, "grad_norm": 0.4492467197923174, "learning_rate": 4.153726778243422e-06, "loss": 0.159, "step": 1360 }, { "epoch": 0.30664376038527613, "grad_norm": 0.49125517977000377, "learning_rate": 4.152329231983852e-06, "loss": 0.1784, "step": 1361 }, { "epoch": 0.30686906807108455, "grad_norm": 0.48383581152863314, "learning_rate": 4.150930768247449e-06, "loss": 0.1761, "step": 1362 }, { "epoch": 0.307094375756893, "grad_norm": 0.45251816032706066, "learning_rate": 4.149531387810727e-06, "loss": 0.1717, "step": 1363 }, { "epoch": 0.30731968344270144, "grad_norm": 0.4714618478306759, "learning_rate": 4.148131091450709e-06, "loss": 0.1601, "step": 1364 }, { "epoch": 0.30754499112850986, "grad_norm": 0.46639711128092015, "learning_rate": 4.14672987994493e-06, "loss": 0.1492, "step": 1365 }, { "epoch": 0.3077702988143183, "grad_norm": 0.5019168227780006, "learning_rate": 4.145327754071427e-06, "loss": 0.1785, "step": 1366 }, { "epoch": 0.30799560650012675, "grad_norm": 0.5115191763440133, "learning_rate": 4.1439247146087515e-06, "loss": 0.1774, "step": 1367 }, { "epoch": 0.30822091418593517, "grad_norm": 0.44762710081650325, "learning_rate": 4.142520762335957e-06, "loss": 0.1489, "step": 1368 }, { "epoch": 0.3084462218717436, "grad_norm": 0.46505395959631984, "learning_rate": 4.141115898032607e-06, "loss": 0.1622, "step": 1369 }, { "epoch": 0.308671529557552, "grad_norm": 0.49062895275209234, "learning_rate": 4.13971012247877e-06, "loss": 0.1589, "step": 1370 }, { "epoch": 0.3088968372433605, "grad_norm": 0.4878995152656453, "learning_rate": 4.138303436455019e-06, "loss": 0.1713, "step": 1371 }, { "epoch": 0.3091221449291689, "grad_norm": 0.4841704377642727, "learning_rate": 4.136895840742437e-06, "loss": 0.1707, "step": 1372 }, { "epoch": 0.3093474526149773, "grad_norm": 0.46594485906792216, "learning_rate": 4.1354873361226074e-06, "loss": 0.1569, "step": 1373 }, { "epoch": 0.3095727603007858, "grad_norm": 0.4799789377866047, "learning_rate": 4.134077923377622e-06, "loss": 0.1643, "step": 1374 }, { "epoch": 0.3097980679865942, "grad_norm": 0.46024391817092386, "learning_rate": 4.132667603290075e-06, "loss": 0.175, "step": 1375 }, { "epoch": 0.3100233756724026, "grad_norm": 0.5004998885019434, "learning_rate": 4.131256376643062e-06, "loss": 0.184, "step": 1376 }, { "epoch": 0.31024868335821104, "grad_norm": 0.4496587322418268, "learning_rate": 4.129844244220188e-06, "loss": 0.1567, "step": 1377 }, { "epoch": 0.3104739910440195, "grad_norm": 0.4381923644413625, "learning_rate": 4.128431206805556e-06, "loss": 0.1571, "step": 1378 }, { "epoch": 0.31069929872982793, "grad_norm": 0.48281884874936964, "learning_rate": 4.127017265183772e-06, "loss": 0.1576, "step": 1379 }, { "epoch": 0.31092460641563635, "grad_norm": 0.486608789161625, "learning_rate": 4.125602420139947e-06, "loss": 0.1736, "step": 1380 }, { "epoch": 0.31114991410144477, "grad_norm": 0.43631321179674804, "learning_rate": 4.124186672459691e-06, "loss": 0.1521, "step": 1381 }, { "epoch": 0.31137522178725324, "grad_norm": 0.46337301384693746, "learning_rate": 4.122770022929114e-06, "loss": 0.1693, "step": 1382 }, { "epoch": 0.31160052947306166, "grad_norm": 0.49510374301430243, "learning_rate": 4.121352472334832e-06, "loss": 0.165, "step": 1383 }, { "epoch": 0.3118258371588701, "grad_norm": 0.4898587365242441, "learning_rate": 4.119934021463956e-06, "loss": 0.1734, "step": 1384 }, { "epoch": 0.3120511448446785, "grad_norm": 0.48861456754196575, "learning_rate": 4.1185146711040995e-06, "loss": 0.1704, "step": 1385 }, { "epoch": 0.31227645253048697, "grad_norm": 0.49674442096608046, "learning_rate": 4.117094422043374e-06, "loss": 0.1752, "step": 1386 }, { "epoch": 0.3125017602162954, "grad_norm": 0.5090139483439499, "learning_rate": 4.115673275070392e-06, "loss": 0.1706, "step": 1387 }, { "epoch": 0.3127270679021038, "grad_norm": 0.46111045623593994, "learning_rate": 4.114251230974263e-06, "loss": 0.1633, "step": 1388 }, { "epoch": 0.3129523755879122, "grad_norm": 0.4906796666813092, "learning_rate": 4.1128282905445945e-06, "loss": 0.1788, "step": 1389 }, { "epoch": 0.3131776832737207, "grad_norm": 0.5216823967451348, "learning_rate": 4.1114044545714935e-06, "loss": 0.1779, "step": 1390 }, { "epoch": 0.3134029909595291, "grad_norm": 0.4630187597122319, "learning_rate": 4.1099797238455615e-06, "loss": 0.175, "step": 1391 }, { "epoch": 0.31362829864533753, "grad_norm": 0.4448150220936651, "learning_rate": 4.1085540991579e-06, "loss": 0.1599, "step": 1392 }, { "epoch": 0.31385360633114595, "grad_norm": 0.47815315348196197, "learning_rate": 4.107127581300105e-06, "loss": 0.18, "step": 1393 }, { "epoch": 0.3140789140169544, "grad_norm": 0.4647279014720571, "learning_rate": 4.105700171064267e-06, "loss": 0.1679, "step": 1394 }, { "epoch": 0.31430422170276284, "grad_norm": 0.4755973884828333, "learning_rate": 4.104271869242975e-06, "loss": 0.1613, "step": 1395 }, { "epoch": 0.31452952938857126, "grad_norm": 0.5158289979464339, "learning_rate": 4.102842676629313e-06, "loss": 0.1814, "step": 1396 }, { "epoch": 0.3147548370743797, "grad_norm": 0.4612219449801974, "learning_rate": 4.101412594016855e-06, "loss": 0.1704, "step": 1397 }, { "epoch": 0.31498014476018815, "grad_norm": 0.48199287409838215, "learning_rate": 4.0999816221996755e-06, "loss": 0.157, "step": 1398 }, { "epoch": 0.31520545244599657, "grad_norm": 0.49588565495852915, "learning_rate": 4.098549761972339e-06, "loss": 0.1698, "step": 1399 }, { "epoch": 0.315430760131805, "grad_norm": 0.47443029199155873, "learning_rate": 4.097117014129903e-06, "loss": 0.1705, "step": 1400 }, { "epoch": 0.3156560678176134, "grad_norm": 0.4739011675953338, "learning_rate": 4.095683379467922e-06, "loss": 0.1746, "step": 1401 }, { "epoch": 0.3158813755034219, "grad_norm": 0.5006985353128484, "learning_rate": 4.094248858782436e-06, "loss": 0.1749, "step": 1402 }, { "epoch": 0.3161066831892303, "grad_norm": 0.48832615004361674, "learning_rate": 4.092813452869983e-06, "loss": 0.1748, "step": 1403 }, { "epoch": 0.3163319908750387, "grad_norm": 0.5047270198243387, "learning_rate": 4.091377162527592e-06, "loss": 0.1741, "step": 1404 }, { "epoch": 0.31655729856084713, "grad_norm": 0.48286040196834346, "learning_rate": 4.089939988552778e-06, "loss": 0.1703, "step": 1405 }, { "epoch": 0.3167826062466556, "grad_norm": 0.4772214512873062, "learning_rate": 4.088501931743551e-06, "loss": 0.1844, "step": 1406 }, { "epoch": 0.317007913932464, "grad_norm": 0.4700248977848288, "learning_rate": 4.087062992898413e-06, "loss": 0.1622, "step": 1407 }, { "epoch": 0.31723322161827244, "grad_norm": 0.49138673902246965, "learning_rate": 4.08562317281635e-06, "loss": 0.1733, "step": 1408 }, { "epoch": 0.31745852930408086, "grad_norm": 0.48634809473907503, "learning_rate": 4.084182472296842e-06, "loss": 0.171, "step": 1409 }, { "epoch": 0.31768383698988933, "grad_norm": 0.4931411565996177, "learning_rate": 4.082740892139856e-06, "loss": 0.171, "step": 1410 }, { "epoch": 0.31790914467569775, "grad_norm": 0.49274535939778674, "learning_rate": 4.081298433145847e-06, "loss": 0.1591, "step": 1411 }, { "epoch": 0.31813445236150617, "grad_norm": 0.4698037285578985, "learning_rate": 4.07985509611576e-06, "loss": 0.1667, "step": 1412 }, { "epoch": 0.3183597600473146, "grad_norm": 0.47696407295357496, "learning_rate": 4.078410881851026e-06, "loss": 0.1697, "step": 1413 }, { "epoch": 0.31858506773312306, "grad_norm": 0.4827409751476058, "learning_rate": 4.076965791153562e-06, "loss": 0.1672, "step": 1414 }, { "epoch": 0.3188103754189315, "grad_norm": 0.4841572963600362, "learning_rate": 4.075519824825775e-06, "loss": 0.1804, "step": 1415 }, { "epoch": 0.3190356831047399, "grad_norm": 0.46260361476399064, "learning_rate": 4.074072983670555e-06, "loss": 0.17, "step": 1416 }, { "epoch": 0.3192609907905483, "grad_norm": 0.4828365970593339, "learning_rate": 4.072625268491279e-06, "loss": 0.1716, "step": 1417 }, { "epoch": 0.3194862984763568, "grad_norm": 0.5020748613802973, "learning_rate": 4.071176680091809e-06, "loss": 0.1864, "step": 1418 }, { "epoch": 0.3197116061621652, "grad_norm": 0.46429356070269806, "learning_rate": 4.069727219276493e-06, "loss": 0.1533, "step": 1419 }, { "epoch": 0.3199369138479736, "grad_norm": 0.47253365356597143, "learning_rate": 4.068276886850162e-06, "loss": 0.1539, "step": 1420 }, { "epoch": 0.3201622215337821, "grad_norm": 0.43290691768683814, "learning_rate": 4.066825683618132e-06, "loss": 0.1523, "step": 1421 }, { "epoch": 0.3203875292195905, "grad_norm": 0.5150603139209966, "learning_rate": 4.065373610386201e-06, "loss": 0.1906, "step": 1422 }, { "epoch": 0.32061283690539893, "grad_norm": 0.4565348014998822, "learning_rate": 4.063920667960652e-06, "loss": 0.1645, "step": 1423 }, { "epoch": 0.32083814459120735, "grad_norm": 0.4605096118081446, "learning_rate": 4.06246685714825e-06, "loss": 0.1629, "step": 1424 }, { "epoch": 0.3210634522770158, "grad_norm": 0.4622701306481583, "learning_rate": 4.061012178756242e-06, "loss": 0.1683, "step": 1425 }, { "epoch": 0.32128875996282424, "grad_norm": 0.4859787260584599, "learning_rate": 4.059556633592356e-06, "loss": 0.1594, "step": 1426 }, { "epoch": 0.32151406764863266, "grad_norm": 0.48877370296912087, "learning_rate": 4.058100222464802e-06, "loss": 0.167, "step": 1427 }, { "epoch": 0.3217393753344411, "grad_norm": 0.4627117579487038, "learning_rate": 4.056642946182271e-06, "loss": 0.1526, "step": 1428 }, { "epoch": 0.32196468302024955, "grad_norm": 0.46613918156888673, "learning_rate": 4.0551848055539345e-06, "loss": 0.1653, "step": 1429 }, { "epoch": 0.32218999070605797, "grad_norm": 0.503917741619968, "learning_rate": 4.0537258013894434e-06, "loss": 0.172, "step": 1430 }, { "epoch": 0.3224152983918664, "grad_norm": 0.49156732022138344, "learning_rate": 4.052265934498929e-06, "loss": 0.1743, "step": 1431 }, { "epoch": 0.3226406060776748, "grad_norm": 0.483390765528641, "learning_rate": 4.0508052056929995e-06, "loss": 0.158, "step": 1432 }, { "epoch": 0.3228659137634833, "grad_norm": 0.46973338880450943, "learning_rate": 4.049343615782744e-06, "loss": 0.1647, "step": 1433 }, { "epoch": 0.3230912214492917, "grad_norm": 0.47841122050317536, "learning_rate": 4.047881165579729e-06, "loss": 0.1714, "step": 1434 }, { "epoch": 0.3233165291351001, "grad_norm": 0.4701686511398567, "learning_rate": 4.046417855895999e-06, "loss": 0.1657, "step": 1435 }, { "epoch": 0.32354183682090853, "grad_norm": 0.4860143638189218, "learning_rate": 4.044953687544074e-06, "loss": 0.1715, "step": 1436 }, { "epoch": 0.323767144506717, "grad_norm": 0.45709754277717596, "learning_rate": 4.043488661336953e-06, "loss": 0.1552, "step": 1437 }, { "epoch": 0.3239924521925254, "grad_norm": 0.4679828926968299, "learning_rate": 4.042022778088111e-06, "loss": 0.1598, "step": 1438 }, { "epoch": 0.32421775987833384, "grad_norm": 0.47861779407877914, "learning_rate": 4.0405560386114975e-06, "loss": 0.175, "step": 1439 }, { "epoch": 0.32444306756414226, "grad_norm": 0.47207995414404774, "learning_rate": 4.039088443721538e-06, "loss": 0.1481, "step": 1440 }, { "epoch": 0.32466837524995074, "grad_norm": 0.4701578331333167, "learning_rate": 4.0376199942331335e-06, "loss": 0.1621, "step": 1441 }, { "epoch": 0.32489368293575915, "grad_norm": 0.46083892406658933, "learning_rate": 4.03615069096166e-06, "loss": 0.1662, "step": 1442 }, { "epoch": 0.32511899062156757, "grad_norm": 0.4713751574792367, "learning_rate": 4.034680534722966e-06, "loss": 0.172, "step": 1443 }, { "epoch": 0.325344298307376, "grad_norm": 0.4800419145752851, "learning_rate": 4.033209526333375e-06, "loss": 0.1635, "step": 1444 }, { "epoch": 0.32556960599318446, "grad_norm": 0.5177479216025512, "learning_rate": 4.0317376666096815e-06, "loss": 0.1825, "step": 1445 }, { "epoch": 0.3257949136789929, "grad_norm": 0.507169424745277, "learning_rate": 4.030264956369158e-06, "loss": 0.1634, "step": 1446 }, { "epoch": 0.3260202213648013, "grad_norm": 0.49970993985240464, "learning_rate": 4.028791396429541e-06, "loss": 0.1706, "step": 1447 }, { "epoch": 0.3262455290506097, "grad_norm": 0.48030204853136504, "learning_rate": 4.0273169876090475e-06, "loss": 0.1708, "step": 1448 }, { "epoch": 0.3264708367364182, "grad_norm": 0.4610761076996909, "learning_rate": 4.02584173072636e-06, "loss": 0.158, "step": 1449 }, { "epoch": 0.3266961444222266, "grad_norm": 0.464686959031091, "learning_rate": 4.024365626600632e-06, "loss": 0.1542, "step": 1450 }, { "epoch": 0.326921452108035, "grad_norm": 0.48376124217372546, "learning_rate": 4.022888676051492e-06, "loss": 0.1625, "step": 1451 }, { "epoch": 0.32714675979384344, "grad_norm": 0.4703117306092047, "learning_rate": 4.021410879899035e-06, "loss": 0.1669, "step": 1452 }, { "epoch": 0.3273720674796519, "grad_norm": 0.48567988701291553, "learning_rate": 4.019932238963824e-06, "loss": 0.1627, "step": 1453 }, { "epoch": 0.32759737516546034, "grad_norm": 0.47111752684595165, "learning_rate": 4.018452754066895e-06, "loss": 0.1675, "step": 1454 }, { "epoch": 0.32782268285126875, "grad_norm": 0.46905414134840095, "learning_rate": 4.016972426029751e-06, "loss": 0.163, "step": 1455 }, { "epoch": 0.32804799053707717, "grad_norm": 0.4647820038576828, "learning_rate": 4.015491255674362e-06, "loss": 0.1509, "step": 1456 }, { "epoch": 0.32827329822288565, "grad_norm": 0.48517318183312064, "learning_rate": 4.014009243823167e-06, "loss": 0.178, "step": 1457 }, { "epoch": 0.32849860590869406, "grad_norm": 0.4928940400289568, "learning_rate": 4.012526391299073e-06, "loss": 0.1638, "step": 1458 }, { "epoch": 0.3287239135945025, "grad_norm": 0.48786731867399125, "learning_rate": 4.01104269892545e-06, "loss": 0.1738, "step": 1459 }, { "epoch": 0.3289492212803109, "grad_norm": 0.49032486420130994, "learning_rate": 4.0095581675261405e-06, "loss": 0.1634, "step": 1460 }, { "epoch": 0.3291745289661194, "grad_norm": 0.5219124278013928, "learning_rate": 4.008072797925447e-06, "loss": 0.172, "step": 1461 }, { "epoch": 0.3293998366519278, "grad_norm": 0.45938422120363903, "learning_rate": 4.006586590948141e-06, "loss": 0.1567, "step": 1462 }, { "epoch": 0.3296251443377362, "grad_norm": 0.49674977160069705, "learning_rate": 4.005099547419458e-06, "loss": 0.1807, "step": 1463 }, { "epoch": 0.3298504520235446, "grad_norm": 0.49291053419468084, "learning_rate": 4.003611668165097e-06, "loss": 0.1635, "step": 1464 }, { "epoch": 0.3300757597093531, "grad_norm": 0.47909951524830563, "learning_rate": 4.0021229540112226e-06, "loss": 0.1685, "step": 1465 }, { "epoch": 0.3303010673951615, "grad_norm": 0.4967423779350966, "learning_rate": 4.000633405784461e-06, "loss": 0.1698, "step": 1466 }, { "epoch": 0.33052637508096994, "grad_norm": 0.4947671017226202, "learning_rate": 3.999143024311904e-06, "loss": 0.1705, "step": 1467 }, { "epoch": 0.33075168276677835, "grad_norm": 0.5001281128990965, "learning_rate": 3.997651810421106e-06, "loss": 0.1758, "step": 1468 }, { "epoch": 0.33097699045258683, "grad_norm": 0.47166189147867515, "learning_rate": 3.99615976494008e-06, "loss": 0.1621, "step": 1469 }, { "epoch": 0.33120229813839525, "grad_norm": 0.5142434013369097, "learning_rate": 3.994666888697304e-06, "loss": 0.1845, "step": 1470 }, { "epoch": 0.33142760582420366, "grad_norm": 0.4805539617231105, "learning_rate": 3.993173182521718e-06, "loss": 0.1605, "step": 1471 }, { "epoch": 0.33165291351001214, "grad_norm": 0.5043483938154787, "learning_rate": 3.991678647242719e-06, "loss": 0.175, "step": 1472 }, { "epoch": 0.33187822119582056, "grad_norm": 0.478038959369919, "learning_rate": 3.990183283690169e-06, "loss": 0.1596, "step": 1473 }, { "epoch": 0.332103528881629, "grad_norm": 0.4841686206196171, "learning_rate": 3.988687092694386e-06, "loss": 0.1647, "step": 1474 }, { "epoch": 0.3323288365674374, "grad_norm": 0.4846814128855193, "learning_rate": 3.98719007508615e-06, "loss": 0.1702, "step": 1475 }, { "epoch": 0.33255414425324586, "grad_norm": 0.4858363667117268, "learning_rate": 3.985692231696699e-06, "loss": 0.1689, "step": 1476 }, { "epoch": 0.3327794519390543, "grad_norm": 0.48567860031406734, "learning_rate": 3.98419356335773e-06, "loss": 0.1694, "step": 1477 }, { "epoch": 0.3330047596248627, "grad_norm": 0.49126963786538463, "learning_rate": 3.982694070901396e-06, "loss": 0.1714, "step": 1478 }, { "epoch": 0.3332300673106711, "grad_norm": 0.46435208474415085, "learning_rate": 3.981193755160311e-06, "loss": 0.1632, "step": 1479 }, { "epoch": 0.3334553749964796, "grad_norm": 0.5103818515068653, "learning_rate": 3.979692616967543e-06, "loss": 0.1906, "step": 1480 }, { "epoch": 0.333680682682288, "grad_norm": 0.48145456741346393, "learning_rate": 3.9781906571566195e-06, "loss": 0.1649, "step": 1481 }, { "epoch": 0.33390599036809643, "grad_norm": 0.4853961796628509, "learning_rate": 3.976687876561523e-06, "loss": 0.1774, "step": 1482 }, { "epoch": 0.33413129805390485, "grad_norm": 0.4799122387066231, "learning_rate": 3.975184276016689e-06, "loss": 0.1739, "step": 1483 }, { "epoch": 0.3343566057397133, "grad_norm": 0.46563910497802685, "learning_rate": 3.973679856357014e-06, "loss": 0.1554, "step": 1484 }, { "epoch": 0.33458191342552174, "grad_norm": 0.44840230729722863, "learning_rate": 3.972174618417843e-06, "loss": 0.1588, "step": 1485 }, { "epoch": 0.33480722111133016, "grad_norm": 0.505445345978517, "learning_rate": 3.970668563034982e-06, "loss": 0.1753, "step": 1486 }, { "epoch": 0.3350325287971386, "grad_norm": 0.4995685182105176, "learning_rate": 3.9691616910446845e-06, "loss": 0.1701, "step": 1487 }, { "epoch": 0.33525783648294705, "grad_norm": 0.509006048277001, "learning_rate": 3.967654003283662e-06, "loss": 0.1723, "step": 1488 }, { "epoch": 0.33548314416875546, "grad_norm": 0.5169005973891307, "learning_rate": 3.966145500589076e-06, "loss": 0.1803, "step": 1489 }, { "epoch": 0.3357084518545639, "grad_norm": 0.5079388196082648, "learning_rate": 3.9646361837985435e-06, "loss": 0.1791, "step": 1490 }, { "epoch": 0.3359337595403723, "grad_norm": 0.4560258431578383, "learning_rate": 3.9631260537501304e-06, "loss": 0.1524, "step": 1491 }, { "epoch": 0.3361590672261808, "grad_norm": 0.4766011031285634, "learning_rate": 3.961615111282357e-06, "loss": 0.1781, "step": 1492 }, { "epoch": 0.3363843749119892, "grad_norm": 0.4567234424418434, "learning_rate": 3.960103357234192e-06, "loss": 0.1643, "step": 1493 }, { "epoch": 0.3366096825977976, "grad_norm": 0.49289766811652796, "learning_rate": 3.958590792445057e-06, "loss": 0.1823, "step": 1494 }, { "epoch": 0.33683499028360603, "grad_norm": 0.4848856116585081, "learning_rate": 3.957077417754822e-06, "loss": 0.166, "step": 1495 }, { "epoch": 0.3370602979694145, "grad_norm": 0.4679747882685483, "learning_rate": 3.9555632340038075e-06, "loss": 0.1774, "step": 1496 }, { "epoch": 0.3372856056552229, "grad_norm": 0.48350839858166583, "learning_rate": 3.9540482420327845e-06, "loss": 0.1725, "step": 1497 }, { "epoch": 0.33751091334103134, "grad_norm": 0.4863933008059015, "learning_rate": 3.9525324426829716e-06, "loss": 0.1641, "step": 1498 }, { "epoch": 0.33773622102683976, "grad_norm": 0.46827754368039665, "learning_rate": 3.951015836796034e-06, "loss": 0.1524, "step": 1499 }, { "epoch": 0.33796152871264823, "grad_norm": 0.44169102557800627, "learning_rate": 3.949498425214088e-06, "loss": 0.1513, "step": 1500 }, { "epoch": 0.33796152871264823, "eval_loss": 0.16632719337940216, "eval_runtime": 56.8474, "eval_samples_per_second": 50.486, "eval_steps_per_second": 6.315, "step": 1500 }, { "epoch": 0.33818683639845665, "grad_norm": 0.4878295448211993, "learning_rate": 3.947980208779693e-06, "loss": 0.1653, "step": 1501 }, { "epoch": 0.33841214408426507, "grad_norm": 0.49941504395630504, "learning_rate": 3.946461188335863e-06, "loss": 0.1827, "step": 1502 }, { "epoch": 0.3386374517700735, "grad_norm": 0.4870788561570751, "learning_rate": 3.944941364726049e-06, "loss": 0.1946, "step": 1503 }, { "epoch": 0.33886275945588196, "grad_norm": 0.46386674326836835, "learning_rate": 3.943420738794153e-06, "loss": 0.1597, "step": 1504 }, { "epoch": 0.3390880671416904, "grad_norm": 0.5153906687266364, "learning_rate": 3.941899311384525e-06, "loss": 0.1768, "step": 1505 }, { "epoch": 0.3393133748274988, "grad_norm": 0.527331696836763, "learning_rate": 3.9403770833419535e-06, "loss": 0.1707, "step": 1506 }, { "epoch": 0.3395386825133072, "grad_norm": 0.4549856824962382, "learning_rate": 3.938854055511676e-06, "loss": 0.1529, "step": 1507 }, { "epoch": 0.3397639901991157, "grad_norm": 0.4933487941824393, "learning_rate": 3.937330228739374e-06, "loss": 0.1702, "step": 1508 }, { "epoch": 0.3399892978849241, "grad_norm": 0.4879928825898799, "learning_rate": 3.9358056038711714e-06, "loss": 0.1722, "step": 1509 }, { "epoch": 0.3402146055707325, "grad_norm": 0.4636410414321768, "learning_rate": 3.934280181753634e-06, "loss": 0.1651, "step": 1510 }, { "epoch": 0.34043991325654094, "grad_norm": 0.45711139575548393, "learning_rate": 3.932753963233773e-06, "loss": 0.1463, "step": 1511 }, { "epoch": 0.3406652209423494, "grad_norm": 0.46149943138988064, "learning_rate": 3.931226949159041e-06, "loss": 0.1645, "step": 1512 }, { "epoch": 0.34089052862815783, "grad_norm": 0.48167999322238986, "learning_rate": 3.9296991403773325e-06, "loss": 0.1666, "step": 1513 }, { "epoch": 0.34111583631396625, "grad_norm": 0.4665831445664159, "learning_rate": 3.9281705377369814e-06, "loss": 0.149, "step": 1514 }, { "epoch": 0.34134114399977467, "grad_norm": 0.45575953209532205, "learning_rate": 3.9266411420867635e-06, "loss": 0.147, "step": 1515 }, { "epoch": 0.34156645168558314, "grad_norm": 0.5058205173300985, "learning_rate": 3.925110954275897e-06, "loss": 0.1681, "step": 1516 }, { "epoch": 0.34179175937139156, "grad_norm": 0.4522501093222901, "learning_rate": 3.923579975154037e-06, "loss": 0.1587, "step": 1517 }, { "epoch": 0.3420170670572, "grad_norm": 0.479996247672822, "learning_rate": 3.922048205571279e-06, "loss": 0.1718, "step": 1518 }, { "epoch": 0.34224237474300845, "grad_norm": 0.4954333978709633, "learning_rate": 3.920515646378159e-06, "loss": 0.1648, "step": 1519 }, { "epoch": 0.34246768242881687, "grad_norm": 0.4822288274252376, "learning_rate": 3.918982298425647e-06, "loss": 0.1799, "step": 1520 }, { "epoch": 0.3426929901146253, "grad_norm": 0.46648553853872377, "learning_rate": 3.917448162565157e-06, "loss": 0.1433, "step": 1521 }, { "epoch": 0.3429182978004337, "grad_norm": 0.47922896446110114, "learning_rate": 3.915913239648535e-06, "loss": 0.1742, "step": 1522 }, { "epoch": 0.3431436054862422, "grad_norm": 0.4812863396173565, "learning_rate": 3.91437753052807e-06, "loss": 0.1707, "step": 1523 }, { "epoch": 0.3433689131720506, "grad_norm": 0.45912938511177226, "learning_rate": 3.91284103605648e-06, "loss": 0.1573, "step": 1524 }, { "epoch": 0.343594220857859, "grad_norm": 0.4950628079069977, "learning_rate": 3.911303757086925e-06, "loss": 0.1679, "step": 1525 }, { "epoch": 0.34381952854366743, "grad_norm": 0.4467689502898321, "learning_rate": 3.909765694473e-06, "loss": 0.1597, "step": 1526 }, { "epoch": 0.3440448362294759, "grad_norm": 0.5129219990156939, "learning_rate": 3.908226849068731e-06, "loss": 0.1661, "step": 1527 }, { "epoch": 0.3442701439152843, "grad_norm": 0.4551952170221029, "learning_rate": 3.906687221728583e-06, "loss": 0.1643, "step": 1528 }, { "epoch": 0.34449545160109274, "grad_norm": 0.47170480347901833, "learning_rate": 3.905146813307455e-06, "loss": 0.1611, "step": 1529 }, { "epoch": 0.34472075928690116, "grad_norm": 0.48001748868857014, "learning_rate": 3.903605624660676e-06, "loss": 0.1612, "step": 1530 }, { "epoch": 0.34494606697270963, "grad_norm": 0.4748628275155922, "learning_rate": 3.902063656644012e-06, "loss": 0.1631, "step": 1531 }, { "epoch": 0.34517137465851805, "grad_norm": 0.5139859239848883, "learning_rate": 3.900520910113659e-06, "loss": 0.1914, "step": 1532 }, { "epoch": 0.34539668234432647, "grad_norm": 0.4878759796792643, "learning_rate": 3.898977385926249e-06, "loss": 0.1718, "step": 1533 }, { "epoch": 0.3456219900301349, "grad_norm": 0.4716708640496516, "learning_rate": 3.897433084938841e-06, "loss": 0.1526, "step": 1534 }, { "epoch": 0.34584729771594336, "grad_norm": 0.46184841826766815, "learning_rate": 3.895888008008929e-06, "loss": 0.1654, "step": 1535 }, { "epoch": 0.3460726054017518, "grad_norm": 0.5164135232993046, "learning_rate": 3.894342155994437e-06, "loss": 0.1811, "step": 1536 }, { "epoch": 0.3462979130875602, "grad_norm": 0.48767451819098034, "learning_rate": 3.892795529753718e-06, "loss": 0.1667, "step": 1537 }, { "epoch": 0.3465232207733686, "grad_norm": 0.4712763769177538, "learning_rate": 3.891248130145556e-06, "loss": 0.1625, "step": 1538 }, { "epoch": 0.3467485284591771, "grad_norm": 0.4849166281449554, "learning_rate": 3.889699958029166e-06, "loss": 0.1593, "step": 1539 }, { "epoch": 0.3469738361449855, "grad_norm": 0.4780505339438159, "learning_rate": 3.888151014264189e-06, "loss": 0.1652, "step": 1540 }, { "epoch": 0.3471991438307939, "grad_norm": 0.4459016280706361, "learning_rate": 3.886601299710694e-06, "loss": 0.1517, "step": 1541 }, { "epoch": 0.34742445151660234, "grad_norm": 0.45591641194578175, "learning_rate": 3.885050815229182e-06, "loss": 0.1511, "step": 1542 }, { "epoch": 0.3476497592024108, "grad_norm": 0.4832877784381766, "learning_rate": 3.88349956168058e-06, "loss": 0.1684, "step": 1543 }, { "epoch": 0.34787506688821923, "grad_norm": 0.4986224178690113, "learning_rate": 3.881947539926239e-06, "loss": 0.1691, "step": 1544 }, { "epoch": 0.34810037457402765, "grad_norm": 0.4775733324064565, "learning_rate": 3.880394750827939e-06, "loss": 0.1615, "step": 1545 }, { "epoch": 0.34832568225983607, "grad_norm": 0.4858017668488311, "learning_rate": 3.878841195247888e-06, "loss": 0.1811, "step": 1546 }, { "epoch": 0.34855098994564454, "grad_norm": 0.48153045793870897, "learning_rate": 3.877286874048716e-06, "loss": 0.1718, "step": 1547 }, { "epoch": 0.34877629763145296, "grad_norm": 0.512083637621429, "learning_rate": 3.875731788093478e-06, "loss": 0.1759, "step": 1548 }, { "epoch": 0.3490016053172614, "grad_norm": 0.47558354766986316, "learning_rate": 3.874175938245659e-06, "loss": 0.1611, "step": 1549 }, { "epoch": 0.3492269130030698, "grad_norm": 0.47594444437190775, "learning_rate": 3.872619325369162e-06, "loss": 0.1642, "step": 1550 }, { "epoch": 0.34945222068887827, "grad_norm": 0.49698776261142624, "learning_rate": 3.871061950328317e-06, "loss": 0.1647, "step": 1551 }, { "epoch": 0.3496775283746867, "grad_norm": 0.5120833541363724, "learning_rate": 3.869503813987876e-06, "loss": 0.1654, "step": 1552 }, { "epoch": 0.3499028360604951, "grad_norm": 0.5070864621540254, "learning_rate": 3.867944917213014e-06, "loss": 0.1703, "step": 1553 }, { "epoch": 0.3501281437463035, "grad_norm": 0.45295202225033465, "learning_rate": 3.866385260869327e-06, "loss": 0.1448, "step": 1554 }, { "epoch": 0.350353451432112, "grad_norm": 0.4918072139538417, "learning_rate": 3.864824845822837e-06, "loss": 0.1721, "step": 1555 }, { "epoch": 0.3505787591179204, "grad_norm": 0.5257156167287909, "learning_rate": 3.8632636729399815e-06, "loss": 0.1758, "step": 1556 }, { "epoch": 0.35080406680372883, "grad_norm": 0.4680727482304966, "learning_rate": 3.861701743087622e-06, "loss": 0.164, "step": 1557 }, { "epoch": 0.35102937448953725, "grad_norm": 0.4875956186422521, "learning_rate": 3.860139057133042e-06, "loss": 0.1768, "step": 1558 }, { "epoch": 0.3512546821753457, "grad_norm": 0.47233606422385976, "learning_rate": 3.858575615943941e-06, "loss": 0.1698, "step": 1559 }, { "epoch": 0.35147998986115414, "grad_norm": 0.5025916927692134, "learning_rate": 3.85701142038844e-06, "loss": 0.1854, "step": 1560 }, { "epoch": 0.35170529754696256, "grad_norm": 0.4746547374579445, "learning_rate": 3.855446471335078e-06, "loss": 0.1572, "step": 1561 }, { "epoch": 0.351930605232771, "grad_norm": 0.4794744494656748, "learning_rate": 3.853880769652815e-06, "loss": 0.1691, "step": 1562 }, { "epoch": 0.35215591291857945, "grad_norm": 0.4843099574697001, "learning_rate": 3.852314316211023e-06, "loss": 0.1672, "step": 1563 }, { "epoch": 0.35238122060438787, "grad_norm": 0.48546921942948973, "learning_rate": 3.850747111879499e-06, "loss": 0.1786, "step": 1564 }, { "epoch": 0.3526065282901963, "grad_norm": 0.45578063099662863, "learning_rate": 3.84917915752845e-06, "loss": 0.1571, "step": 1565 }, { "epoch": 0.3528318359760047, "grad_norm": 0.469326755886643, "learning_rate": 3.8476104540285054e-06, "loss": 0.1652, "step": 1566 }, { "epoch": 0.3530571436618132, "grad_norm": 0.533642189695708, "learning_rate": 3.846041002250705e-06, "loss": 0.1787, "step": 1567 }, { "epoch": 0.3532824513476216, "grad_norm": 0.4602243037230457, "learning_rate": 3.84447080306651e-06, "loss": 0.1672, "step": 1568 }, { "epoch": 0.35350775903343, "grad_norm": 0.4914227338324527, "learning_rate": 3.842899857347792e-06, "loss": 0.1731, "step": 1569 }, { "epoch": 0.3537330667192385, "grad_norm": 0.4528175947193335, "learning_rate": 3.841328165966837e-06, "loss": 0.1582, "step": 1570 }, { "epoch": 0.3539583744050469, "grad_norm": 0.5043477043072159, "learning_rate": 3.839755729796349e-06, "loss": 0.1756, "step": 1571 }, { "epoch": 0.3541836820908553, "grad_norm": 0.45252198647800007, "learning_rate": 3.838182549709442e-06, "loss": 0.1542, "step": 1572 }, { "epoch": 0.35440898977666374, "grad_norm": 0.5153655969655626, "learning_rate": 3.8366086265796445e-06, "loss": 0.1788, "step": 1573 }, { "epoch": 0.3546342974624722, "grad_norm": 0.46848812709265947, "learning_rate": 3.835033961280898e-06, "loss": 0.1608, "step": 1574 }, { "epoch": 0.35485960514828063, "grad_norm": 0.49700435600318305, "learning_rate": 3.8334585546875544e-06, "loss": 0.1628, "step": 1575 }, { "epoch": 0.35508491283408905, "grad_norm": 0.4515771767768039, "learning_rate": 3.831882407674379e-06, "loss": 0.1454, "step": 1576 }, { "epoch": 0.35531022051989747, "grad_norm": 0.46810553468047617, "learning_rate": 3.830305521116546e-06, "loss": 0.1479, "step": 1577 }, { "epoch": 0.35553552820570594, "grad_norm": 0.5185799810546041, "learning_rate": 3.828727895889644e-06, "loss": 0.1867, "step": 1578 }, { "epoch": 0.35576083589151436, "grad_norm": 0.4648477693043239, "learning_rate": 3.827149532869668e-06, "loss": 0.1632, "step": 1579 }, { "epoch": 0.3559861435773228, "grad_norm": 0.548001166651646, "learning_rate": 3.825570432933026e-06, "loss": 0.1737, "step": 1580 }, { "epoch": 0.3562114512631312, "grad_norm": 0.5151582597577937, "learning_rate": 3.823990596956531e-06, "loss": 0.1665, "step": 1581 }, { "epoch": 0.35643675894893967, "grad_norm": 0.46553401719297743, "learning_rate": 3.8224100258174066e-06, "loss": 0.1617, "step": 1582 }, { "epoch": 0.3566620666347481, "grad_norm": 0.48246926237586046, "learning_rate": 3.820828720393287e-06, "loss": 0.1705, "step": 1583 }, { "epoch": 0.3568873743205565, "grad_norm": 0.4609227338672053, "learning_rate": 3.819246681562212e-06, "loss": 0.1542, "step": 1584 }, { "epoch": 0.3571126820063649, "grad_norm": 0.4806312081628679, "learning_rate": 3.817663910202628e-06, "loss": 0.1545, "step": 1585 }, { "epoch": 0.3573379896921734, "grad_norm": 0.4522822568915773, "learning_rate": 3.81608040719339e-06, "loss": 0.1584, "step": 1586 }, { "epoch": 0.3575632973779818, "grad_norm": 0.4831632255864022, "learning_rate": 3.8144961734137566e-06, "loss": 0.1617, "step": 1587 }, { "epoch": 0.35778860506379023, "grad_norm": 0.46465414372602604, "learning_rate": 3.812911209743395e-06, "loss": 0.1448, "step": 1588 }, { "epoch": 0.35801391274959865, "grad_norm": 0.47415748028404553, "learning_rate": 3.8113255170623763e-06, "loss": 0.1504, "step": 1589 }, { "epoch": 0.3582392204354071, "grad_norm": 0.49482383184695794, "learning_rate": 3.809739096251176e-06, "loss": 0.1716, "step": 1590 }, { "epoch": 0.35846452812121554, "grad_norm": 0.4760533886621652, "learning_rate": 3.8081519481906747e-06, "loss": 0.1712, "step": 1591 }, { "epoch": 0.35868983580702396, "grad_norm": 0.4693732270317497, "learning_rate": 3.8065640737621566e-06, "loss": 0.1579, "step": 1592 }, { "epoch": 0.3589151434928324, "grad_norm": 0.49273079763166744, "learning_rate": 3.804975473847309e-06, "loss": 0.1643, "step": 1593 }, { "epoch": 0.35914045117864085, "grad_norm": 0.48411925790937316, "learning_rate": 3.803386149328223e-06, "loss": 0.1762, "step": 1594 }, { "epoch": 0.35936575886444927, "grad_norm": 0.5103521369943442, "learning_rate": 3.8017961010873904e-06, "loss": 0.1633, "step": 1595 }, { "epoch": 0.3595910665502577, "grad_norm": 0.4853884510594153, "learning_rate": 3.8002053300077056e-06, "loss": 0.1827, "step": 1596 }, { "epoch": 0.3598163742360661, "grad_norm": 0.48135448571531514, "learning_rate": 3.7986138369724664e-06, "loss": 0.166, "step": 1597 }, { "epoch": 0.3600416819218746, "grad_norm": 0.5077422631864816, "learning_rate": 3.7970216228653667e-06, "loss": 0.164, "step": 1598 }, { "epoch": 0.360266989607683, "grad_norm": 0.4604975406803662, "learning_rate": 3.795428688570505e-06, "loss": 0.1577, "step": 1599 }, { "epoch": 0.3604922972934914, "grad_norm": 0.5169319139843352, "learning_rate": 3.7938350349723784e-06, "loss": 0.1734, "step": 1600 }, { "epoch": 0.36071760497929983, "grad_norm": 0.48315257961110103, "learning_rate": 3.792240662955884e-06, "loss": 0.1683, "step": 1601 }, { "epoch": 0.3609429126651083, "grad_norm": 0.4443054345156236, "learning_rate": 3.7906455734063156e-06, "loss": 0.1583, "step": 1602 }, { "epoch": 0.3611682203509167, "grad_norm": 0.5015309298690244, "learning_rate": 3.7890497672093686e-06, "loss": 0.1759, "step": 1603 }, { "epoch": 0.36139352803672514, "grad_norm": 0.4893223920945183, "learning_rate": 3.7874532452511324e-06, "loss": 0.1594, "step": 1604 }, { "epoch": 0.36161883572253356, "grad_norm": 0.4687661324719426, "learning_rate": 3.785856008418099e-06, "loss": 0.1646, "step": 1605 }, { "epoch": 0.36184414340834203, "grad_norm": 0.4862531688588126, "learning_rate": 3.7842580575971533e-06, "loss": 0.1724, "step": 1606 }, { "epoch": 0.36206945109415045, "grad_norm": 0.508026435849886, "learning_rate": 3.782659393675577e-06, "loss": 0.1673, "step": 1607 }, { "epoch": 0.36229475877995887, "grad_norm": 0.4917181311635296, "learning_rate": 3.7810600175410493e-06, "loss": 0.1503, "step": 1608 }, { "epoch": 0.3625200664657673, "grad_norm": 0.47047467772282814, "learning_rate": 3.7794599300816435e-06, "loss": 0.1601, "step": 1609 }, { "epoch": 0.36274537415157576, "grad_norm": 0.48120667803642914, "learning_rate": 3.77785913218583e-06, "loss": 0.1767, "step": 1610 }, { "epoch": 0.3629706818373842, "grad_norm": 0.47399195684134654, "learning_rate": 3.7762576247424707e-06, "loss": 0.1625, "step": 1611 }, { "epoch": 0.3631959895231926, "grad_norm": 0.4634113964733802, "learning_rate": 3.7746554086408245e-06, "loss": 0.1531, "step": 1612 }, { "epoch": 0.363421297209001, "grad_norm": 0.442021039632381, "learning_rate": 3.7730524847705407e-06, "loss": 0.1638, "step": 1613 }, { "epoch": 0.3636466048948095, "grad_norm": 0.48085227932801655, "learning_rate": 3.7714488540216637e-06, "loss": 0.1649, "step": 1614 }, { "epoch": 0.3638719125806179, "grad_norm": 0.4517008974302194, "learning_rate": 3.7698445172846305e-06, "loss": 0.153, "step": 1615 }, { "epoch": 0.3640972202664263, "grad_norm": 0.4824941635671589, "learning_rate": 3.7682394754502687e-06, "loss": 0.168, "step": 1616 }, { "epoch": 0.3643225279522348, "grad_norm": 0.481444912322331, "learning_rate": 3.7666337294097987e-06, "loss": 0.1614, "step": 1617 }, { "epoch": 0.3645478356380432, "grad_norm": 0.45048559828104257, "learning_rate": 3.7650272800548316e-06, "loss": 0.1507, "step": 1618 }, { "epoch": 0.36477314332385163, "grad_norm": 0.4380402419214886, "learning_rate": 3.7634201282773673e-06, "loss": 0.1479, "step": 1619 }, { "epoch": 0.36499845100966005, "grad_norm": 0.4694987763481897, "learning_rate": 3.7618122749697993e-06, "loss": 0.1493, "step": 1620 }, { "epoch": 0.3652237586954685, "grad_norm": 0.49229010846473126, "learning_rate": 3.7602037210249077e-06, "loss": 0.166, "step": 1621 }, { "epoch": 0.36544906638127694, "grad_norm": 0.4732508852634151, "learning_rate": 3.7585944673358632e-06, "loss": 0.158, "step": 1622 }, { "epoch": 0.36567437406708536, "grad_norm": 0.4922730105742576, "learning_rate": 3.756984514796224e-06, "loss": 0.1658, "step": 1623 }, { "epoch": 0.3658996817528938, "grad_norm": 0.480465721750276, "learning_rate": 3.7553738642999354e-06, "loss": 0.1629, "step": 1624 }, { "epoch": 0.36612498943870225, "grad_norm": 0.4555500517432691, "learning_rate": 3.753762516741333e-06, "loss": 0.16, "step": 1625 }, { "epoch": 0.36635029712451067, "grad_norm": 0.46896943858709556, "learning_rate": 3.7521504730151382e-06, "loss": 0.1684, "step": 1626 }, { "epoch": 0.3665756048103191, "grad_norm": 0.4746436027474732, "learning_rate": 3.7505377340164585e-06, "loss": 0.1634, "step": 1627 }, { "epoch": 0.3668009124961275, "grad_norm": 0.49150528576567154, "learning_rate": 3.748924300640787e-06, "loss": 0.1754, "step": 1628 }, { "epoch": 0.367026220181936, "grad_norm": 0.4800251870123433, "learning_rate": 3.747310173784004e-06, "loss": 0.161, "step": 1629 }, { "epoch": 0.3672515278677444, "grad_norm": 0.4886368625564826, "learning_rate": 3.745695354342374e-06, "loss": 0.173, "step": 1630 }, { "epoch": 0.3674768355535528, "grad_norm": 0.4710407314033299, "learning_rate": 3.7440798432125452e-06, "loss": 0.1496, "step": 1631 }, { "epoch": 0.36770214323936123, "grad_norm": 0.44396957481507243, "learning_rate": 3.742463641291552e-06, "loss": 0.1489, "step": 1632 }, { "epoch": 0.3679274509251697, "grad_norm": 0.4495509640753466, "learning_rate": 3.7408467494768104e-06, "loss": 0.1433, "step": 1633 }, { "epoch": 0.3681527586109781, "grad_norm": 0.4911614375848097, "learning_rate": 3.73922916866612e-06, "loss": 0.1711, "step": 1634 }, { "epoch": 0.36837806629678654, "grad_norm": 0.4925175016344445, "learning_rate": 3.7376108997576628e-06, "loss": 0.1625, "step": 1635 }, { "epoch": 0.36860337398259496, "grad_norm": 0.47304790543096314, "learning_rate": 3.7359919436500038e-06, "loss": 0.1722, "step": 1636 }, { "epoch": 0.36882868166840344, "grad_norm": 0.5227935062806247, "learning_rate": 3.7343723012420884e-06, "loss": 0.1708, "step": 1637 }, { "epoch": 0.36905398935421185, "grad_norm": 0.43706210714839444, "learning_rate": 3.7327519734332453e-06, "loss": 0.1516, "step": 1638 }, { "epoch": 0.36927929704002027, "grad_norm": 0.4483713856914514, "learning_rate": 3.73113096112318e-06, "loss": 0.1591, "step": 1639 }, { "epoch": 0.3695046047258287, "grad_norm": 0.44984030973045785, "learning_rate": 3.7295092652119815e-06, "loss": 0.1574, "step": 1640 }, { "epoch": 0.36972991241163716, "grad_norm": 0.45693541394796433, "learning_rate": 3.7278868866001165e-06, "loss": 0.161, "step": 1641 }, { "epoch": 0.3699552200974456, "grad_norm": 0.46348952374315455, "learning_rate": 3.726263826188432e-06, "loss": 0.1613, "step": 1642 }, { "epoch": 0.370180527783254, "grad_norm": 0.46604172277835615, "learning_rate": 3.724640084878153e-06, "loss": 0.1584, "step": 1643 }, { "epoch": 0.3704058354690624, "grad_norm": 0.4853993400922052, "learning_rate": 3.7230156635708815e-06, "loss": 0.1699, "step": 1644 }, { "epoch": 0.3706311431548709, "grad_norm": 0.441528425970984, "learning_rate": 3.7213905631685988e-06, "loss": 0.1547, "step": 1645 }, { "epoch": 0.3708564508406793, "grad_norm": 0.48827412915196455, "learning_rate": 3.7197647845736616e-06, "loss": 0.1526, "step": 1646 }, { "epoch": 0.3710817585264877, "grad_norm": 0.4670003645595098, "learning_rate": 3.7181383286888056e-06, "loss": 0.1532, "step": 1647 }, { "epoch": 0.37130706621229614, "grad_norm": 0.46132477401273203, "learning_rate": 3.7165111964171407e-06, "loss": 0.156, "step": 1648 }, { "epoch": 0.3715323738981046, "grad_norm": 0.5060266476097205, "learning_rate": 3.714883388662153e-06, "loss": 0.1696, "step": 1649 }, { "epoch": 0.37175768158391304, "grad_norm": 0.4444436379093003, "learning_rate": 3.7132549063277033e-06, "loss": 0.1577, "step": 1650 }, { "epoch": 0.37198298926972145, "grad_norm": 0.4985865793993255, "learning_rate": 3.711625750318026e-06, "loss": 0.1807, "step": 1651 }, { "epoch": 0.37220829695552987, "grad_norm": 0.45732286320521387, "learning_rate": 3.7099959215377325e-06, "loss": 0.1605, "step": 1652 }, { "epoch": 0.37243360464133834, "grad_norm": 0.4660657647459733, "learning_rate": 3.7083654208918044e-06, "loss": 0.1546, "step": 1653 }, { "epoch": 0.37265891232714676, "grad_norm": 0.44304367341123163, "learning_rate": 3.7067342492855997e-06, "loss": 0.1521, "step": 1654 }, { "epoch": 0.3728842200129552, "grad_norm": 0.47238935736242904, "learning_rate": 3.7051024076248455e-06, "loss": 0.1609, "step": 1655 }, { "epoch": 0.3731095276987636, "grad_norm": 0.4508232710567462, "learning_rate": 3.7034698968156434e-06, "loss": 0.1495, "step": 1656 }, { "epoch": 0.3733348353845721, "grad_norm": 0.5187504962903139, "learning_rate": 3.7018367177644654e-06, "loss": 0.1969, "step": 1657 }, { "epoch": 0.3735601430703805, "grad_norm": 0.4859885855039429, "learning_rate": 3.700202871378156e-06, "loss": 0.1691, "step": 1658 }, { "epoch": 0.3737854507561889, "grad_norm": 0.48555317612448284, "learning_rate": 3.698568358563928e-06, "loss": 0.1618, "step": 1659 }, { "epoch": 0.3740107584419973, "grad_norm": 0.471500260609219, "learning_rate": 3.696933180229366e-06, "loss": 0.1617, "step": 1660 }, { "epoch": 0.3742360661278058, "grad_norm": 0.5179863634705896, "learning_rate": 3.6952973372824236e-06, "loss": 0.1738, "step": 1661 }, { "epoch": 0.3744613738136142, "grad_norm": 0.46447876453674614, "learning_rate": 3.6936608306314227e-06, "loss": 0.1505, "step": 1662 }, { "epoch": 0.37468668149942264, "grad_norm": 0.4636206922712721, "learning_rate": 3.6920236611850557e-06, "loss": 0.1432, "step": 1663 }, { "epoch": 0.37491198918523105, "grad_norm": 0.49137205139355977, "learning_rate": 3.690385829852381e-06, "loss": 0.1682, "step": 1664 }, { "epoch": 0.3751372968710395, "grad_norm": 0.478847300739953, "learning_rate": 3.6887473375428257e-06, "loss": 0.171, "step": 1665 }, { "epoch": 0.37536260455684795, "grad_norm": 0.5097285105087138, "learning_rate": 3.6871081851661825e-06, "loss": 0.172, "step": 1666 }, { "epoch": 0.37558791224265636, "grad_norm": 0.4606989176372907, "learning_rate": 3.685468373632613e-06, "loss": 0.1584, "step": 1667 }, { "epoch": 0.37581321992846484, "grad_norm": 0.47821792660078855, "learning_rate": 3.6838279038526427e-06, "loss": 0.1621, "step": 1668 }, { "epoch": 0.37603852761427325, "grad_norm": 0.49790475159273734, "learning_rate": 3.6821867767371634e-06, "loss": 0.1671, "step": 1669 }, { "epoch": 0.3762638353000817, "grad_norm": 0.45791610679822975, "learning_rate": 3.6805449931974313e-06, "loss": 0.1603, "step": 1670 }, { "epoch": 0.3764891429858901, "grad_norm": 0.4993384427798507, "learning_rate": 3.6789025541450686e-06, "loss": 0.1617, "step": 1671 }, { "epoch": 0.37671445067169856, "grad_norm": 0.4810348992142745, "learning_rate": 3.67725946049206e-06, "loss": 0.1598, "step": 1672 }, { "epoch": 0.376939758357507, "grad_norm": 0.5228990391190075, "learning_rate": 3.675615713150754e-06, "loss": 0.1771, "step": 1673 }, { "epoch": 0.3771650660433154, "grad_norm": 0.4933808144229279, "learning_rate": 3.6739713130338617e-06, "loss": 0.1766, "step": 1674 }, { "epoch": 0.3773903737291238, "grad_norm": 0.49129051234537585, "learning_rate": 3.6723262610544586e-06, "loss": 0.1753, "step": 1675 }, { "epoch": 0.3776156814149323, "grad_norm": 0.4765726245594172, "learning_rate": 3.6706805581259807e-06, "loss": 0.1608, "step": 1676 }, { "epoch": 0.3778409891007407, "grad_norm": 0.48126788082063077, "learning_rate": 3.669034205162224e-06, "loss": 0.1618, "step": 1677 }, { "epoch": 0.3780662967865491, "grad_norm": 0.4743792209420733, "learning_rate": 3.6673872030773473e-06, "loss": 0.1589, "step": 1678 }, { "epoch": 0.37829160447235755, "grad_norm": 0.45737859410066595, "learning_rate": 3.66573955278587e-06, "loss": 0.1537, "step": 1679 }, { "epoch": 0.378516912158166, "grad_norm": 0.47694611648098956, "learning_rate": 3.664091255202672e-06, "loss": 0.1623, "step": 1680 }, { "epoch": 0.37874221984397444, "grad_norm": 0.4972321650519721, "learning_rate": 3.662442311242989e-06, "loss": 0.17, "step": 1681 }, { "epoch": 0.37896752752978285, "grad_norm": 0.49956960858663274, "learning_rate": 3.66079272182242e-06, "loss": 0.1743, "step": 1682 }, { "epoch": 0.3791928352155913, "grad_norm": 0.4628531179808592, "learning_rate": 3.6591424878569203e-06, "loss": 0.1615, "step": 1683 }, { "epoch": 0.37941814290139975, "grad_norm": 0.506047314167328, "learning_rate": 3.657491610262802e-06, "loss": 0.1785, "step": 1684 }, { "epoch": 0.37964345058720816, "grad_norm": 0.4616821462609864, "learning_rate": 3.655840089956738e-06, "loss": 0.1603, "step": 1685 }, { "epoch": 0.3798687582730166, "grad_norm": 0.44081782340460696, "learning_rate": 3.654187927855754e-06, "loss": 0.156, "step": 1686 }, { "epoch": 0.380094065958825, "grad_norm": 0.465733362020006, "learning_rate": 3.6525351248772357e-06, "loss": 0.1438, "step": 1687 }, { "epoch": 0.3803193736446335, "grad_norm": 0.49319990988355183, "learning_rate": 3.6508816819389216e-06, "loss": 0.1655, "step": 1688 }, { "epoch": 0.3805446813304419, "grad_norm": 0.47847792905995146, "learning_rate": 3.6492275999589065e-06, "loss": 0.1537, "step": 1689 }, { "epoch": 0.3807699890162503, "grad_norm": 0.4929434149921711, "learning_rate": 3.6475728798556426e-06, "loss": 0.1675, "step": 1690 }, { "epoch": 0.38099529670205873, "grad_norm": 0.48998776304248226, "learning_rate": 3.645917522547933e-06, "loss": 0.1749, "step": 1691 }, { "epoch": 0.3812206043878672, "grad_norm": 0.4906889531338348, "learning_rate": 3.6442615289549354e-06, "loss": 0.1623, "step": 1692 }, { "epoch": 0.3814459120736756, "grad_norm": 0.5004684870879217, "learning_rate": 3.6426048999961626e-06, "loss": 0.171, "step": 1693 }, { "epoch": 0.38167121975948404, "grad_norm": 0.4722242162068364, "learning_rate": 3.6409476365914786e-06, "loss": 0.1591, "step": 1694 }, { "epoch": 0.38189652744529246, "grad_norm": 0.5011760714533439, "learning_rate": 3.6392897396610992e-06, "loss": 0.1753, "step": 1695 }, { "epoch": 0.38212183513110093, "grad_norm": 0.47823957791904037, "learning_rate": 3.6376312101255934e-06, "loss": 0.1552, "step": 1696 }, { "epoch": 0.38234714281690935, "grad_norm": 0.48094097708510114, "learning_rate": 3.6359720489058804e-06, "loss": 0.1586, "step": 1697 }, { "epoch": 0.38257245050271776, "grad_norm": 0.485064759706226, "learning_rate": 3.6343122569232313e-06, "loss": 0.1708, "step": 1698 }, { "epoch": 0.3827977581885262, "grad_norm": 0.4724466085894854, "learning_rate": 3.6326518350992657e-06, "loss": 0.149, "step": 1699 }, { "epoch": 0.38302306587433466, "grad_norm": 0.48086104377028743, "learning_rate": 3.6309907843559542e-06, "loss": 0.1575, "step": 1700 }, { "epoch": 0.3832483735601431, "grad_norm": 0.4589335602870298, "learning_rate": 3.6293291056156178e-06, "loss": 0.1553, "step": 1701 }, { "epoch": 0.3834736812459515, "grad_norm": 0.49392644134888775, "learning_rate": 3.6276667998009242e-06, "loss": 0.1605, "step": 1702 }, { "epoch": 0.3836989889317599, "grad_norm": 0.49623607357562033, "learning_rate": 3.626003867834888e-06, "loss": 0.1715, "step": 1703 }, { "epoch": 0.3839242966175684, "grad_norm": 0.5028003735238179, "learning_rate": 3.624340310640875e-06, "loss": 0.1715, "step": 1704 }, { "epoch": 0.3841496043033768, "grad_norm": 0.5246279321726527, "learning_rate": 3.6226761291425956e-06, "loss": 0.1677, "step": 1705 }, { "epoch": 0.3843749119891852, "grad_norm": 0.4714051843773443, "learning_rate": 3.621011324264109e-06, "loss": 0.1664, "step": 1706 }, { "epoch": 0.38460021967499364, "grad_norm": 0.47138163904110986, "learning_rate": 3.6193458969298184e-06, "loss": 0.1764, "step": 1707 }, { "epoch": 0.3848255273608021, "grad_norm": 0.4923417840544699, "learning_rate": 3.617679848064474e-06, "loss": 0.1597, "step": 1708 }, { "epoch": 0.38505083504661053, "grad_norm": 0.4649501594809374, "learning_rate": 3.6160131785931695e-06, "loss": 0.158, "step": 1709 }, { "epoch": 0.38527614273241895, "grad_norm": 0.4682784221008982, "learning_rate": 3.6143458894413463e-06, "loss": 0.1598, "step": 1710 }, { "epoch": 0.38550145041822736, "grad_norm": 0.4559827677564158, "learning_rate": 3.6126779815347863e-06, "loss": 0.1559, "step": 1711 }, { "epoch": 0.38572675810403584, "grad_norm": 0.4422578569156396, "learning_rate": 3.611009455799617e-06, "loss": 0.1509, "step": 1712 }, { "epoch": 0.38595206578984426, "grad_norm": 0.4437396743103725, "learning_rate": 3.609340313162309e-06, "loss": 0.147, "step": 1713 }, { "epoch": 0.3861773734756527, "grad_norm": 0.4720363562053173, "learning_rate": 3.6076705545496743e-06, "loss": 0.1727, "step": 1714 }, { "epoch": 0.38640268116146115, "grad_norm": 0.4443211995923577, "learning_rate": 3.606000180888868e-06, "loss": 0.1467, "step": 1715 }, { "epoch": 0.38662798884726957, "grad_norm": 0.47935065592719434, "learning_rate": 3.604329193107386e-06, "loss": 0.1543, "step": 1716 }, { "epoch": 0.386853296533078, "grad_norm": 0.5092502507366087, "learning_rate": 3.6026575921330665e-06, "loss": 0.17, "step": 1717 }, { "epoch": 0.3870786042188864, "grad_norm": 0.48204778905856294, "learning_rate": 3.600985378894086e-06, "loss": 0.1628, "step": 1718 }, { "epoch": 0.3873039119046949, "grad_norm": 0.48203224699763203, "learning_rate": 3.5993125543189634e-06, "loss": 0.1676, "step": 1719 }, { "epoch": 0.3875292195905033, "grad_norm": 0.454587963068617, "learning_rate": 3.5976391193365544e-06, "loss": 0.1509, "step": 1720 }, { "epoch": 0.3877545272763117, "grad_norm": 0.498846730629908, "learning_rate": 3.5959650748760562e-06, "loss": 0.1753, "step": 1721 }, { "epoch": 0.38797983496212013, "grad_norm": 0.45540583194092865, "learning_rate": 3.5942904218670025e-06, "loss": 0.1609, "step": 1722 }, { "epoch": 0.3882051426479286, "grad_norm": 0.4756057121487946, "learning_rate": 3.592615161239267e-06, "loss": 0.1636, "step": 1723 }, { "epoch": 0.388430450333737, "grad_norm": 0.47297457111173763, "learning_rate": 3.590939293923058e-06, "loss": 0.1701, "step": 1724 }, { "epoch": 0.38865575801954544, "grad_norm": 0.5041510529405214, "learning_rate": 3.5892628208489226e-06, "loss": 0.1587, "step": 1725 }, { "epoch": 0.38888106570535386, "grad_norm": 0.46158951154891076, "learning_rate": 3.5875857429477447e-06, "loss": 0.1594, "step": 1726 }, { "epoch": 0.38910637339116233, "grad_norm": 0.4415370624886341, "learning_rate": 3.585908061150741e-06, "loss": 0.1493, "step": 1727 }, { "epoch": 0.38933168107697075, "grad_norm": 0.4744067839485117, "learning_rate": 3.584229776389468e-06, "loss": 0.1685, "step": 1728 }, { "epoch": 0.38955698876277917, "grad_norm": 0.5160028615095501, "learning_rate": 3.5825508895958143e-06, "loss": 0.1733, "step": 1729 }, { "epoch": 0.3897822964485876, "grad_norm": 0.5072082547164747, "learning_rate": 3.580871401702002e-06, "loss": 0.1717, "step": 1730 }, { "epoch": 0.39000760413439606, "grad_norm": 0.44889530087849144, "learning_rate": 3.5791913136405883e-06, "loss": 0.1512, "step": 1731 }, { "epoch": 0.3902329118202045, "grad_norm": 0.47243671574782475, "learning_rate": 3.5775106263444644e-06, "loss": 0.1571, "step": 1732 }, { "epoch": 0.3904582195060129, "grad_norm": 0.4673534957109278, "learning_rate": 3.5758293407468525e-06, "loss": 0.1536, "step": 1733 }, { "epoch": 0.3906835271918213, "grad_norm": 0.5182709051377471, "learning_rate": 3.5741474577813086e-06, "loss": 0.1725, "step": 1734 }, { "epoch": 0.3909088348776298, "grad_norm": 0.5045935815654671, "learning_rate": 3.572464978381719e-06, "loss": 0.17, "step": 1735 }, { "epoch": 0.3911341425634382, "grad_norm": 0.4855265240200406, "learning_rate": 3.570781903482302e-06, "loss": 0.174, "step": 1736 }, { "epoch": 0.3913594502492466, "grad_norm": 0.45358032477406396, "learning_rate": 3.569098234017606e-06, "loss": 0.1518, "step": 1737 }, { "epoch": 0.39158475793505504, "grad_norm": 0.5160232778445221, "learning_rate": 3.5674139709225104e-06, "loss": 0.1684, "step": 1738 }, { "epoch": 0.3918100656208635, "grad_norm": 0.4803803541458198, "learning_rate": 3.565729115132224e-06, "loss": 0.1662, "step": 1739 }, { "epoch": 0.39203537330667193, "grad_norm": 0.4787719969692587, "learning_rate": 3.5640436675822833e-06, "loss": 0.1565, "step": 1740 }, { "epoch": 0.39226068099248035, "grad_norm": 0.48099904622308787, "learning_rate": 3.5623576292085555e-06, "loss": 0.1659, "step": 1741 }, { "epoch": 0.39248598867828877, "grad_norm": 0.48566284958274347, "learning_rate": 3.5606710009472335e-06, "loss": 0.1562, "step": 1742 }, { "epoch": 0.39271129636409724, "grad_norm": 0.502006974708806, "learning_rate": 3.558983783734841e-06, "loss": 0.1572, "step": 1743 }, { "epoch": 0.39293660404990566, "grad_norm": 0.4542809843689211, "learning_rate": 3.5572959785082264e-06, "loss": 0.1519, "step": 1744 }, { "epoch": 0.3931619117357141, "grad_norm": 0.4747827690867886, "learning_rate": 3.5556075862045636e-06, "loss": 0.1591, "step": 1745 }, { "epoch": 0.3933872194215225, "grad_norm": 0.4862382157467648, "learning_rate": 3.5539186077613562e-06, "loss": 0.1517, "step": 1746 }, { "epoch": 0.39361252710733097, "grad_norm": 0.48020012626984315, "learning_rate": 3.552229044116428e-06, "loss": 0.1682, "step": 1747 }, { "epoch": 0.3938378347931394, "grad_norm": 0.4669769781915097, "learning_rate": 3.5505388962079337e-06, "loss": 0.1565, "step": 1748 }, { "epoch": 0.3940631424789478, "grad_norm": 0.4648212358293205, "learning_rate": 3.548848164974347e-06, "loss": 0.1523, "step": 1749 }, { "epoch": 0.3942884501647562, "grad_norm": 0.4987953329315867, "learning_rate": 3.54715685135447e-06, "loss": 0.1847, "step": 1750 }, { "epoch": 0.3945137578505647, "grad_norm": 0.44830900353615505, "learning_rate": 3.545464956287425e-06, "loss": 0.1544, "step": 1751 }, { "epoch": 0.3947390655363731, "grad_norm": 0.44263663083561267, "learning_rate": 3.5437724807126583e-06, "loss": 0.1486, "step": 1752 }, { "epoch": 0.39496437322218153, "grad_norm": 0.46596869817108705, "learning_rate": 3.542079425569938e-06, "loss": 0.1591, "step": 1753 }, { "epoch": 0.39518968090798995, "grad_norm": 0.4619508102805029, "learning_rate": 3.5403857917993554e-06, "loss": 0.1615, "step": 1754 }, { "epoch": 0.3954149885937984, "grad_norm": 0.5113106262452023, "learning_rate": 3.5386915803413234e-06, "loss": 0.1746, "step": 1755 }, { "epoch": 0.39564029627960684, "grad_norm": 0.47742352068847194, "learning_rate": 3.5369967921365718e-06, "loss": 0.1566, "step": 1756 }, { "epoch": 0.39586560396541526, "grad_norm": 0.48642828981283603, "learning_rate": 3.5353014281261545e-06, "loss": 0.1565, "step": 1757 }, { "epoch": 0.3960909116512237, "grad_norm": 0.48189096819477817, "learning_rate": 3.5336054892514437e-06, "loss": 0.1691, "step": 1758 }, { "epoch": 0.39631621933703215, "grad_norm": 0.48477286061008706, "learning_rate": 3.531908976454132e-06, "loss": 0.1669, "step": 1759 }, { "epoch": 0.39654152702284057, "grad_norm": 0.44895590316474676, "learning_rate": 3.530211890676229e-06, "loss": 0.1443, "step": 1760 }, { "epoch": 0.396766834708649, "grad_norm": 0.46011556543301124, "learning_rate": 3.528514232860063e-06, "loss": 0.167, "step": 1761 }, { "epoch": 0.3969921423944574, "grad_norm": 0.4858974568703489, "learning_rate": 3.52681600394828e-06, "loss": 0.1719, "step": 1762 }, { "epoch": 0.3972174500802659, "grad_norm": 0.46589057661030264, "learning_rate": 3.525117204883844e-06, "loss": 0.1569, "step": 1763 }, { "epoch": 0.3974427577660743, "grad_norm": 0.4945996442906626, "learning_rate": 3.5234178366100343e-06, "loss": 0.1698, "step": 1764 }, { "epoch": 0.3976680654518827, "grad_norm": 0.5230375509533259, "learning_rate": 3.5217179000704467e-06, "loss": 0.1529, "step": 1765 }, { "epoch": 0.3978933731376912, "grad_norm": 0.4744484175293173, "learning_rate": 3.520017396208993e-06, "loss": 0.1653, "step": 1766 }, { "epoch": 0.3981186808234996, "grad_norm": 0.4897183630580415, "learning_rate": 3.518316325969899e-06, "loss": 0.1691, "step": 1767 }, { "epoch": 0.398343988509308, "grad_norm": 0.501512935731379, "learning_rate": 3.5166146902977055e-06, "loss": 0.1732, "step": 1768 }, { "epoch": 0.39856929619511644, "grad_norm": 0.4910117121160473, "learning_rate": 3.514912490137268e-06, "loss": 0.1575, "step": 1769 }, { "epoch": 0.3987946038809249, "grad_norm": 0.5124656318430874, "learning_rate": 3.5132097264337546e-06, "loss": 0.1647, "step": 1770 }, { "epoch": 0.39901991156673333, "grad_norm": 0.4908288371907048, "learning_rate": 3.5115064001326467e-06, "loss": 0.1649, "step": 1771 }, { "epoch": 0.39924521925254175, "grad_norm": 0.48086723360755884, "learning_rate": 3.5098025121797375e-06, "loss": 0.1626, "step": 1772 }, { "epoch": 0.39947052693835017, "grad_norm": 0.4692892966024187, "learning_rate": 3.508098063521134e-06, "loss": 0.1591, "step": 1773 }, { "epoch": 0.39969583462415864, "grad_norm": 0.5065104083880511, "learning_rate": 3.5063930551032494e-06, "loss": 0.1653, "step": 1774 }, { "epoch": 0.39992114230996706, "grad_norm": 0.47844515863928405, "learning_rate": 3.504687487872815e-06, "loss": 0.159, "step": 1775 }, { "epoch": 0.4001464499957755, "grad_norm": 0.5296198292166024, "learning_rate": 3.5029813627768665e-06, "loss": 0.1796, "step": 1776 }, { "epoch": 0.4003717576815839, "grad_norm": 0.48975396814814165, "learning_rate": 3.501274680762753e-06, "loss": 0.1517, "step": 1777 }, { "epoch": 0.40059706536739237, "grad_norm": 0.47662762225410193, "learning_rate": 3.499567442778131e-06, "loss": 0.1602, "step": 1778 }, { "epoch": 0.4008223730532008, "grad_norm": 0.49921176967190556, "learning_rate": 3.497859649770965e-06, "loss": 0.1666, "step": 1779 }, { "epoch": 0.4010476807390092, "grad_norm": 0.4678569612081498, "learning_rate": 3.49615130268953e-06, "loss": 0.1548, "step": 1780 }, { "epoch": 0.4012729884248176, "grad_norm": 0.492057854759809, "learning_rate": 3.494442402482407e-06, "loss": 0.1685, "step": 1781 }, { "epoch": 0.4014982961106261, "grad_norm": 0.4657679154537899, "learning_rate": 3.4927329500984857e-06, "loss": 0.1527, "step": 1782 }, { "epoch": 0.4017236037964345, "grad_norm": 0.49090571162135244, "learning_rate": 3.4910229464869594e-06, "loss": 0.1668, "step": 1783 }, { "epoch": 0.40194891148224293, "grad_norm": 0.4774053398472881, "learning_rate": 3.489312392597331e-06, "loss": 0.1609, "step": 1784 }, { "epoch": 0.40217421916805135, "grad_norm": 0.5139863051826496, "learning_rate": 3.4876012893794053e-06, "loss": 0.184, "step": 1785 }, { "epoch": 0.4023995268538598, "grad_norm": 0.44779930986429073, "learning_rate": 3.4858896377832966e-06, "loss": 0.1407, "step": 1786 }, { "epoch": 0.40262483453966824, "grad_norm": 0.49744330625774086, "learning_rate": 3.4841774387594202e-06, "loss": 0.1739, "step": 1787 }, { "epoch": 0.40285014222547666, "grad_norm": 0.43623504745923614, "learning_rate": 3.482464693258496e-06, "loss": 0.1453, "step": 1788 }, { "epoch": 0.4030754499112851, "grad_norm": 0.4794462886411687, "learning_rate": 3.4807514022315473e-06, "loss": 0.1564, "step": 1789 }, { "epoch": 0.40330075759709355, "grad_norm": 0.4983964466277479, "learning_rate": 3.4790375666299026e-06, "loss": 0.1555, "step": 1790 }, { "epoch": 0.40352606528290197, "grad_norm": 0.4818184175327813, "learning_rate": 3.4773231874051893e-06, "loss": 0.1661, "step": 1791 }, { "epoch": 0.4037513729687104, "grad_norm": 0.4833608794554769, "learning_rate": 3.4756082655093387e-06, "loss": 0.1651, "step": 1792 }, { "epoch": 0.4039766806545188, "grad_norm": 0.4771175845930751, "learning_rate": 3.4738928018945828e-06, "loss": 0.1584, "step": 1793 }, { "epoch": 0.4042019883403273, "grad_norm": 0.4471819840046623, "learning_rate": 3.4721767975134557e-06, "loss": 0.1504, "step": 1794 }, { "epoch": 0.4044272960261357, "grad_norm": 0.47232101345411487, "learning_rate": 3.470460253318789e-06, "loss": 0.1608, "step": 1795 }, { "epoch": 0.4046526037119441, "grad_norm": 0.4797922242264535, "learning_rate": 3.4687431702637165e-06, "loss": 0.1675, "step": 1796 }, { "epoch": 0.40487791139775253, "grad_norm": 0.49561782968484025, "learning_rate": 3.4670255493016715e-06, "loss": 0.1718, "step": 1797 }, { "epoch": 0.405103219083561, "grad_norm": 0.4584936281742346, "learning_rate": 3.465307391386383e-06, "loss": 0.1442, "step": 1798 }, { "epoch": 0.4053285267693694, "grad_norm": 0.47911010769893764, "learning_rate": 3.4635886974718814e-06, "loss": 0.1696, "step": 1799 }, { "epoch": 0.40555383445517784, "grad_norm": 0.4559976548205687, "learning_rate": 3.4618694685124927e-06, "loss": 0.1518, "step": 1800 }, { "epoch": 0.40577914214098626, "grad_norm": 0.48899872047373, "learning_rate": 3.4601497054628407e-06, "loss": 0.172, "step": 1801 }, { "epoch": 0.40600444982679473, "grad_norm": 0.4644054553830585, "learning_rate": 3.458429409277846e-06, "loss": 0.1579, "step": 1802 }, { "epoch": 0.40622975751260315, "grad_norm": 0.4463344674611823, "learning_rate": 3.4567085809127247e-06, "loss": 0.1563, "step": 1803 }, { "epoch": 0.40645506519841157, "grad_norm": 0.4719464396077941, "learning_rate": 3.454987221322989e-06, "loss": 0.1675, "step": 1804 }, { "epoch": 0.40668037288422, "grad_norm": 0.46911138126559293, "learning_rate": 3.4532653314644453e-06, "loss": 0.1608, "step": 1805 }, { "epoch": 0.40690568057002846, "grad_norm": 0.46701650488610563, "learning_rate": 3.4515429122931955e-06, "loss": 0.1562, "step": 1806 }, { "epoch": 0.4071309882558369, "grad_norm": 0.4345692567585763, "learning_rate": 3.4498199647656335e-06, "loss": 0.1489, "step": 1807 }, { "epoch": 0.4073562959416453, "grad_norm": 0.4732432278089056, "learning_rate": 3.4480964898384495e-06, "loss": 0.1566, "step": 1808 }, { "epoch": 0.4075816036274537, "grad_norm": 0.49232155873266253, "learning_rate": 3.4463724884686234e-06, "loss": 0.1733, "step": 1809 }, { "epoch": 0.4078069113132622, "grad_norm": 0.47860152710599685, "learning_rate": 3.44464796161343e-06, "loss": 0.1583, "step": 1810 }, { "epoch": 0.4080322189990706, "grad_norm": 0.5036762938508624, "learning_rate": 3.4429229102304336e-06, "loss": 0.174, "step": 1811 }, { "epoch": 0.408257526684879, "grad_norm": 0.5282822426431432, "learning_rate": 3.4411973352774917e-06, "loss": 0.1734, "step": 1812 }, { "epoch": 0.4084828343706875, "grad_norm": 0.4445382895295483, "learning_rate": 3.4394712377127524e-06, "loss": 0.1593, "step": 1813 }, { "epoch": 0.4087081420564959, "grad_norm": 0.46841840331802137, "learning_rate": 3.437744618494653e-06, "loss": 0.161, "step": 1814 }, { "epoch": 0.40893344974230433, "grad_norm": 0.490508242367761, "learning_rate": 3.4360174785819196e-06, "loss": 0.1559, "step": 1815 }, { "epoch": 0.40915875742811275, "grad_norm": 0.48426695631141686, "learning_rate": 3.4342898189335692e-06, "loss": 0.1583, "step": 1816 }, { "epoch": 0.4093840651139212, "grad_norm": 0.4367959518819385, "learning_rate": 3.432561640508908e-06, "loss": 0.1462, "step": 1817 }, { "epoch": 0.40960937279972964, "grad_norm": 0.4700483688647926, "learning_rate": 3.4308329442675276e-06, "loss": 0.1551, "step": 1818 }, { "epoch": 0.40983468048553806, "grad_norm": 0.435178736238957, "learning_rate": 3.4291037311693088e-06, "loss": 0.145, "step": 1819 }, { "epoch": 0.4100599881713465, "grad_norm": 0.4887752868821639, "learning_rate": 3.42737400217442e-06, "loss": 0.1619, "step": 1820 }, { "epoch": 0.41028529585715495, "grad_norm": 0.4538849148397559, "learning_rate": 3.4256437582433144e-06, "loss": 0.1531, "step": 1821 }, { "epoch": 0.41051060354296337, "grad_norm": 0.46154436604669785, "learning_rate": 3.423913000336732e-06, "loss": 0.1638, "step": 1822 }, { "epoch": 0.4107359112287718, "grad_norm": 0.43127529204771314, "learning_rate": 3.422181729415699e-06, "loss": 0.1422, "step": 1823 }, { "epoch": 0.4109612189145802, "grad_norm": 0.5198286942990128, "learning_rate": 3.4204499464415253e-06, "loss": 0.1668, "step": 1824 }, { "epoch": 0.4111865266003887, "grad_norm": 0.4687407439728831, "learning_rate": 3.418717652375805e-06, "loss": 0.1438, "step": 1825 }, { "epoch": 0.4114118342861971, "grad_norm": 0.4821244881625982, "learning_rate": 3.4169848481804165e-06, "loss": 0.1688, "step": 1826 }, { "epoch": 0.4116371419720055, "grad_norm": 0.46763244055600944, "learning_rate": 3.415251534817521e-06, "loss": 0.1612, "step": 1827 }, { "epoch": 0.41186244965781393, "grad_norm": 0.48728207989142946, "learning_rate": 3.4135177132495632e-06, "loss": 0.1624, "step": 1828 }, { "epoch": 0.4120877573436224, "grad_norm": 0.5312486959140987, "learning_rate": 3.4117833844392704e-06, "loss": 0.1735, "step": 1829 }, { "epoch": 0.4123130650294308, "grad_norm": 0.4692598649036747, "learning_rate": 3.41004854934965e-06, "loss": 0.1542, "step": 1830 }, { "epoch": 0.41253837271523924, "grad_norm": 0.46784467253864104, "learning_rate": 3.4083132089439912e-06, "loss": 0.1685, "step": 1831 }, { "epoch": 0.41276368040104766, "grad_norm": 0.4670753302202224, "learning_rate": 3.406577364185864e-06, "loss": 0.1642, "step": 1832 }, { "epoch": 0.41298898808685613, "grad_norm": 0.49989102638595734, "learning_rate": 3.404841016039118e-06, "loss": 0.1763, "step": 1833 }, { "epoch": 0.41321429577266455, "grad_norm": 0.49099336796274373, "learning_rate": 3.403104165467883e-06, "loss": 0.1707, "step": 1834 }, { "epoch": 0.41343960345847297, "grad_norm": 0.48820792687632875, "learning_rate": 3.4013668134365675e-06, "loss": 0.1537, "step": 1835 }, { "epoch": 0.4136649111442814, "grad_norm": 0.47863203313919633, "learning_rate": 3.399628960909857e-06, "loss": 0.1609, "step": 1836 }, { "epoch": 0.41389021883008986, "grad_norm": 0.4881564151016199, "learning_rate": 3.397890608852718e-06, "loss": 0.1651, "step": 1837 }, { "epoch": 0.4141155265158983, "grad_norm": 0.47544325425792294, "learning_rate": 3.3961517582303916e-06, "loss": 0.1637, "step": 1838 }, { "epoch": 0.4143408342017067, "grad_norm": 0.49922802100215147, "learning_rate": 3.394412410008397e-06, "loss": 0.1647, "step": 1839 }, { "epoch": 0.4145661418875151, "grad_norm": 0.4913684385302216, "learning_rate": 3.39267256515253e-06, "loss": 0.1641, "step": 1840 }, { "epoch": 0.4147914495733236, "grad_norm": 0.4471757683297294, "learning_rate": 3.3909322246288606e-06, "loss": 0.141, "step": 1841 }, { "epoch": 0.415016757259132, "grad_norm": 0.48812530948929644, "learning_rate": 3.3891913894037354e-06, "loss": 0.1645, "step": 1842 }, { "epoch": 0.4152420649449404, "grad_norm": 0.5119344738587837, "learning_rate": 3.3874500604437752e-06, "loss": 0.1548, "step": 1843 }, { "epoch": 0.41546737263074884, "grad_norm": 0.4946995583219049, "learning_rate": 3.385708238715876e-06, "loss": 0.1539, "step": 1844 }, { "epoch": 0.4156926803165573, "grad_norm": 0.44974296921309803, "learning_rate": 3.3839659251872054e-06, "loss": 0.1462, "step": 1845 }, { "epoch": 0.41591798800236573, "grad_norm": 0.5062284794310744, "learning_rate": 3.3822231208252053e-06, "loss": 0.1617, "step": 1846 }, { "epoch": 0.41614329568817415, "grad_norm": 0.5517545583347794, "learning_rate": 3.38047982659759e-06, "loss": 0.1753, "step": 1847 }, { "epoch": 0.41636860337398257, "grad_norm": 0.5085439188834923, "learning_rate": 3.3787360434723466e-06, "loss": 0.1629, "step": 1848 }, { "epoch": 0.41659391105979104, "grad_norm": 0.5002540243440992, "learning_rate": 3.3769917724177315e-06, "loss": 0.1725, "step": 1849 }, { "epoch": 0.41681921874559946, "grad_norm": 0.4825881609302707, "learning_rate": 3.3752470144022745e-06, "loss": 0.1574, "step": 1850 }, { "epoch": 0.4170445264314079, "grad_norm": 0.5424215184923034, "learning_rate": 3.3735017703947748e-06, "loss": 0.1742, "step": 1851 }, { "epoch": 0.4172698341172163, "grad_norm": 0.5082325374018036, "learning_rate": 3.371756041364301e-06, "loss": 0.154, "step": 1852 }, { "epoch": 0.41749514180302477, "grad_norm": 0.4676831647620805, "learning_rate": 3.370009828280191e-06, "loss": 0.1527, "step": 1853 }, { "epoch": 0.4177204494888332, "grad_norm": 0.46679801722571546, "learning_rate": 3.3682631321120507e-06, "loss": 0.1698, "step": 1854 }, { "epoch": 0.4179457571746416, "grad_norm": 0.5112618802926924, "learning_rate": 3.366515953829758e-06, "loss": 0.1868, "step": 1855 }, { "epoch": 0.41817106486045, "grad_norm": 0.5277385827073671, "learning_rate": 3.364768294403455e-06, "loss": 0.1688, "step": 1856 }, { "epoch": 0.4183963725462585, "grad_norm": 0.4851251657185495, "learning_rate": 3.3630201548035512e-06, "loss": 0.167, "step": 1857 }, { "epoch": 0.4186216802320669, "grad_norm": 0.4959283834884865, "learning_rate": 3.361271536000723e-06, "loss": 0.1664, "step": 1858 }, { "epoch": 0.41884698791787534, "grad_norm": 0.4935586642366765, "learning_rate": 3.359522438965915e-06, "loss": 0.164, "step": 1859 }, { "epoch": 0.41907229560368375, "grad_norm": 0.46178173846204396, "learning_rate": 3.3577728646703335e-06, "loss": 0.1493, "step": 1860 }, { "epoch": 0.4192976032894922, "grad_norm": 0.5079501676323284, "learning_rate": 3.3560228140854534e-06, "loss": 0.165, "step": 1861 }, { "epoch": 0.41952291097530064, "grad_norm": 0.4762264209677515, "learning_rate": 3.354272288183012e-06, "loss": 0.1665, "step": 1862 }, { "epoch": 0.41974821866110906, "grad_norm": 0.4572713716633099, "learning_rate": 3.352521287935011e-06, "loss": 0.1449, "step": 1863 }, { "epoch": 0.41997352634691754, "grad_norm": 0.4898671317124843, "learning_rate": 3.3507698143137157e-06, "loss": 0.1559, "step": 1864 }, { "epoch": 0.42019883403272595, "grad_norm": 0.48000831024326635, "learning_rate": 3.3490178682916534e-06, "loss": 0.1443, "step": 1865 }, { "epoch": 0.42042414171853437, "grad_norm": 0.4811857384241612, "learning_rate": 3.3472654508416157e-06, "loss": 0.1605, "step": 1866 }, { "epoch": 0.4206494494043428, "grad_norm": 0.4738425883508626, "learning_rate": 3.3455125629366546e-06, "loss": 0.1587, "step": 1867 }, { "epoch": 0.42087475709015126, "grad_norm": 0.47907204424926936, "learning_rate": 3.3437592055500825e-06, "loss": 0.1643, "step": 1868 }, { "epoch": 0.4211000647759597, "grad_norm": 0.49431849349421114, "learning_rate": 3.342005379655474e-06, "loss": 0.1578, "step": 1869 }, { "epoch": 0.4213253724617681, "grad_norm": 0.4829550989570886, "learning_rate": 3.340251086226663e-06, "loss": 0.1594, "step": 1870 }, { "epoch": 0.4215506801475765, "grad_norm": 0.48055300095473713, "learning_rate": 3.3384963262377434e-06, "loss": 0.165, "step": 1871 }, { "epoch": 0.421775987833385, "grad_norm": 0.5002044469517478, "learning_rate": 3.3367411006630677e-06, "loss": 0.1699, "step": 1872 }, { "epoch": 0.4220012955191934, "grad_norm": 0.494112142272217, "learning_rate": 3.3349854104772476e-06, "loss": 0.1703, "step": 1873 }, { "epoch": 0.4222266032050018, "grad_norm": 0.4407900454940755, "learning_rate": 3.333229256655153e-06, "loss": 0.1502, "step": 1874 }, { "epoch": 0.42245191089081024, "grad_norm": 0.4658230976699728, "learning_rate": 3.3314726401719088e-06, "loss": 0.1503, "step": 1875 }, { "epoch": 0.4226772185766187, "grad_norm": 0.46842504862609974, "learning_rate": 3.3297155620029e-06, "loss": 0.1508, "step": 1876 }, { "epoch": 0.42290252626242714, "grad_norm": 0.49870166945340855, "learning_rate": 3.3279580231237664e-06, "loss": 0.1581, "step": 1877 }, { "epoch": 0.42312783394823555, "grad_norm": 0.4689208756074871, "learning_rate": 3.326200024510405e-06, "loss": 0.1552, "step": 1878 }, { "epoch": 0.423353141634044, "grad_norm": 0.47798369467976537, "learning_rate": 3.324441567138965e-06, "loss": 0.1632, "step": 1879 }, { "epoch": 0.42357844931985245, "grad_norm": 0.4426346717785938, "learning_rate": 3.3226826519858526e-06, "loss": 0.1472, "step": 1880 }, { "epoch": 0.42380375700566086, "grad_norm": 0.45722379581080347, "learning_rate": 3.320923280027728e-06, "loss": 0.1565, "step": 1881 }, { "epoch": 0.4240290646914693, "grad_norm": 0.47583545692139984, "learning_rate": 3.3191634522415064e-06, "loss": 0.1466, "step": 1882 }, { "epoch": 0.4242543723772777, "grad_norm": 0.47819718631976604, "learning_rate": 3.317403169604352e-06, "loss": 0.1597, "step": 1883 }, { "epoch": 0.4244796800630862, "grad_norm": 0.44056899985102965, "learning_rate": 3.315642433093686e-06, "loss": 0.1424, "step": 1884 }, { "epoch": 0.4247049877488946, "grad_norm": 0.4663525404770331, "learning_rate": 3.313881243687179e-06, "loss": 0.1579, "step": 1885 }, { "epoch": 0.424930295434703, "grad_norm": 0.49526535778153447, "learning_rate": 3.3121196023627543e-06, "loss": 0.1705, "step": 1886 }, { "epoch": 0.4251556031205114, "grad_norm": 0.49666092172588805, "learning_rate": 3.3103575100985852e-06, "loss": 0.1592, "step": 1887 }, { "epoch": 0.4253809108063199, "grad_norm": 0.45343350200228694, "learning_rate": 3.3085949678730953e-06, "loss": 0.1417, "step": 1888 }, { "epoch": 0.4256062184921283, "grad_norm": 0.45820729838898594, "learning_rate": 3.3068319766649605e-06, "loss": 0.1433, "step": 1889 }, { "epoch": 0.42583152617793674, "grad_norm": 0.49477542929726775, "learning_rate": 3.305068537453102e-06, "loss": 0.1659, "step": 1890 }, { "epoch": 0.42605683386374515, "grad_norm": 0.4644258024347816, "learning_rate": 3.303304651216693e-06, "loss": 0.1537, "step": 1891 }, { "epoch": 0.42628214154955363, "grad_norm": 0.4965959281033501, "learning_rate": 3.3015403189351536e-06, "loss": 0.1714, "step": 1892 }, { "epoch": 0.42650744923536205, "grad_norm": 0.467098799532159, "learning_rate": 3.2997755415881516e-06, "loss": 0.1511, "step": 1893 }, { "epoch": 0.42673275692117046, "grad_norm": 0.47792108039203884, "learning_rate": 3.2980103201556023e-06, "loss": 0.1672, "step": 1894 }, { "epoch": 0.4269580646069789, "grad_norm": 0.5093402711827806, "learning_rate": 3.2962446556176676e-06, "loss": 0.1731, "step": 1895 }, { "epoch": 0.42718337229278736, "grad_norm": 0.48714060411006055, "learning_rate": 3.2944785489547544e-06, "loss": 0.1524, "step": 1896 }, { "epoch": 0.4274086799785958, "grad_norm": 0.45516129520081977, "learning_rate": 3.2927120011475168e-06, "loss": 0.1544, "step": 1897 }, { "epoch": 0.4276339876644042, "grad_norm": 0.4746908996355466, "learning_rate": 3.290945013176852e-06, "loss": 0.1581, "step": 1898 }, { "epoch": 0.4278592953502126, "grad_norm": 0.44735001802092467, "learning_rate": 3.2891775860239033e-06, "loss": 0.143, "step": 1899 }, { "epoch": 0.4280846030360211, "grad_norm": 0.5027516078080247, "learning_rate": 3.2874097206700566e-06, "loss": 0.1722, "step": 1900 }, { "epoch": 0.4283099107218295, "grad_norm": 0.5060602149207811, "learning_rate": 3.285641418096942e-06, "loss": 0.1714, "step": 1901 }, { "epoch": 0.4285352184076379, "grad_norm": 0.4653823592371517, "learning_rate": 3.2838726792864315e-06, "loss": 0.1542, "step": 1902 }, { "epoch": 0.42876052609344634, "grad_norm": 0.46278383982423194, "learning_rate": 3.2821035052206413e-06, "loss": 0.1572, "step": 1903 }, { "epoch": 0.4289858337792548, "grad_norm": 0.4544680300853722, "learning_rate": 3.2803338968819264e-06, "loss": 0.1516, "step": 1904 }, { "epoch": 0.42921114146506323, "grad_norm": 0.49859338344706805, "learning_rate": 3.278563855252885e-06, "loss": 0.1641, "step": 1905 }, { "epoch": 0.42943644915087165, "grad_norm": 0.45697671851825894, "learning_rate": 3.2767933813163542e-06, "loss": 0.1446, "step": 1906 }, { "epoch": 0.42966175683668006, "grad_norm": 0.4505096587808184, "learning_rate": 3.2750224760554135e-06, "loss": 0.155, "step": 1907 }, { "epoch": 0.42988706452248854, "grad_norm": 0.48709353813394207, "learning_rate": 3.2732511404533797e-06, "loss": 0.1617, "step": 1908 }, { "epoch": 0.43011237220829696, "grad_norm": 0.518531427247498, "learning_rate": 3.2714793754938102e-06, "loss": 0.1773, "step": 1909 }, { "epoch": 0.4303376798941054, "grad_norm": 0.46946154006491075, "learning_rate": 3.2697071821604986e-06, "loss": 0.1625, "step": 1910 }, { "epoch": 0.43056298757991385, "grad_norm": 0.49032743378244426, "learning_rate": 3.2679345614374802e-06, "loss": 0.1733, "step": 1911 }, { "epoch": 0.43078829526572227, "grad_norm": 0.4637403507116278, "learning_rate": 3.266161514309023e-06, "loss": 0.161, "step": 1912 }, { "epoch": 0.4310136029515307, "grad_norm": 0.494990037196409, "learning_rate": 3.264388041759635e-06, "loss": 0.1688, "step": 1913 }, { "epoch": 0.4312389106373391, "grad_norm": 0.4745189714599409, "learning_rate": 3.262614144774059e-06, "loss": 0.1678, "step": 1914 }, { "epoch": 0.4314642183231476, "grad_norm": 0.44797870354627173, "learning_rate": 3.260839824337274e-06, "loss": 0.1442, "step": 1915 }, { "epoch": 0.431689526008956, "grad_norm": 0.4667151491338866, "learning_rate": 3.259065081434495e-06, "loss": 0.1518, "step": 1916 }, { "epoch": 0.4319148336947644, "grad_norm": 0.5028061583487423, "learning_rate": 3.2572899170511683e-06, "loss": 0.1644, "step": 1917 }, { "epoch": 0.43214014138057283, "grad_norm": 0.4745421286227182, "learning_rate": 3.255514332172979e-06, "loss": 0.1643, "step": 1918 }, { "epoch": 0.4323654490663813, "grad_norm": 0.4621433702702943, "learning_rate": 3.2537383277858413e-06, "loss": 0.1483, "step": 1919 }, { "epoch": 0.4325907567521897, "grad_norm": 0.4689770686079878, "learning_rate": 3.2519619048759056e-06, "loss": 0.153, "step": 1920 }, { "epoch": 0.43281606443799814, "grad_norm": 0.4866272597635595, "learning_rate": 3.250185064429552e-06, "loss": 0.1671, "step": 1921 }, { "epoch": 0.43304137212380656, "grad_norm": 0.5005319571358561, "learning_rate": 3.248407807433396e-06, "loss": 0.1702, "step": 1922 }, { "epoch": 0.43326667980961503, "grad_norm": 0.4922067763016835, "learning_rate": 3.246630134874279e-06, "loss": 0.1593, "step": 1923 }, { "epoch": 0.43349198749542345, "grad_norm": 0.515609895521259, "learning_rate": 3.2448520477392788e-06, "loss": 0.1739, "step": 1924 }, { "epoch": 0.43371729518123187, "grad_norm": 0.469959696046339, "learning_rate": 3.2430735470157e-06, "loss": 0.1547, "step": 1925 }, { "epoch": 0.4339426028670403, "grad_norm": 0.4950915860353573, "learning_rate": 3.2412946336910778e-06, "loss": 0.1729, "step": 1926 }, { "epoch": 0.43416791055284876, "grad_norm": 0.47469229450105466, "learning_rate": 3.2395153087531767e-06, "loss": 0.1738, "step": 1927 }, { "epoch": 0.4343932182386572, "grad_norm": 0.4894232593544117, "learning_rate": 3.237735573189989e-06, "loss": 0.1675, "step": 1928 }, { "epoch": 0.4346185259244656, "grad_norm": 0.4519000417682082, "learning_rate": 3.2359554279897353e-06, "loss": 0.1393, "step": 1929 }, { "epoch": 0.434843833610274, "grad_norm": 0.4904620821483386, "learning_rate": 3.234174874140866e-06, "loss": 0.1566, "step": 1930 }, { "epoch": 0.4350691412960825, "grad_norm": 0.47125494694001646, "learning_rate": 3.232393912632054e-06, "loss": 0.1527, "step": 1931 }, { "epoch": 0.4352944489818909, "grad_norm": 0.47128696022337324, "learning_rate": 3.230612544452202e-06, "loss": 0.1644, "step": 1932 }, { "epoch": 0.4355197566676993, "grad_norm": 0.46240632956038014, "learning_rate": 3.228830770590436e-06, "loss": 0.1537, "step": 1933 }, { "epoch": 0.43574506435350774, "grad_norm": 0.4480412958017911, "learning_rate": 3.2270485920361093e-06, "loss": 0.1553, "step": 1934 }, { "epoch": 0.4359703720393162, "grad_norm": 0.47450609805560406, "learning_rate": 3.2252660097788003e-06, "loss": 0.1692, "step": 1935 }, { "epoch": 0.43619567972512463, "grad_norm": 0.5016631632234949, "learning_rate": 3.2234830248083095e-06, "loss": 0.1612, "step": 1936 }, { "epoch": 0.43642098741093305, "grad_norm": 0.4630317566529645, "learning_rate": 3.2216996381146613e-06, "loss": 0.1501, "step": 1937 }, { "epoch": 0.43664629509674147, "grad_norm": 0.4350896225829126, "learning_rate": 3.219915850688106e-06, "loss": 0.1385, "step": 1938 }, { "epoch": 0.43687160278254994, "grad_norm": 0.4693673421131627, "learning_rate": 3.2181316635191125e-06, "loss": 0.1546, "step": 1939 }, { "epoch": 0.43709691046835836, "grad_norm": 0.4718115830527021, "learning_rate": 3.2163470775983733e-06, "loss": 0.1505, "step": 1940 }, { "epoch": 0.4373222181541668, "grad_norm": 0.5096285305771173, "learning_rate": 3.2145620939168036e-06, "loss": 0.1643, "step": 1941 }, { "epoch": 0.4375475258399752, "grad_norm": 0.4902992882458936, "learning_rate": 3.2127767134655374e-06, "loss": 0.1654, "step": 1942 }, { "epoch": 0.43777283352578367, "grad_norm": 0.47117853274605337, "learning_rate": 3.210990937235931e-06, "loss": 0.1487, "step": 1943 }, { "epoch": 0.4379981412115921, "grad_norm": 0.4991686208300184, "learning_rate": 3.209204766219558e-06, "loss": 0.1675, "step": 1944 }, { "epoch": 0.4382234488974005, "grad_norm": 0.4538891115941367, "learning_rate": 3.207418201408213e-06, "loss": 0.1471, "step": 1945 }, { "epoch": 0.4384487565832089, "grad_norm": 0.4709571337411855, "learning_rate": 3.205631243793909e-06, "loss": 0.1538, "step": 1946 }, { "epoch": 0.4386740642690174, "grad_norm": 0.4860122274389726, "learning_rate": 3.2038438943688777e-06, "loss": 0.1653, "step": 1947 }, { "epoch": 0.4388993719548258, "grad_norm": 0.5018583708691458, "learning_rate": 3.202056154125567e-06, "loss": 0.1632, "step": 1948 }, { "epoch": 0.43912467964063423, "grad_norm": 0.44754173977669515, "learning_rate": 3.2002680240566412e-06, "loss": 0.1513, "step": 1949 }, { "epoch": 0.43934998732644265, "grad_norm": 0.45793980293365255, "learning_rate": 3.198479505154984e-06, "loss": 0.1611, "step": 1950 }, { "epoch": 0.4395752950122511, "grad_norm": 0.4798406577992235, "learning_rate": 3.1966905984136932e-06, "loss": 0.1615, "step": 1951 }, { "epoch": 0.43980060269805954, "grad_norm": 0.5049353027884466, "learning_rate": 3.1949013048260813e-06, "loss": 0.1739, "step": 1952 }, { "epoch": 0.44002591038386796, "grad_norm": 0.4691288105733682, "learning_rate": 3.1931116253856762e-06, "loss": 0.1603, "step": 1953 }, { "epoch": 0.4402512180696764, "grad_norm": 0.48549399893117434, "learning_rate": 3.1913215610862208e-06, "loss": 0.1643, "step": 1954 }, { "epoch": 0.44047652575548485, "grad_norm": 0.46413359860802367, "learning_rate": 3.189531112921671e-06, "loss": 0.157, "step": 1955 }, { "epoch": 0.44070183344129327, "grad_norm": 0.48570841885505445, "learning_rate": 3.1877402818861954e-06, "loss": 0.1357, "step": 1956 }, { "epoch": 0.4409271411271017, "grad_norm": 0.4779260667177893, "learning_rate": 3.185949068974177e-06, "loss": 0.158, "step": 1957 }, { "epoch": 0.4411524488129101, "grad_norm": 0.4418919545366911, "learning_rate": 3.184157475180208e-06, "loss": 0.1526, "step": 1958 }, { "epoch": 0.4413777564987186, "grad_norm": 0.49614129483843544, "learning_rate": 3.1823655014990937e-06, "loss": 0.1608, "step": 1959 }, { "epoch": 0.441603064184527, "grad_norm": 0.472344591085106, "learning_rate": 3.1805731489258516e-06, "loss": 0.1479, "step": 1960 }, { "epoch": 0.4418283718703354, "grad_norm": 0.49519200575357003, "learning_rate": 3.1787804184557074e-06, "loss": 0.1498, "step": 1961 }, { "epoch": 0.4420536795561439, "grad_norm": 0.4767514542436889, "learning_rate": 3.1769873110840977e-06, "loss": 0.1567, "step": 1962 }, { "epoch": 0.4422789872419523, "grad_norm": 0.4586959627133704, "learning_rate": 3.1751938278066687e-06, "loss": 0.1556, "step": 1963 }, { "epoch": 0.4425042949277607, "grad_norm": 0.4973373090100224, "learning_rate": 3.1733999696192736e-06, "loss": 0.1614, "step": 1964 }, { "epoch": 0.44272960261356914, "grad_norm": 0.4735446280299149, "learning_rate": 3.171605737517976e-06, "loss": 0.1647, "step": 1965 }, { "epoch": 0.4429549102993776, "grad_norm": 0.44265387795324745, "learning_rate": 3.1698111324990454e-06, "loss": 0.1407, "step": 1966 }, { "epoch": 0.44318021798518603, "grad_norm": 0.49122833299235197, "learning_rate": 3.16801615555896e-06, "loss": 0.1684, "step": 1967 }, { "epoch": 0.44340552567099445, "grad_norm": 0.464417443256395, "learning_rate": 3.1662208076944027e-06, "loss": 0.1543, "step": 1968 }, { "epoch": 0.44363083335680287, "grad_norm": 0.4593154910225855, "learning_rate": 3.1644250899022637e-06, "loss": 0.1513, "step": 1969 }, { "epoch": 0.44385614104261134, "grad_norm": 0.47599458719846477, "learning_rate": 3.162629003179638e-06, "loss": 0.1545, "step": 1970 }, { "epoch": 0.44408144872841976, "grad_norm": 0.48647770396183504, "learning_rate": 3.1608325485238257e-06, "loss": 0.1526, "step": 1971 }, { "epoch": 0.4443067564142282, "grad_norm": 0.48806006529062335, "learning_rate": 3.1590357269323312e-06, "loss": 0.1641, "step": 1972 }, { "epoch": 0.4445320641000366, "grad_norm": 0.4838101728349616, "learning_rate": 3.157238539402862e-06, "loss": 0.1691, "step": 1973 }, { "epoch": 0.44475737178584507, "grad_norm": 0.47238245788795774, "learning_rate": 3.15544098693333e-06, "loss": 0.1707, "step": 1974 }, { "epoch": 0.4449826794716535, "grad_norm": 0.45538517824292013, "learning_rate": 3.15364307052185e-06, "loss": 0.151, "step": 1975 }, { "epoch": 0.4452079871574619, "grad_norm": 0.4696391452617115, "learning_rate": 3.151844791166735e-06, "loss": 0.1637, "step": 1976 }, { "epoch": 0.4454332948432703, "grad_norm": 0.519697876199234, "learning_rate": 3.1500461498665053e-06, "loss": 0.1779, "step": 1977 }, { "epoch": 0.4456586025290788, "grad_norm": 0.4721751150534047, "learning_rate": 3.1482471476198784e-06, "loss": 0.1598, "step": 1978 }, { "epoch": 0.4458839102148872, "grad_norm": 0.46400557490164673, "learning_rate": 3.1464477854257726e-06, "loss": 0.1527, "step": 1979 }, { "epoch": 0.44610921790069563, "grad_norm": 0.47922627381109123, "learning_rate": 3.1446480642833077e-06, "loss": 0.1593, "step": 1980 }, { "epoch": 0.44633452558650405, "grad_norm": 0.4892235358343949, "learning_rate": 3.1428479851918014e-06, "loss": 0.1602, "step": 1981 }, { "epoch": 0.4465598332723125, "grad_norm": 0.49009997683912404, "learning_rate": 3.14104754915077e-06, "loss": 0.1576, "step": 1982 }, { "epoch": 0.44678514095812094, "grad_norm": 0.48069166309580474, "learning_rate": 3.1392467571599288e-06, "loss": 0.1653, "step": 1983 }, { "epoch": 0.44701044864392936, "grad_norm": 0.4771441161006696, "learning_rate": 3.137445610219192e-06, "loss": 0.1515, "step": 1984 }, { "epoch": 0.4472357563297378, "grad_norm": 0.4818283293635933, "learning_rate": 3.1356441093286673e-06, "loss": 0.166, "step": 1985 }, { "epoch": 0.44746106401554625, "grad_norm": 0.5147787178532238, "learning_rate": 3.133842255488661e-06, "loss": 0.174, "step": 1986 }, { "epoch": 0.44768637170135467, "grad_norm": 0.5344233693465625, "learning_rate": 3.132040049699676e-06, "loss": 0.1763, "step": 1987 }, { "epoch": 0.4479116793871631, "grad_norm": 0.48389319685884197, "learning_rate": 3.130237492962411e-06, "loss": 0.1643, "step": 1988 }, { "epoch": 0.4481369870729715, "grad_norm": 0.482536320658805, "learning_rate": 3.1284345862777572e-06, "loss": 0.1595, "step": 1989 }, { "epoch": 0.44836229475878, "grad_norm": 0.46393789436271843, "learning_rate": 3.1266313306468018e-06, "loss": 0.1524, "step": 1990 }, { "epoch": 0.4485876024445884, "grad_norm": 0.474674964893104, "learning_rate": 3.1248277270708255e-06, "loss": 0.1601, "step": 1991 }, { "epoch": 0.4488129101303968, "grad_norm": 0.490753280567544, "learning_rate": 3.1230237765513023e-06, "loss": 0.1721, "step": 1992 }, { "epoch": 0.44903821781620523, "grad_norm": 0.4499975991773963, "learning_rate": 3.121219480089899e-06, "loss": 0.1446, "step": 1993 }, { "epoch": 0.4492635255020137, "grad_norm": 0.458297504071037, "learning_rate": 3.119414838688473e-06, "loss": 0.1529, "step": 1994 }, { "epoch": 0.4494888331878221, "grad_norm": 0.4436947744388515, "learning_rate": 3.1176098533490755e-06, "loss": 0.1386, "step": 1995 }, { "epoch": 0.44971414087363054, "grad_norm": 0.5104688488260797, "learning_rate": 3.1158045250739473e-06, "loss": 0.1746, "step": 1996 }, { "epoch": 0.44993944855943896, "grad_norm": 0.4881255207930648, "learning_rate": 3.11399885486552e-06, "loss": 0.1533, "step": 1997 }, { "epoch": 0.45016475624524743, "grad_norm": 0.49185335203111147, "learning_rate": 3.1121928437264138e-06, "loss": 0.1572, "step": 1998 }, { "epoch": 0.45039006393105585, "grad_norm": 0.4923412831035321, "learning_rate": 3.1103864926594406e-06, "loss": 0.1532, "step": 1999 }, { "epoch": 0.45061537161686427, "grad_norm": 0.48052935278952436, "learning_rate": 3.1085798026676e-06, "loss": 0.1557, "step": 2000 }, { "epoch": 0.45061537161686427, "eval_loss": 0.15805742144584656, "eval_runtime": 56.952, "eval_samples_per_second": 50.393, "eval_steps_per_second": 6.304, "step": 2000 }, { "epoch": 0.4508406793026727, "grad_norm": 0.46526999043209805, "learning_rate": 3.1067727747540797e-06, "loss": 0.1411, "step": 2001 }, { "epoch": 0.45106598698848116, "grad_norm": 0.4607371866430295, "learning_rate": 3.1049654099222542e-06, "loss": 0.1581, "step": 2002 }, { "epoch": 0.4512912946742896, "grad_norm": 0.49592540797717755, "learning_rate": 3.1031577091756852e-06, "loss": 0.1536, "step": 2003 }, { "epoch": 0.451516602360098, "grad_norm": 0.46818841838662356, "learning_rate": 3.1013496735181232e-06, "loss": 0.1492, "step": 2004 }, { "epoch": 0.4517419100459064, "grad_norm": 0.46040611469553566, "learning_rate": 3.0995413039535017e-06, "loss": 0.1534, "step": 2005 }, { "epoch": 0.4519672177317149, "grad_norm": 0.4973561683635516, "learning_rate": 3.0977326014859415e-06, "loss": 0.1725, "step": 2006 }, { "epoch": 0.4521925254175233, "grad_norm": 0.460212661652161, "learning_rate": 3.095923567119748e-06, "loss": 0.1557, "step": 2007 }, { "epoch": 0.4524178331033317, "grad_norm": 0.4715234518160938, "learning_rate": 3.09411420185941e-06, "loss": 0.1614, "step": 2008 }, { "epoch": 0.4526431407891402, "grad_norm": 0.47436206347918625, "learning_rate": 3.0923045067096e-06, "loss": 0.155, "step": 2009 }, { "epoch": 0.4528684484749486, "grad_norm": 0.48006343097393855, "learning_rate": 3.090494482675176e-06, "loss": 0.1714, "step": 2010 }, { "epoch": 0.45309375616075703, "grad_norm": 0.48029448201908953, "learning_rate": 3.088684130761175e-06, "loss": 0.1661, "step": 2011 }, { "epoch": 0.45331906384656545, "grad_norm": 0.4503633138858404, "learning_rate": 3.0868734519728194e-06, "loss": 0.1425, "step": 2012 }, { "epoch": 0.4535443715323739, "grad_norm": 0.5150747727818801, "learning_rate": 3.085062447315511e-06, "loss": 0.1675, "step": 2013 }, { "epoch": 0.45376967921818234, "grad_norm": 0.44458727455954306, "learning_rate": 3.0832511177948326e-06, "loss": 0.1468, "step": 2014 }, { "epoch": 0.45399498690399076, "grad_norm": 0.5048541437741968, "learning_rate": 3.081439464416549e-06, "loss": 0.173, "step": 2015 }, { "epoch": 0.4542202945897992, "grad_norm": 0.4810433869141697, "learning_rate": 3.0796274881866034e-06, "loss": 0.1647, "step": 2016 }, { "epoch": 0.45444560227560765, "grad_norm": 0.4669019629530169, "learning_rate": 3.0778151901111187e-06, "loss": 0.1625, "step": 2017 }, { "epoch": 0.45467090996141607, "grad_norm": 0.44513180434068006, "learning_rate": 3.0760025711963964e-06, "loss": 0.1404, "step": 2018 }, { "epoch": 0.4548962176472245, "grad_norm": 0.5034063685601008, "learning_rate": 3.0741896324489163e-06, "loss": 0.1702, "step": 2019 }, { "epoch": 0.4551215253330329, "grad_norm": 0.4557191252849021, "learning_rate": 3.0723763748753354e-06, "loss": 0.1516, "step": 2020 }, { "epoch": 0.4553468330188414, "grad_norm": 0.48336102411704346, "learning_rate": 3.0705627994824887e-06, "loss": 0.1595, "step": 2021 }, { "epoch": 0.4555721407046498, "grad_norm": 0.4485419976949585, "learning_rate": 3.0687489072773864e-06, "loss": 0.1499, "step": 2022 }, { "epoch": 0.4557974483904582, "grad_norm": 0.4615635228199084, "learning_rate": 3.0669346992672156e-06, "loss": 0.1488, "step": 2023 }, { "epoch": 0.45602275607626663, "grad_norm": 0.47702823681473344, "learning_rate": 3.0651201764593375e-06, "loss": 0.1628, "step": 2024 }, { "epoch": 0.4562480637620751, "grad_norm": 0.46354927078581676, "learning_rate": 3.06330533986129e-06, "loss": 0.1554, "step": 2025 }, { "epoch": 0.4564733714478835, "grad_norm": 0.4654409790097615, "learning_rate": 3.0614901904807836e-06, "loss": 0.1536, "step": 2026 }, { "epoch": 0.45669867913369194, "grad_norm": 0.4515681205108471, "learning_rate": 3.0596747293257047e-06, "loss": 0.154, "step": 2027 }, { "epoch": 0.45692398681950036, "grad_norm": 0.4789039895573779, "learning_rate": 3.0578589574041097e-06, "loss": 0.1495, "step": 2028 }, { "epoch": 0.45714929450530883, "grad_norm": 0.470697922964349, "learning_rate": 3.056042875724228e-06, "loss": 0.1676, "step": 2029 }, { "epoch": 0.45737460219111725, "grad_norm": 0.4563412984778146, "learning_rate": 3.0542264852944635e-06, "loss": 0.145, "step": 2030 }, { "epoch": 0.45759990987692567, "grad_norm": 0.48369559208181934, "learning_rate": 3.052409787123391e-06, "loss": 0.1634, "step": 2031 }, { "epoch": 0.4578252175627341, "grad_norm": 0.4564814760737555, "learning_rate": 3.0505927822197533e-06, "loss": 0.1509, "step": 2032 }, { "epoch": 0.45805052524854256, "grad_norm": 0.4610972303240204, "learning_rate": 3.0487754715924674e-06, "loss": 0.1413, "step": 2033 }, { "epoch": 0.458275832934351, "grad_norm": 0.5102253098565472, "learning_rate": 3.0469578562506165e-06, "loss": 0.1575, "step": 2034 }, { "epoch": 0.4585011406201594, "grad_norm": 0.47644473351215494, "learning_rate": 3.045139937203455e-06, "loss": 0.155, "step": 2035 }, { "epoch": 0.4587264483059678, "grad_norm": 0.4840780941352861, "learning_rate": 3.0433217154604067e-06, "loss": 0.1725, "step": 2036 }, { "epoch": 0.4589517559917763, "grad_norm": 0.4675495037083089, "learning_rate": 3.0415031920310613e-06, "loss": 0.1578, "step": 2037 }, { "epoch": 0.4591770636775847, "grad_norm": 0.466053222288305, "learning_rate": 3.0396843679251777e-06, "loss": 0.1557, "step": 2038 }, { "epoch": 0.4594023713633931, "grad_norm": 0.4658213885418776, "learning_rate": 3.03786524415268e-06, "loss": 0.1509, "step": 2039 }, { "epoch": 0.45962767904920154, "grad_norm": 0.47486803158127977, "learning_rate": 3.0360458217236604e-06, "loss": 0.158, "step": 2040 }, { "epoch": 0.45985298673501, "grad_norm": 0.46493147154073844, "learning_rate": 3.034226101648377e-06, "loss": 0.1562, "step": 2041 }, { "epoch": 0.46007829442081843, "grad_norm": 0.46366054346391106, "learning_rate": 3.0324060849372526e-06, "loss": 0.1481, "step": 2042 }, { "epoch": 0.46030360210662685, "grad_norm": 0.4471818204754162, "learning_rate": 3.0305857726008736e-06, "loss": 0.1502, "step": 2043 }, { "epoch": 0.46052890979243527, "grad_norm": 0.45710029387132534, "learning_rate": 3.028765165649992e-06, "loss": 0.1526, "step": 2044 }, { "epoch": 0.46075421747824374, "grad_norm": 0.4532254215793454, "learning_rate": 3.026944265095524e-06, "loss": 0.1428, "step": 2045 }, { "epoch": 0.46097952516405216, "grad_norm": 0.4781547388158946, "learning_rate": 3.0251230719485465e-06, "loss": 0.1522, "step": 2046 }, { "epoch": 0.4612048328498606, "grad_norm": 0.4679788316950286, "learning_rate": 3.0233015872203004e-06, "loss": 0.1549, "step": 2047 }, { "epoch": 0.461430140535669, "grad_norm": 0.457693832340645, "learning_rate": 3.0214798119221884e-06, "loss": 0.1559, "step": 2048 }, { "epoch": 0.46165544822147747, "grad_norm": 0.4618043640061542, "learning_rate": 3.0196577470657744e-06, "loss": 0.1507, "step": 2049 }, { "epoch": 0.4618807559072859, "grad_norm": 0.46599511428045387, "learning_rate": 3.0178353936627835e-06, "loss": 0.1571, "step": 2050 }, { "epoch": 0.4621060635930943, "grad_norm": 0.48849016131669354, "learning_rate": 3.0160127527250993e-06, "loss": 0.1559, "step": 2051 }, { "epoch": 0.4623313712789027, "grad_norm": 0.4734928184582273, "learning_rate": 3.0141898252647682e-06, "loss": 0.1518, "step": 2052 }, { "epoch": 0.4625566789647112, "grad_norm": 0.47462773177763673, "learning_rate": 3.012366612293993e-06, "loss": 0.1595, "step": 2053 }, { "epoch": 0.4627819866505196, "grad_norm": 0.49632927526910214, "learning_rate": 3.0105431148251364e-06, "loss": 0.168, "step": 2054 }, { "epoch": 0.46300729433632803, "grad_norm": 0.44508473522061126, "learning_rate": 3.0087193338707175e-06, "loss": 0.1431, "step": 2055 }, { "epoch": 0.46323260202213645, "grad_norm": 0.4850902705606218, "learning_rate": 3.0068952704434145e-06, "loss": 0.1682, "step": 2056 }, { "epoch": 0.4634579097079449, "grad_norm": 0.4585047739418811, "learning_rate": 3.0050709255560616e-06, "loss": 0.1492, "step": 2057 }, { "epoch": 0.46368321739375334, "grad_norm": 0.4730071664371849, "learning_rate": 3.0032463002216504e-06, "loss": 0.1599, "step": 2058 }, { "epoch": 0.46390852507956176, "grad_norm": 0.47482057227207636, "learning_rate": 3.0014213954533265e-06, "loss": 0.1533, "step": 2059 }, { "epoch": 0.46413383276537024, "grad_norm": 0.4421130204730638, "learning_rate": 2.999596212264392e-06, "loss": 0.1522, "step": 2060 }, { "epoch": 0.46435914045117865, "grad_norm": 0.46017986322521726, "learning_rate": 2.997770751668302e-06, "loss": 0.1489, "step": 2061 }, { "epoch": 0.46458444813698707, "grad_norm": 0.48128701337916485, "learning_rate": 2.9959450146786674e-06, "loss": 0.1647, "step": 2062 }, { "epoch": 0.4648097558227955, "grad_norm": 0.4892451718733948, "learning_rate": 2.994119002309253e-06, "loss": 0.1684, "step": 2063 }, { "epoch": 0.46503506350860396, "grad_norm": 0.4825674322636185, "learning_rate": 2.9922927155739737e-06, "loss": 0.1642, "step": 2064 }, { "epoch": 0.4652603711944124, "grad_norm": 0.4603373858917702, "learning_rate": 2.9904661554868997e-06, "loss": 0.1404, "step": 2065 }, { "epoch": 0.4654856788802208, "grad_norm": 0.46565245742689776, "learning_rate": 2.9886393230622507e-06, "loss": 0.1427, "step": 2066 }, { "epoch": 0.4657109865660292, "grad_norm": 0.484180951774229, "learning_rate": 2.986812219314399e-06, "loss": 0.16, "step": 2067 }, { "epoch": 0.4659362942518377, "grad_norm": 0.4941176192277806, "learning_rate": 2.984984845257868e-06, "loss": 0.1671, "step": 2068 }, { "epoch": 0.4661616019376461, "grad_norm": 0.4853395880449739, "learning_rate": 2.983157201907329e-06, "loss": 0.1632, "step": 2069 }, { "epoch": 0.4663869096234545, "grad_norm": 0.484149010323965, "learning_rate": 2.981329290277605e-06, "loss": 0.1486, "step": 2070 }, { "epoch": 0.46661221730926294, "grad_norm": 0.47891308586143655, "learning_rate": 2.9795011113836686e-06, "loss": 0.1537, "step": 2071 }, { "epoch": 0.4668375249950714, "grad_norm": 0.4927883402568322, "learning_rate": 2.977672666240636e-06, "loss": 0.1639, "step": 2072 }, { "epoch": 0.46706283268087984, "grad_norm": 0.4744327591106369, "learning_rate": 2.9758439558637774e-06, "loss": 0.1562, "step": 2073 }, { "epoch": 0.46728814036668825, "grad_norm": 0.4941847293633856, "learning_rate": 2.974014981268507e-06, "loss": 0.159, "step": 2074 }, { "epoch": 0.46751344805249667, "grad_norm": 0.4943272692654053, "learning_rate": 2.972185743470386e-06, "loss": 0.1601, "step": 2075 }, { "epoch": 0.46773875573830515, "grad_norm": 0.46189492924477654, "learning_rate": 2.9703562434851218e-06, "loss": 0.15, "step": 2076 }, { "epoch": 0.46796406342411356, "grad_norm": 0.4392177203658699, "learning_rate": 2.9685264823285676e-06, "loss": 0.1428, "step": 2077 }, { "epoch": 0.468189371109922, "grad_norm": 0.4660710812280689, "learning_rate": 2.966696461016721e-06, "loss": 0.1535, "step": 2078 }, { "epoch": 0.4684146787957304, "grad_norm": 0.4464453798972159, "learning_rate": 2.964866180565725e-06, "loss": 0.1412, "step": 2079 }, { "epoch": 0.4686399864815389, "grad_norm": 0.47979725171124965, "learning_rate": 2.9630356419918682e-06, "loss": 0.1612, "step": 2080 }, { "epoch": 0.4688652941673473, "grad_norm": 0.4805922736007531, "learning_rate": 2.9612048463115774e-06, "loss": 0.1488, "step": 2081 }, { "epoch": 0.4690906018531557, "grad_norm": 0.4956729872123908, "learning_rate": 2.9593737945414264e-06, "loss": 0.1655, "step": 2082 }, { "epoch": 0.4693159095389641, "grad_norm": 0.4920903518112405, "learning_rate": 2.9575424876981298e-06, "loss": 0.1586, "step": 2083 }, { "epoch": 0.4695412172247726, "grad_norm": 0.46898887966562713, "learning_rate": 2.9557109267985445e-06, "loss": 0.1447, "step": 2084 }, { "epoch": 0.469766524910581, "grad_norm": 0.46688066208914697, "learning_rate": 2.953879112859668e-06, "loss": 0.1487, "step": 2085 }, { "epoch": 0.46999183259638944, "grad_norm": 0.47890920214338073, "learning_rate": 2.952047046898637e-06, "loss": 0.1704, "step": 2086 }, { "epoch": 0.47021714028219785, "grad_norm": 0.4993549169939433, "learning_rate": 2.9502147299327316e-06, "loss": 0.171, "step": 2087 }, { "epoch": 0.4704424479680063, "grad_norm": 0.4829359338136954, "learning_rate": 2.9483821629793673e-06, "loss": 0.1616, "step": 2088 }, { "epoch": 0.47066775565381475, "grad_norm": 0.4999751762418861, "learning_rate": 2.946549347056101e-06, "loss": 0.1657, "step": 2089 }, { "epoch": 0.47089306333962316, "grad_norm": 0.47745495500884666, "learning_rate": 2.9447162831806275e-06, "loss": 0.1555, "step": 2090 }, { "epoch": 0.4711183710254316, "grad_norm": 0.46324150350719656, "learning_rate": 2.942882972370778e-06, "loss": 0.1382, "step": 2091 }, { "epoch": 0.47134367871124006, "grad_norm": 0.47875126249754046, "learning_rate": 2.941049415644522e-06, "loss": 0.1567, "step": 2092 }, { "epoch": 0.4715689863970485, "grad_norm": 0.48557336281168423, "learning_rate": 2.9392156140199644e-06, "loss": 0.1587, "step": 2093 }, { "epoch": 0.4717942940828569, "grad_norm": 0.49638521974648225, "learning_rate": 2.9373815685153485e-06, "loss": 0.162, "step": 2094 }, { "epoch": 0.4720196017686653, "grad_norm": 0.49484643934095707, "learning_rate": 2.93554728014905e-06, "loss": 0.16, "step": 2095 }, { "epoch": 0.4722449094544738, "grad_norm": 0.46146720058527874, "learning_rate": 2.933712749939582e-06, "loss": 0.1534, "step": 2096 }, { "epoch": 0.4724702171402822, "grad_norm": 0.4545334918204591, "learning_rate": 2.9318779789055894e-06, "loss": 0.1432, "step": 2097 }, { "epoch": 0.4726955248260906, "grad_norm": 0.4554966952835028, "learning_rate": 2.9300429680658538e-06, "loss": 0.1435, "step": 2098 }, { "epoch": 0.47292083251189904, "grad_norm": 0.4724184782115837, "learning_rate": 2.928207718439287e-06, "loss": 0.1518, "step": 2099 }, { "epoch": 0.4731461401977075, "grad_norm": 0.4845200135364393, "learning_rate": 2.9263722310449353e-06, "loss": 0.1554, "step": 2100 }, { "epoch": 0.47337144788351593, "grad_norm": 0.46797662996106126, "learning_rate": 2.924536506901976e-06, "loss": 0.1536, "step": 2101 }, { "epoch": 0.47359675556932435, "grad_norm": 0.47906899237945594, "learning_rate": 2.9227005470297194e-06, "loss": 0.1649, "step": 2102 }, { "epoch": 0.47382206325513276, "grad_norm": 0.4618144263119681, "learning_rate": 2.9208643524476037e-06, "loss": 0.1518, "step": 2103 }, { "epoch": 0.47404737094094124, "grad_norm": 0.46748084889828573, "learning_rate": 2.919027924175201e-06, "loss": 0.1492, "step": 2104 }, { "epoch": 0.47427267862674966, "grad_norm": 0.5056719949637513, "learning_rate": 2.9171912632322102e-06, "loss": 0.1619, "step": 2105 }, { "epoch": 0.4744979863125581, "grad_norm": 0.48782787073562445, "learning_rate": 2.915354370638462e-06, "loss": 0.1598, "step": 2106 }, { "epoch": 0.47472329399836655, "grad_norm": 0.47920209045806517, "learning_rate": 2.913517247413914e-06, "loss": 0.1497, "step": 2107 }, { "epoch": 0.47494860168417496, "grad_norm": 0.469980252183471, "learning_rate": 2.9116798945786515e-06, "loss": 0.1575, "step": 2108 }, { "epoch": 0.4751739093699834, "grad_norm": 0.4846588971324414, "learning_rate": 2.909842313152888e-06, "loss": 0.1511, "step": 2109 }, { "epoch": 0.4753992170557918, "grad_norm": 0.5070918931965868, "learning_rate": 2.9080045041569647e-06, "loss": 0.1716, "step": 2110 }, { "epoch": 0.4756245247416003, "grad_norm": 0.4917914827122371, "learning_rate": 2.9061664686113487e-06, "loss": 0.1559, "step": 2111 }, { "epoch": 0.4758498324274087, "grad_norm": 0.489691610534915, "learning_rate": 2.904328207536632e-06, "loss": 0.1543, "step": 2112 }, { "epoch": 0.4760751401132171, "grad_norm": 0.48507920331153476, "learning_rate": 2.9024897219535326e-06, "loss": 0.1653, "step": 2113 }, { "epoch": 0.47630044779902553, "grad_norm": 0.47914495383302097, "learning_rate": 2.900651012882893e-06, "loss": 0.152, "step": 2114 }, { "epoch": 0.476525755484834, "grad_norm": 0.49396402566447883, "learning_rate": 2.8988120813456794e-06, "loss": 0.1613, "step": 2115 }, { "epoch": 0.4767510631706424, "grad_norm": 0.4811425757009818, "learning_rate": 2.896972928362983e-06, "loss": 0.1578, "step": 2116 }, { "epoch": 0.47697637085645084, "grad_norm": 0.47253258227579886, "learning_rate": 2.8951335549560156e-06, "loss": 0.1679, "step": 2117 }, { "epoch": 0.47720167854225926, "grad_norm": 0.4684167266249312, "learning_rate": 2.893293962146114e-06, "loss": 0.1509, "step": 2118 }, { "epoch": 0.47742698622806773, "grad_norm": 0.4615876351385914, "learning_rate": 2.8914541509547345e-06, "loss": 0.1528, "step": 2119 }, { "epoch": 0.47765229391387615, "grad_norm": 0.4725901325614137, "learning_rate": 2.8896141224034554e-06, "loss": 0.144, "step": 2120 }, { "epoch": 0.47787760159968456, "grad_norm": 0.47688211275536097, "learning_rate": 2.8877738775139767e-06, "loss": 0.1503, "step": 2121 }, { "epoch": 0.478102909285493, "grad_norm": 0.461565550780441, "learning_rate": 2.885933417308118e-06, "loss": 0.1392, "step": 2122 }, { "epoch": 0.47832821697130146, "grad_norm": 0.47986650743199105, "learning_rate": 2.8840927428078185e-06, "loss": 0.151, "step": 2123 }, { "epoch": 0.4785535246571099, "grad_norm": 0.5102470451322486, "learning_rate": 2.8822518550351356e-06, "loss": 0.177, "step": 2124 }, { "epoch": 0.4787788323429183, "grad_norm": 0.4810562269804644, "learning_rate": 2.8804107550122453e-06, "loss": 0.1599, "step": 2125 }, { "epoch": 0.4790041400287267, "grad_norm": 0.49106145138432467, "learning_rate": 2.878569443761442e-06, "loss": 0.1552, "step": 2126 }, { "epoch": 0.4792294477145352, "grad_norm": 0.4887172792733754, "learning_rate": 2.8767279223051375e-06, "loss": 0.1536, "step": 2127 }, { "epoch": 0.4794547554003436, "grad_norm": 0.4689500997214312, "learning_rate": 2.87488619166586e-06, "loss": 0.1549, "step": 2128 }, { "epoch": 0.479680063086152, "grad_norm": 0.45929306443811646, "learning_rate": 2.8730442528662537e-06, "loss": 0.1614, "step": 2129 }, { "epoch": 0.47990537077196044, "grad_norm": 0.4871163529397986, "learning_rate": 2.8712021069290786e-06, "loss": 0.1544, "step": 2130 }, { "epoch": 0.4801306784577689, "grad_norm": 0.4817155544257701, "learning_rate": 2.869359754877209e-06, "loss": 0.1588, "step": 2131 }, { "epoch": 0.48035598614357733, "grad_norm": 0.4484801331749894, "learning_rate": 2.8675171977336357e-06, "loss": 0.1442, "step": 2132 }, { "epoch": 0.48058129382938575, "grad_norm": 0.4641674894247696, "learning_rate": 2.8656744365214622e-06, "loss": 0.1528, "step": 2133 }, { "epoch": 0.48080660151519417, "grad_norm": 0.5012156150664434, "learning_rate": 2.863831472263904e-06, "loss": 0.1566, "step": 2134 }, { "epoch": 0.48103190920100264, "grad_norm": 0.4633341637924984, "learning_rate": 2.8619883059842897e-06, "loss": 0.1427, "step": 2135 }, { "epoch": 0.48125721688681106, "grad_norm": 0.4797143088628369, "learning_rate": 2.8601449387060622e-06, "loss": 0.161, "step": 2136 }, { "epoch": 0.4814825245726195, "grad_norm": 0.47702027421937043, "learning_rate": 2.858301371452774e-06, "loss": 0.1428, "step": 2137 }, { "epoch": 0.4817078322584279, "grad_norm": 0.5015112965273413, "learning_rate": 2.8564576052480895e-06, "loss": 0.168, "step": 2138 }, { "epoch": 0.48193313994423637, "grad_norm": 0.4822255713488226, "learning_rate": 2.8546136411157843e-06, "loss": 0.1541, "step": 2139 }, { "epoch": 0.4821584476300448, "grad_norm": 0.465307141222416, "learning_rate": 2.8527694800797417e-06, "loss": 0.1619, "step": 2140 }, { "epoch": 0.4823837553158532, "grad_norm": 0.50266265761066, "learning_rate": 2.850925123163956e-06, "loss": 0.1698, "step": 2141 }, { "epoch": 0.4826090630016616, "grad_norm": 0.4421128901521324, "learning_rate": 2.8490805713925298e-06, "loss": 0.1435, "step": 2142 }, { "epoch": 0.4828343706874701, "grad_norm": 0.4812431803950061, "learning_rate": 2.847235825789673e-06, "loss": 0.1539, "step": 2143 }, { "epoch": 0.4830596783732785, "grad_norm": 0.4818346965554266, "learning_rate": 2.845390887379706e-06, "loss": 0.1687, "step": 2144 }, { "epoch": 0.48328498605908693, "grad_norm": 0.5085616042672392, "learning_rate": 2.8435457571870527e-06, "loss": 0.1747, "step": 2145 }, { "epoch": 0.48351029374489535, "grad_norm": 0.44676664208314826, "learning_rate": 2.8417004362362465e-06, "loss": 0.131, "step": 2146 }, { "epoch": 0.4837356014307038, "grad_norm": 0.47361874739111104, "learning_rate": 2.8398549255519237e-06, "loss": 0.1602, "step": 2147 }, { "epoch": 0.48396090911651224, "grad_norm": 0.4763300815360598, "learning_rate": 2.838009226158829e-06, "loss": 0.1634, "step": 2148 }, { "epoch": 0.48418621680232066, "grad_norm": 0.5197625242097161, "learning_rate": 2.83616333908181e-06, "loss": 0.1731, "step": 2149 }, { "epoch": 0.4844115244881291, "grad_norm": 0.4824667341017959, "learning_rate": 2.8343172653458194e-06, "loss": 0.1648, "step": 2150 }, { "epoch": 0.48463683217393755, "grad_norm": 0.4719192705898006, "learning_rate": 2.8324710059759126e-06, "loss": 0.1591, "step": 2151 }, { "epoch": 0.48486213985974597, "grad_norm": 0.4515406394716849, "learning_rate": 2.8306245619972476e-06, "loss": 0.1483, "step": 2152 }, { "epoch": 0.4850874475455544, "grad_norm": 0.4818085870458921, "learning_rate": 2.828777934435088e-06, "loss": 0.1657, "step": 2153 }, { "epoch": 0.4853127552313628, "grad_norm": 0.4948205614656545, "learning_rate": 2.826931124314796e-06, "loss": 0.1691, "step": 2154 }, { "epoch": 0.4855380629171713, "grad_norm": 0.47291934012520137, "learning_rate": 2.8250841326618367e-06, "loss": 0.1527, "step": 2155 }, { "epoch": 0.4857633706029797, "grad_norm": 0.48355294932750525, "learning_rate": 2.8232369605017757e-06, "loss": 0.1592, "step": 2156 }, { "epoch": 0.4859886782887881, "grad_norm": 0.4768913578131576, "learning_rate": 2.8213896088602786e-06, "loss": 0.1487, "step": 2157 }, { "epoch": 0.4862139859745966, "grad_norm": 0.5053824042202755, "learning_rate": 2.8195420787631113e-06, "loss": 0.1714, "step": 2158 }, { "epoch": 0.486439293660405, "grad_norm": 0.487503834880126, "learning_rate": 2.8176943712361394e-06, "loss": 0.1627, "step": 2159 }, { "epoch": 0.4866646013462134, "grad_norm": 0.4992652219189248, "learning_rate": 2.8158464873053236e-06, "loss": 0.153, "step": 2160 }, { "epoch": 0.48688990903202184, "grad_norm": 0.4781211578280688, "learning_rate": 2.8139984279967265e-06, "loss": 0.1569, "step": 2161 }, { "epoch": 0.4871152167178303, "grad_norm": 0.45967039527673825, "learning_rate": 2.8121501943365066e-06, "loss": 0.1404, "step": 2162 }, { "epoch": 0.48734052440363873, "grad_norm": 0.46645982532415253, "learning_rate": 2.810301787350918e-06, "loss": 0.1562, "step": 2163 }, { "epoch": 0.48756583208944715, "grad_norm": 0.46918371300837053, "learning_rate": 2.808453208066314e-06, "loss": 0.1465, "step": 2164 }, { "epoch": 0.48779113977525557, "grad_norm": 0.46548524196093854, "learning_rate": 2.8066044575091404e-06, "loss": 0.1522, "step": 2165 }, { "epoch": 0.48801644746106404, "grad_norm": 0.5289539976247626, "learning_rate": 2.8047555367059404e-06, "loss": 0.1739, "step": 2166 }, { "epoch": 0.48824175514687246, "grad_norm": 0.458664098217161, "learning_rate": 2.80290644668335e-06, "loss": 0.1405, "step": 2167 }, { "epoch": 0.4884670628326809, "grad_norm": 0.4717126138511026, "learning_rate": 2.8010571884681004e-06, "loss": 0.1542, "step": 2168 }, { "epoch": 0.4886923705184893, "grad_norm": 0.4601707871424624, "learning_rate": 2.799207763087015e-06, "loss": 0.1484, "step": 2169 }, { "epoch": 0.48891767820429777, "grad_norm": 0.4711813562824451, "learning_rate": 2.7973581715670124e-06, "loss": 0.1492, "step": 2170 }, { "epoch": 0.4891429858901062, "grad_norm": 0.4512983700781856, "learning_rate": 2.7955084149351002e-06, "loss": 0.1559, "step": 2171 }, { "epoch": 0.4893682935759146, "grad_norm": 0.49222156857332283, "learning_rate": 2.7936584942183804e-06, "loss": 0.1596, "step": 2172 }, { "epoch": 0.489593601261723, "grad_norm": 0.46487770605974843, "learning_rate": 2.7918084104440446e-06, "loss": 0.1534, "step": 2173 }, { "epoch": 0.4898189089475315, "grad_norm": 0.46155781463668333, "learning_rate": 2.7899581646393746e-06, "loss": 0.1585, "step": 2174 }, { "epoch": 0.4900442166333399, "grad_norm": 0.48435801828172664, "learning_rate": 2.7881077578317445e-06, "loss": 0.1561, "step": 2175 }, { "epoch": 0.49026952431914833, "grad_norm": 0.46865565885766813, "learning_rate": 2.7862571910486148e-06, "loss": 0.155, "step": 2176 }, { "epoch": 0.49049483200495675, "grad_norm": 0.5237421496925503, "learning_rate": 2.784406465317538e-06, "loss": 0.1592, "step": 2177 }, { "epoch": 0.4907201396907652, "grad_norm": 0.5063351392528739, "learning_rate": 2.7825555816661503e-06, "loss": 0.176, "step": 2178 }, { "epoch": 0.49094544737657364, "grad_norm": 0.45029290239029635, "learning_rate": 2.7807045411221813e-06, "loss": 0.1497, "step": 2179 }, { "epoch": 0.49117075506238206, "grad_norm": 0.4807262732955275, "learning_rate": 2.778853344713443e-06, "loss": 0.1523, "step": 2180 }, { "epoch": 0.4913960627481905, "grad_norm": 0.4527240365909731, "learning_rate": 2.777001993467837e-06, "loss": 0.1503, "step": 2181 }, { "epoch": 0.49162137043399895, "grad_norm": 0.4750132640988713, "learning_rate": 2.7751504884133484e-06, "loss": 0.1504, "step": 2182 }, { "epoch": 0.49184667811980737, "grad_norm": 0.5064783537919353, "learning_rate": 2.7732988305780496e-06, "loss": 0.1604, "step": 2183 }, { "epoch": 0.4920719858056158, "grad_norm": 0.4501987876305936, "learning_rate": 2.7714470209900974e-06, "loss": 0.1512, "step": 2184 }, { "epoch": 0.4922972934914242, "grad_norm": 0.4760064527034139, "learning_rate": 2.769595060677732e-06, "loss": 0.149, "step": 2185 }, { "epoch": 0.4925226011772327, "grad_norm": 0.5027689026168123, "learning_rate": 2.7677429506692788e-06, "loss": 0.1527, "step": 2186 }, { "epoch": 0.4927479088630411, "grad_norm": 0.4722739354725667, "learning_rate": 2.7658906919931443e-06, "loss": 0.1512, "step": 2187 }, { "epoch": 0.4929732165488495, "grad_norm": 0.48734288018027916, "learning_rate": 2.76403828567782e-06, "loss": 0.1553, "step": 2188 }, { "epoch": 0.49319852423465793, "grad_norm": 0.46063676497770223, "learning_rate": 2.7621857327518763e-06, "loss": 0.1609, "step": 2189 }, { "epoch": 0.4934238319204664, "grad_norm": 0.4881192023888224, "learning_rate": 2.7603330342439686e-06, "loss": 0.1637, "step": 2190 }, { "epoch": 0.4936491396062748, "grad_norm": 0.46588329681809043, "learning_rate": 2.7584801911828314e-06, "loss": 0.1597, "step": 2191 }, { "epoch": 0.49387444729208324, "grad_norm": 0.48151698956393363, "learning_rate": 2.7566272045972777e-06, "loss": 0.1555, "step": 2192 }, { "epoch": 0.49409975497789166, "grad_norm": 0.44929106962011756, "learning_rate": 2.7547740755162034e-06, "loss": 0.139, "step": 2193 }, { "epoch": 0.49432506266370013, "grad_norm": 0.4619821656904074, "learning_rate": 2.752920804968581e-06, "loss": 0.1516, "step": 2194 }, { "epoch": 0.49455037034950855, "grad_norm": 0.4624932946392754, "learning_rate": 2.7510673939834633e-06, "loss": 0.1579, "step": 2195 }, { "epoch": 0.49477567803531697, "grad_norm": 0.44059934991547617, "learning_rate": 2.7492138435899794e-06, "loss": 0.1533, "step": 2196 }, { "epoch": 0.4950009857211254, "grad_norm": 0.46495736661665293, "learning_rate": 2.747360154817338e-06, "loss": 0.1568, "step": 2197 }, { "epoch": 0.49522629340693386, "grad_norm": 0.4471263495684936, "learning_rate": 2.745506328694822e-06, "loss": 0.1505, "step": 2198 }, { "epoch": 0.4954516010927423, "grad_norm": 0.47502698148207845, "learning_rate": 2.743652366251793e-06, "loss": 0.1528, "step": 2199 }, { "epoch": 0.4956769087785507, "grad_norm": 0.474101185921597, "learning_rate": 2.741798268517687e-06, "loss": 0.1444, "step": 2200 }, { "epoch": 0.4959022164643591, "grad_norm": 0.4950438275209808, "learning_rate": 2.7399440365220153e-06, "loss": 0.159, "step": 2201 }, { "epoch": 0.4961275241501676, "grad_norm": 0.48514409432512545, "learning_rate": 2.738089671294364e-06, "loss": 0.1623, "step": 2202 }, { "epoch": 0.496352831835976, "grad_norm": 0.49897046758102964, "learning_rate": 2.7362351738643926e-06, "loss": 0.1714, "step": 2203 }, { "epoch": 0.4965781395217844, "grad_norm": 0.4944720313099016, "learning_rate": 2.734380545261835e-06, "loss": 0.1454, "step": 2204 }, { "epoch": 0.4968034472075929, "grad_norm": 0.4740848361304964, "learning_rate": 2.7325257865164955e-06, "loss": 0.1505, "step": 2205 }, { "epoch": 0.4970287548934013, "grad_norm": 0.5044552603154835, "learning_rate": 2.730670898658255e-06, "loss": 0.1779, "step": 2206 }, { "epoch": 0.49725406257920973, "grad_norm": 0.46394146351071547, "learning_rate": 2.7288158827170623e-06, "loss": 0.1396, "step": 2207 }, { "epoch": 0.49747937026501815, "grad_norm": 0.49280257361880014, "learning_rate": 2.726960739722939e-06, "loss": 0.1622, "step": 2208 }, { "epoch": 0.4977046779508266, "grad_norm": 0.4867686438552059, "learning_rate": 2.725105470705977e-06, "loss": 0.1581, "step": 2209 }, { "epoch": 0.49792998563663504, "grad_norm": 0.5219938141413315, "learning_rate": 2.7232500766963373e-06, "loss": 0.1686, "step": 2210 }, { "epoch": 0.49815529332244346, "grad_norm": 0.4764321227719793, "learning_rate": 2.7213945587242507e-06, "loss": 0.1567, "step": 2211 }, { "epoch": 0.4983806010082519, "grad_norm": 0.49384595085813865, "learning_rate": 2.7195389178200194e-06, "loss": 0.1583, "step": 2212 }, { "epoch": 0.49860590869406035, "grad_norm": 0.5075757200906527, "learning_rate": 2.7176831550140093e-06, "loss": 0.1676, "step": 2213 }, { "epoch": 0.49883121637986877, "grad_norm": 0.4524722468045472, "learning_rate": 2.7158272713366573e-06, "loss": 0.1384, "step": 2214 }, { "epoch": 0.4990565240656772, "grad_norm": 0.5083060899486224, "learning_rate": 2.713971267818466e-06, "loss": 0.1613, "step": 2215 }, { "epoch": 0.4992818317514856, "grad_norm": 0.4396851281617713, "learning_rate": 2.7121151454900048e-06, "loss": 0.1482, "step": 2216 }, { "epoch": 0.4995071394372941, "grad_norm": 0.46962633997335007, "learning_rate": 2.7102589053819107e-06, "loss": 0.147, "step": 2217 }, { "epoch": 0.4997324471231025, "grad_norm": 0.5126065467533945, "learning_rate": 2.7084025485248827e-06, "loss": 0.1575, "step": 2218 }, { "epoch": 0.4999577548089109, "grad_norm": 0.4636702745429642, "learning_rate": 2.706546075949688e-06, "loss": 0.1497, "step": 2219 }, { "epoch": 0.5001830624947193, "grad_norm": 0.4624702418642572, "learning_rate": 2.7046894886871564e-06, "loss": 0.1554, "step": 2220 }, { "epoch": 0.5004083701805278, "grad_norm": 0.4919207689507557, "learning_rate": 2.7028327877681808e-06, "loss": 0.1567, "step": 2221 }, { "epoch": 0.5006336778663362, "grad_norm": 0.5157069877586713, "learning_rate": 2.700975974223719e-06, "loss": 0.1561, "step": 2222 }, { "epoch": 0.5008589855521447, "grad_norm": 0.45397574648979233, "learning_rate": 2.6991190490847898e-06, "loss": 0.1463, "step": 2223 }, { "epoch": 0.5010842932379531, "grad_norm": 0.4805592141525948, "learning_rate": 2.6972620133824745e-06, "loss": 0.1544, "step": 2224 }, { "epoch": 0.5013096009237615, "grad_norm": 0.45753171981042684, "learning_rate": 2.695404868147916e-06, "loss": 0.1521, "step": 2225 }, { "epoch": 0.50153490860957, "grad_norm": 0.4763708134097829, "learning_rate": 2.6935476144123173e-06, "loss": 0.1489, "step": 2226 }, { "epoch": 0.5017602162953784, "grad_norm": 0.4753257879884957, "learning_rate": 2.691690253206943e-06, "loss": 0.1556, "step": 2227 }, { "epoch": 0.5019855239811868, "grad_norm": 0.4927561800774899, "learning_rate": 2.689832785563116e-06, "loss": 0.1467, "step": 2228 }, { "epoch": 0.5022108316669952, "grad_norm": 0.4890552682019787, "learning_rate": 2.6879752125122193e-06, "loss": 0.1476, "step": 2229 }, { "epoch": 0.5024361393528036, "grad_norm": 0.5019953873414026, "learning_rate": 2.6861175350856937e-06, "loss": 0.1615, "step": 2230 }, { "epoch": 0.5026614470386122, "grad_norm": 0.5010903995101, "learning_rate": 2.684259754315038e-06, "loss": 0.1736, "step": 2231 }, { "epoch": 0.5028867547244206, "grad_norm": 0.5022799823142762, "learning_rate": 2.6824018712318084e-06, "loss": 0.154, "step": 2232 }, { "epoch": 0.503112062410229, "grad_norm": 0.5222667007278232, "learning_rate": 2.6805438868676186e-06, "loss": 0.1574, "step": 2233 }, { "epoch": 0.5033373700960374, "grad_norm": 0.4734865357653425, "learning_rate": 2.6786858022541385e-06, "loss": 0.1487, "step": 2234 }, { "epoch": 0.5035626777818458, "grad_norm": 0.49629092969775523, "learning_rate": 2.676827618423093e-06, "loss": 0.169, "step": 2235 }, { "epoch": 0.5037879854676542, "grad_norm": 0.47744881578341003, "learning_rate": 2.674969336406262e-06, "loss": 0.1543, "step": 2236 }, { "epoch": 0.5040132931534627, "grad_norm": 0.49005111881485025, "learning_rate": 2.6731109572354795e-06, "loss": 0.1615, "step": 2237 }, { "epoch": 0.5042386008392711, "grad_norm": 0.5405479885895289, "learning_rate": 2.6712524819426355e-06, "loss": 0.1741, "step": 2238 }, { "epoch": 0.5044639085250796, "grad_norm": 0.5018883042516251, "learning_rate": 2.6693939115596718e-06, "loss": 0.1542, "step": 2239 }, { "epoch": 0.504689216210888, "grad_norm": 0.44832772942283, "learning_rate": 2.6675352471185824e-06, "loss": 0.1432, "step": 2240 }, { "epoch": 0.5049145238966964, "grad_norm": 0.47367279205265483, "learning_rate": 2.6656764896514152e-06, "loss": 0.1511, "step": 2241 }, { "epoch": 0.5051398315825049, "grad_norm": 0.47569526383637317, "learning_rate": 2.6638176401902693e-06, "loss": 0.1639, "step": 2242 }, { "epoch": 0.5053651392683133, "grad_norm": 0.4650884586981442, "learning_rate": 2.6619586997672923e-06, "loss": 0.1429, "step": 2243 }, { "epoch": 0.5055904469541217, "grad_norm": 0.4823395176993751, "learning_rate": 2.6600996694146876e-06, "loss": 0.1668, "step": 2244 }, { "epoch": 0.5058157546399301, "grad_norm": 0.43899046677750514, "learning_rate": 2.658240550164704e-06, "loss": 0.1468, "step": 2245 }, { "epoch": 0.5060410623257385, "grad_norm": 0.462906398923942, "learning_rate": 2.656381343049641e-06, "loss": 0.1509, "step": 2246 }, { "epoch": 0.5062663700115471, "grad_norm": 0.46216886541797236, "learning_rate": 2.654522049101847e-06, "loss": 0.1454, "step": 2247 }, { "epoch": 0.5064916776973555, "grad_norm": 0.47781029672794806, "learning_rate": 2.652662669353719e-06, "loss": 0.1625, "step": 2248 }, { "epoch": 0.5067169853831639, "grad_norm": 0.4687429076107305, "learning_rate": 2.6508032048377006e-06, "loss": 0.138, "step": 2249 }, { "epoch": 0.5069422930689723, "grad_norm": 0.49544551469790465, "learning_rate": 2.648943656586284e-06, "loss": 0.1473, "step": 2250 }, { "epoch": 0.5071676007547807, "grad_norm": 0.47331775745367966, "learning_rate": 2.6470840256320064e-06, "loss": 0.1524, "step": 2251 }, { "epoch": 0.5073929084405892, "grad_norm": 0.423740267378944, "learning_rate": 2.6452243130074523e-06, "loss": 0.1283, "step": 2252 }, { "epoch": 0.5076182161263976, "grad_norm": 0.46962236211799213, "learning_rate": 2.6433645197452493e-06, "loss": 0.1499, "step": 2253 }, { "epoch": 0.5078435238122061, "grad_norm": 0.4931593528990676, "learning_rate": 2.6415046468780726e-06, "loss": 0.1455, "step": 2254 }, { "epoch": 0.5080688314980145, "grad_norm": 0.4633383428887458, "learning_rate": 2.63964469543864e-06, "loss": 0.1384, "step": 2255 }, { "epoch": 0.5082941391838229, "grad_norm": 0.5014476308647303, "learning_rate": 2.637784666459714e-06, "loss": 0.1525, "step": 2256 }, { "epoch": 0.5085194468696314, "grad_norm": 0.5187286528852701, "learning_rate": 2.635924560974098e-06, "loss": 0.1672, "step": 2257 }, { "epoch": 0.5087447545554398, "grad_norm": 0.48859410138629195, "learning_rate": 2.6340643800146387e-06, "loss": 0.1577, "step": 2258 }, { "epoch": 0.5089700622412482, "grad_norm": 0.4758161336287191, "learning_rate": 2.6322041246142273e-06, "loss": 0.1554, "step": 2259 }, { "epoch": 0.5091953699270566, "grad_norm": 0.5204038587103618, "learning_rate": 2.6303437958057932e-06, "loss": 0.1596, "step": 2260 }, { "epoch": 0.509420677612865, "grad_norm": 0.4755339689407434, "learning_rate": 2.6284833946223075e-06, "loss": 0.1514, "step": 2261 }, { "epoch": 0.5096459852986736, "grad_norm": 0.49529525973555916, "learning_rate": 2.626622922096782e-06, "loss": 0.1662, "step": 2262 }, { "epoch": 0.509871292984482, "grad_norm": 0.47102844584098646, "learning_rate": 2.624762379262268e-06, "loss": 0.1497, "step": 2263 }, { "epoch": 0.5100966006702904, "grad_norm": 0.4819251906829047, "learning_rate": 2.622901767151855e-06, "loss": 0.1532, "step": 2264 }, { "epoch": 0.5103219083560988, "grad_norm": 0.472069862823219, "learning_rate": 2.6210410867986713e-06, "loss": 0.1436, "step": 2265 }, { "epoch": 0.5105472160419072, "grad_norm": 0.4645494432026978, "learning_rate": 2.619180339235884e-06, "loss": 0.1422, "step": 2266 }, { "epoch": 0.5107725237277156, "grad_norm": 0.47263922852142376, "learning_rate": 2.6173195254966966e-06, "loss": 0.162, "step": 2267 }, { "epoch": 0.5109978314135241, "grad_norm": 0.46779769487146516, "learning_rate": 2.6154586466143495e-06, "loss": 0.143, "step": 2268 }, { "epoch": 0.5112231390993325, "grad_norm": 0.4600174066739792, "learning_rate": 2.6135977036221195e-06, "loss": 0.1442, "step": 2269 }, { "epoch": 0.511448446785141, "grad_norm": 0.4575400490189286, "learning_rate": 2.6117366975533187e-06, "loss": 0.1512, "step": 2270 }, { "epoch": 0.5116737544709494, "grad_norm": 0.4826461966104453, "learning_rate": 2.609875629441295e-06, "loss": 0.1638, "step": 2271 }, { "epoch": 0.5118990621567578, "grad_norm": 0.4707429777697515, "learning_rate": 2.60801450031943e-06, "loss": 0.1469, "step": 2272 }, { "epoch": 0.5121243698425663, "grad_norm": 0.46468370484946786, "learning_rate": 2.6061533112211394e-06, "loss": 0.1532, "step": 2273 }, { "epoch": 0.5123496775283747, "grad_norm": 0.49977089530261853, "learning_rate": 2.604292063179871e-06, "loss": 0.1603, "step": 2274 }, { "epoch": 0.5125749852141831, "grad_norm": 0.4840088481175901, "learning_rate": 2.602430757229108e-06, "loss": 0.1599, "step": 2275 }, { "epoch": 0.5128002928999915, "grad_norm": 0.48088896715216267, "learning_rate": 2.600569394402363e-06, "loss": 0.1533, "step": 2276 }, { "epoch": 0.5130256005857999, "grad_norm": 0.5062988229080415, "learning_rate": 2.5987079757331824e-06, "loss": 0.1666, "step": 2277 }, { "epoch": 0.5132509082716085, "grad_norm": 0.4991590684305053, "learning_rate": 2.596846502255142e-06, "loss": 0.1495, "step": 2278 }, { "epoch": 0.5134762159574169, "grad_norm": 0.4712901894078783, "learning_rate": 2.5949849750018486e-06, "loss": 0.1494, "step": 2279 }, { "epoch": 0.5137015236432253, "grad_norm": 0.4566845858422049, "learning_rate": 2.5931233950069385e-06, "loss": 0.1481, "step": 2280 }, { "epoch": 0.5139268313290337, "grad_norm": 0.468692135600313, "learning_rate": 2.591261763304079e-06, "loss": 0.152, "step": 2281 }, { "epoch": 0.5141521390148421, "grad_norm": 0.48731655698950843, "learning_rate": 2.589400080926964e-06, "loss": 0.1469, "step": 2282 }, { "epoch": 0.5143774467006506, "grad_norm": 0.47987896655282924, "learning_rate": 2.5875383489093165e-06, "loss": 0.1613, "step": 2283 }, { "epoch": 0.514602754386459, "grad_norm": 0.4635204806630267, "learning_rate": 2.585676568284886e-06, "loss": 0.1532, "step": 2284 }, { "epoch": 0.5148280620722674, "grad_norm": 0.47497923474665377, "learning_rate": 2.583814740087451e-06, "loss": 0.1462, "step": 2285 }, { "epoch": 0.5150533697580759, "grad_norm": 0.5202422022869216, "learning_rate": 2.581952865350815e-06, "loss": 0.1636, "step": 2286 }, { "epoch": 0.5152786774438843, "grad_norm": 0.46226673508432603, "learning_rate": 2.5800909451088075e-06, "loss": 0.1558, "step": 2287 }, { "epoch": 0.5155039851296928, "grad_norm": 0.484793426628321, "learning_rate": 2.578228980395283e-06, "loss": 0.1523, "step": 2288 }, { "epoch": 0.5157292928155012, "grad_norm": 0.49786647707597725, "learning_rate": 2.5763669722441226e-06, "loss": 0.1626, "step": 2289 }, { "epoch": 0.5159546005013096, "grad_norm": 0.44749134037384025, "learning_rate": 2.5745049216892286e-06, "loss": 0.1376, "step": 2290 }, { "epoch": 0.516179908187118, "grad_norm": 0.4943654328069877, "learning_rate": 2.5726428297645285e-06, "loss": 0.1504, "step": 2291 }, { "epoch": 0.5164052158729264, "grad_norm": 0.4962431635915529, "learning_rate": 2.570780697503973e-06, "loss": 0.1528, "step": 2292 }, { "epoch": 0.5166305235587348, "grad_norm": 0.47973962504827283, "learning_rate": 2.5689185259415346e-06, "loss": 0.1676, "step": 2293 }, { "epoch": 0.5168558312445434, "grad_norm": 0.4546694046351566, "learning_rate": 2.5670563161112073e-06, "loss": 0.1419, "step": 2294 }, { "epoch": 0.5170811389303518, "grad_norm": 0.5004005389250398, "learning_rate": 2.5651940690470074e-06, "loss": 0.1578, "step": 2295 }, { "epoch": 0.5173064466161602, "grad_norm": 0.5551306967913575, "learning_rate": 2.56333178578297e-06, "loss": 0.1632, "step": 2296 }, { "epoch": 0.5175317543019686, "grad_norm": 0.4957720566111589, "learning_rate": 2.5614694673531533e-06, "loss": 0.1515, "step": 2297 }, { "epoch": 0.517757061987777, "grad_norm": 0.4733168651798975, "learning_rate": 2.5596071147916325e-06, "loss": 0.1433, "step": 2298 }, { "epoch": 0.5179823696735855, "grad_norm": 0.4984969498794251, "learning_rate": 2.557744729132503e-06, "loss": 0.168, "step": 2299 }, { "epoch": 0.5182076773593939, "grad_norm": 0.4739201510380568, "learning_rate": 2.555882311409878e-06, "loss": 0.1616, "step": 2300 }, { "epoch": 0.5184329850452023, "grad_norm": 0.47477837369247494, "learning_rate": 2.554019862657888e-06, "loss": 0.1512, "step": 2301 }, { "epoch": 0.5186582927310108, "grad_norm": 0.4827657739269289, "learning_rate": 2.5521573839106815e-06, "loss": 0.1532, "step": 2302 }, { "epoch": 0.5188836004168192, "grad_norm": 0.48081583349713264, "learning_rate": 2.5502948762024244e-06, "loss": 0.1487, "step": 2303 }, { "epoch": 0.5191089081026277, "grad_norm": 0.5136945737827481, "learning_rate": 2.5484323405672965e-06, "loss": 0.163, "step": 2304 }, { "epoch": 0.5193342157884361, "grad_norm": 0.4852412918348156, "learning_rate": 2.546569778039496e-06, "loss": 0.1502, "step": 2305 }, { "epoch": 0.5195595234742445, "grad_norm": 0.4839387174700844, "learning_rate": 2.544707189653233e-06, "loss": 0.1566, "step": 2306 }, { "epoch": 0.5197848311600529, "grad_norm": 0.47852121774050516, "learning_rate": 2.542844576442734e-06, "loss": 0.1562, "step": 2307 }, { "epoch": 0.5200101388458613, "grad_norm": 0.48176737230647965, "learning_rate": 2.5409819394422386e-06, "loss": 0.1473, "step": 2308 }, { "epoch": 0.5202354465316699, "grad_norm": 0.4493722340870022, "learning_rate": 2.539119279686001e-06, "loss": 0.1389, "step": 2309 }, { "epoch": 0.5204607542174783, "grad_norm": 0.4867659173267954, "learning_rate": 2.5372565982082843e-06, "loss": 0.1565, "step": 2310 }, { "epoch": 0.5206860619032867, "grad_norm": 0.49083074439740926, "learning_rate": 2.535393896043368e-06, "loss": 0.1693, "step": 2311 }, { "epoch": 0.5209113695890951, "grad_norm": 0.5118582010278282, "learning_rate": 2.5335311742255392e-06, "loss": 0.165, "step": 2312 }, { "epoch": 0.5211366772749035, "grad_norm": 0.46165246926341413, "learning_rate": 2.5316684337891005e-06, "loss": 0.1513, "step": 2313 }, { "epoch": 0.521361984960712, "grad_norm": 0.5081771222693094, "learning_rate": 2.5298056757683604e-06, "loss": 0.1744, "step": 2314 }, { "epoch": 0.5215872926465204, "grad_norm": 0.47259517752258834, "learning_rate": 2.52794290119764e-06, "loss": 0.1529, "step": 2315 }, { "epoch": 0.5218126003323288, "grad_norm": 0.5412140292393451, "learning_rate": 2.5260801111112677e-06, "loss": 0.1769, "step": 2316 }, { "epoch": 0.5220379080181373, "grad_norm": 0.46283220054109886, "learning_rate": 2.5242173065435815e-06, "loss": 0.1458, "step": 2317 }, { "epoch": 0.5222632157039457, "grad_norm": 0.44276183424887405, "learning_rate": 2.5223544885289287e-06, "loss": 0.1353, "step": 2318 }, { "epoch": 0.5224885233897542, "grad_norm": 0.4848664491076183, "learning_rate": 2.5204916581016608e-06, "loss": 0.1502, "step": 2319 }, { "epoch": 0.5227138310755626, "grad_norm": 0.48383050929686455, "learning_rate": 2.518628816296139e-06, "loss": 0.1559, "step": 2320 }, { "epoch": 0.522939138761371, "grad_norm": 0.47148303411187387, "learning_rate": 2.5167659641467302e-06, "loss": 0.1492, "step": 2321 }, { "epoch": 0.5231644464471794, "grad_norm": 0.5130011837904479, "learning_rate": 2.5149031026878063e-06, "loss": 0.1543, "step": 2322 }, { "epoch": 0.5233897541329878, "grad_norm": 0.43672925822618086, "learning_rate": 2.5130402329537444e-06, "loss": 0.1276, "step": 2323 }, { "epoch": 0.5236150618187962, "grad_norm": 0.5096482667519452, "learning_rate": 2.5111773559789277e-06, "loss": 0.1668, "step": 2324 }, { "epoch": 0.5238403695046048, "grad_norm": 0.4790070945167404, "learning_rate": 2.509314472797742e-06, "loss": 0.1481, "step": 2325 }, { "epoch": 0.5240656771904132, "grad_norm": 0.4576983330267588, "learning_rate": 2.5074515844445774e-06, "loss": 0.1439, "step": 2326 }, { "epoch": 0.5242909848762216, "grad_norm": 0.5058847652244145, "learning_rate": 2.5055886919538247e-06, "loss": 0.1759, "step": 2327 }, { "epoch": 0.52451629256203, "grad_norm": 0.4661076795855495, "learning_rate": 2.50372579635988e-06, "loss": 0.1502, "step": 2328 }, { "epoch": 0.5247416002478384, "grad_norm": 0.4492691768597872, "learning_rate": 2.5018628986971395e-06, "loss": 0.1341, "step": 2329 }, { "epoch": 0.5249669079336469, "grad_norm": 0.4715357476203562, "learning_rate": 2.5e-06, "loss": 0.1458, "step": 2330 }, { "epoch": 0.5251922156194553, "grad_norm": 0.5043033667691116, "learning_rate": 2.4981371013028618e-06, "loss": 0.1699, "step": 2331 }, { "epoch": 0.5254175233052637, "grad_norm": 0.46576266805778144, "learning_rate": 2.4962742036401213e-06, "loss": 0.1467, "step": 2332 }, { "epoch": 0.5256428309910722, "grad_norm": 0.47223610796977544, "learning_rate": 2.494411308046176e-06, "loss": 0.1458, "step": 2333 }, { "epoch": 0.5258681386768806, "grad_norm": 0.4994925272719538, "learning_rate": 2.4925484155554235e-06, "loss": 0.1632, "step": 2334 }, { "epoch": 0.5260934463626891, "grad_norm": 0.4905595343755695, "learning_rate": 2.490685527202258e-06, "loss": 0.173, "step": 2335 }, { "epoch": 0.5263187540484975, "grad_norm": 0.47089124227730406, "learning_rate": 2.4888226440210723e-06, "loss": 0.147, "step": 2336 }, { "epoch": 0.5265440617343059, "grad_norm": 0.4599527059597026, "learning_rate": 2.4869597670462555e-06, "loss": 0.1524, "step": 2337 }, { "epoch": 0.5267693694201143, "grad_norm": 0.4808986795245479, "learning_rate": 2.4850968973121945e-06, "loss": 0.1634, "step": 2338 }, { "epoch": 0.5269946771059227, "grad_norm": 0.48589161401299336, "learning_rate": 2.483234035853271e-06, "loss": 0.1572, "step": 2339 }, { "epoch": 0.5272199847917312, "grad_norm": 0.47488130307449883, "learning_rate": 2.481371183703862e-06, "loss": 0.1421, "step": 2340 }, { "epoch": 0.5274452924775397, "grad_norm": 0.47126856103360054, "learning_rate": 2.4795083418983405e-06, "loss": 0.159, "step": 2341 }, { "epoch": 0.5276706001633481, "grad_norm": 0.4926033575294693, "learning_rate": 2.477645511471073e-06, "loss": 0.1596, "step": 2342 }, { "epoch": 0.5278959078491565, "grad_norm": 0.46332393911961006, "learning_rate": 2.475782693456419e-06, "loss": 0.1378, "step": 2343 }, { "epoch": 0.5281212155349649, "grad_norm": 0.4725650364239991, "learning_rate": 2.473919888888733e-06, "loss": 0.1363, "step": 2344 }, { "epoch": 0.5283465232207734, "grad_norm": 0.502604824086372, "learning_rate": 2.472057098802361e-06, "loss": 0.1547, "step": 2345 }, { "epoch": 0.5285718309065818, "grad_norm": 0.46349203187037064, "learning_rate": 2.4701943242316405e-06, "loss": 0.1404, "step": 2346 }, { "epoch": 0.5287971385923902, "grad_norm": 0.4973121971815668, "learning_rate": 2.4683315662109003e-06, "loss": 0.1672, "step": 2347 }, { "epoch": 0.5290224462781986, "grad_norm": 0.43225256940399115, "learning_rate": 2.466468825774461e-06, "loss": 0.1356, "step": 2348 }, { "epoch": 0.5292477539640071, "grad_norm": 0.477633940104711, "learning_rate": 2.464606103956633e-06, "loss": 0.1485, "step": 2349 }, { "epoch": 0.5294730616498156, "grad_norm": 0.49014865184145545, "learning_rate": 2.462743401791716e-06, "loss": 0.1621, "step": 2350 }, { "epoch": 0.529698369335624, "grad_norm": 0.4980263724685209, "learning_rate": 2.460880720314e-06, "loss": 0.1558, "step": 2351 }, { "epoch": 0.5299236770214324, "grad_norm": 0.49992308145356856, "learning_rate": 2.4590180605577614e-06, "loss": 0.155, "step": 2352 }, { "epoch": 0.5301489847072408, "grad_norm": 0.47557074308194397, "learning_rate": 2.4571554235572665e-06, "loss": 0.1527, "step": 2353 }, { "epoch": 0.5303742923930492, "grad_norm": 0.4686763140918017, "learning_rate": 2.4552928103467677e-06, "loss": 0.1545, "step": 2354 }, { "epoch": 0.5305996000788576, "grad_norm": 0.49061218416270386, "learning_rate": 2.4534302219605044e-06, "loss": 0.1525, "step": 2355 }, { "epoch": 0.5308249077646662, "grad_norm": 0.4635700122759651, "learning_rate": 2.4515676594327035e-06, "loss": 0.1452, "step": 2356 }, { "epoch": 0.5310502154504746, "grad_norm": 0.48378534785787525, "learning_rate": 2.4497051237975773e-06, "loss": 0.1544, "step": 2357 }, { "epoch": 0.531275523136283, "grad_norm": 0.504499518788173, "learning_rate": 2.4478426160893197e-06, "loss": 0.1666, "step": 2358 }, { "epoch": 0.5315008308220914, "grad_norm": 0.4972981132140918, "learning_rate": 2.4459801373421134e-06, "loss": 0.1533, "step": 2359 }, { "epoch": 0.5317261385078998, "grad_norm": 0.4770186193899861, "learning_rate": 2.4441176885901234e-06, "loss": 0.1524, "step": 2360 }, { "epoch": 0.5319514461937083, "grad_norm": 0.48734897113858827, "learning_rate": 2.4422552708674977e-06, "loss": 0.1476, "step": 2361 }, { "epoch": 0.5321767538795167, "grad_norm": 0.4816980687725706, "learning_rate": 2.440392885208368e-06, "loss": 0.143, "step": 2362 }, { "epoch": 0.5324020615653251, "grad_norm": 0.47347405015746924, "learning_rate": 2.4385305326468475e-06, "loss": 0.1487, "step": 2363 }, { "epoch": 0.5326273692511336, "grad_norm": 0.45637701573209427, "learning_rate": 2.436668214217031e-06, "loss": 0.1427, "step": 2364 }, { "epoch": 0.532852676936942, "grad_norm": 0.46057914273271494, "learning_rate": 2.4348059309529935e-06, "loss": 0.1519, "step": 2365 }, { "epoch": 0.5330779846227505, "grad_norm": 0.4618776570707972, "learning_rate": 2.4329436838887936e-06, "loss": 0.1455, "step": 2366 }, { "epoch": 0.5333032923085589, "grad_norm": 0.5031610167097199, "learning_rate": 2.4310814740584663e-06, "loss": 0.1587, "step": 2367 }, { "epoch": 0.5335285999943673, "grad_norm": 0.4851364088493003, "learning_rate": 2.4292193024960275e-06, "loss": 0.1534, "step": 2368 }, { "epoch": 0.5337539076801757, "grad_norm": 0.44650754269598664, "learning_rate": 2.427357170235472e-06, "loss": 0.132, "step": 2369 }, { "epoch": 0.5339792153659841, "grad_norm": 0.4834717917296514, "learning_rate": 2.425495078310772e-06, "loss": 0.1496, "step": 2370 }, { "epoch": 0.5342045230517926, "grad_norm": 0.494643450442732, "learning_rate": 2.4236330277558774e-06, "loss": 0.1632, "step": 2371 }, { "epoch": 0.5344298307376011, "grad_norm": 0.4847535814939488, "learning_rate": 2.4217710196047166e-06, "loss": 0.1609, "step": 2372 }, { "epoch": 0.5346551384234095, "grad_norm": 0.46634369290995936, "learning_rate": 2.419909054891193e-06, "loss": 0.1494, "step": 2373 }, { "epoch": 0.5348804461092179, "grad_norm": 0.4900134894586286, "learning_rate": 2.4180471346491864e-06, "loss": 0.1495, "step": 2374 }, { "epoch": 0.5351057537950263, "grad_norm": 0.46965514940495373, "learning_rate": 2.4161852599125504e-06, "loss": 0.1537, "step": 2375 }, { "epoch": 0.5353310614808348, "grad_norm": 0.43801200426498405, "learning_rate": 2.414323431715115e-06, "loss": 0.1327, "step": 2376 }, { "epoch": 0.5355563691666432, "grad_norm": 0.4797440345308628, "learning_rate": 2.412461651090685e-06, "loss": 0.1461, "step": 2377 }, { "epoch": 0.5357816768524516, "grad_norm": 0.46511644318972317, "learning_rate": 2.410599919073037e-06, "loss": 0.1469, "step": 2378 }, { "epoch": 0.53600698453826, "grad_norm": 0.47717879720300405, "learning_rate": 2.408738236695922e-06, "loss": 0.1514, "step": 2379 }, { "epoch": 0.5362322922240685, "grad_norm": 0.48877338489049654, "learning_rate": 2.4068766049930623e-06, "loss": 0.1577, "step": 2380 }, { "epoch": 0.536457599909877, "grad_norm": 0.49093102138390154, "learning_rate": 2.4050150249981522e-06, "loss": 0.1596, "step": 2381 }, { "epoch": 0.5366829075956854, "grad_norm": 0.49852870940863864, "learning_rate": 2.403153497744859e-06, "loss": 0.1625, "step": 2382 }, { "epoch": 0.5369082152814938, "grad_norm": 0.4438631646845407, "learning_rate": 2.4012920242668184e-06, "loss": 0.1504, "step": 2383 }, { "epoch": 0.5371335229673022, "grad_norm": 0.490620298226887, "learning_rate": 2.3994306055976374e-06, "loss": 0.1589, "step": 2384 }, { "epoch": 0.5373588306531106, "grad_norm": 0.49255329553538285, "learning_rate": 2.397569242770893e-06, "loss": 0.1637, "step": 2385 }, { "epoch": 0.537584138338919, "grad_norm": 0.470517428218469, "learning_rate": 2.3957079368201293e-06, "loss": 0.1485, "step": 2386 }, { "epoch": 0.5378094460247275, "grad_norm": 0.4442052068375048, "learning_rate": 2.393846688778861e-06, "loss": 0.1435, "step": 2387 }, { "epoch": 0.538034753710536, "grad_norm": 0.47767306805174803, "learning_rate": 2.39198549968057e-06, "loss": 0.1666, "step": 2388 }, { "epoch": 0.5382600613963444, "grad_norm": 0.4893993150625802, "learning_rate": 2.390124370558705e-06, "loss": 0.153, "step": 2389 }, { "epoch": 0.5384853690821528, "grad_norm": 0.5095806079847959, "learning_rate": 2.3882633024466813e-06, "loss": 0.1669, "step": 2390 }, { "epoch": 0.5387106767679612, "grad_norm": 0.461764023336289, "learning_rate": 2.386402296377881e-06, "loss": 0.1516, "step": 2391 }, { "epoch": 0.5389359844537697, "grad_norm": 0.45256030151813437, "learning_rate": 2.3845413533856517e-06, "loss": 0.1405, "step": 2392 }, { "epoch": 0.5391612921395781, "grad_norm": 0.48555523539900436, "learning_rate": 2.3826804745033046e-06, "loss": 0.1651, "step": 2393 }, { "epoch": 0.5393865998253865, "grad_norm": 0.46075267816473214, "learning_rate": 2.3808196607641176e-06, "loss": 0.1522, "step": 2394 }, { "epoch": 0.5396119075111949, "grad_norm": 0.48391583028397295, "learning_rate": 2.3789589132013304e-06, "loss": 0.1602, "step": 2395 }, { "epoch": 0.5398372151970035, "grad_norm": 0.47602108556598505, "learning_rate": 2.3770982328481464e-06, "loss": 0.1616, "step": 2396 }, { "epoch": 0.5400625228828119, "grad_norm": 0.4731398077638761, "learning_rate": 2.3752376207377333e-06, "loss": 0.1629, "step": 2397 }, { "epoch": 0.5402878305686203, "grad_norm": 0.4678881070699935, "learning_rate": 2.3733770779032185e-06, "loss": 0.1578, "step": 2398 }, { "epoch": 0.5405131382544287, "grad_norm": 0.5047146203164565, "learning_rate": 2.371516605377693e-06, "loss": 0.1649, "step": 2399 }, { "epoch": 0.5407384459402371, "grad_norm": 0.46549000580913674, "learning_rate": 2.3696562041942076e-06, "loss": 0.161, "step": 2400 }, { "epoch": 0.5409637536260455, "grad_norm": 0.4772724936234673, "learning_rate": 2.367795875385773e-06, "loss": 0.154, "step": 2401 }, { "epoch": 0.541189061311854, "grad_norm": 0.47575796244422874, "learning_rate": 2.3659356199853617e-06, "loss": 0.1376, "step": 2402 }, { "epoch": 0.5414143689976625, "grad_norm": 0.47979560555492917, "learning_rate": 2.3640754390259026e-06, "loss": 0.1556, "step": 2403 }, { "epoch": 0.5416396766834709, "grad_norm": 0.47076199460274065, "learning_rate": 2.362215333540287e-06, "loss": 0.1571, "step": 2404 }, { "epoch": 0.5418649843692793, "grad_norm": 0.4736130870197844, "learning_rate": 2.36035530456136e-06, "loss": 0.1491, "step": 2405 }, { "epoch": 0.5420902920550877, "grad_norm": 0.4561576715451127, "learning_rate": 2.3584953531219278e-06, "loss": 0.1487, "step": 2406 }, { "epoch": 0.5423155997408962, "grad_norm": 0.49891426407957157, "learning_rate": 2.356635480254751e-06, "loss": 0.1644, "step": 2407 }, { "epoch": 0.5425409074267046, "grad_norm": 0.4801905005277808, "learning_rate": 2.3547756869925485e-06, "loss": 0.1508, "step": 2408 }, { "epoch": 0.542766215112513, "grad_norm": 0.5022208650888927, "learning_rate": 2.3529159743679936e-06, "loss": 0.1625, "step": 2409 }, { "epoch": 0.5429915227983214, "grad_norm": 0.46139824228014115, "learning_rate": 2.3510563434137175e-06, "loss": 0.1436, "step": 2410 }, { "epoch": 0.5432168304841299, "grad_norm": 0.4559106153860657, "learning_rate": 2.3491967951623006e-06, "loss": 0.1388, "step": 2411 }, { "epoch": 0.5434421381699384, "grad_norm": 0.4617151774727058, "learning_rate": 2.347337330646282e-06, "loss": 0.1518, "step": 2412 }, { "epoch": 0.5436674458557468, "grad_norm": 0.448814934539756, "learning_rate": 2.3454779508981536e-06, "loss": 0.1442, "step": 2413 }, { "epoch": 0.5438927535415552, "grad_norm": 0.5048961507681691, "learning_rate": 2.3436186569503598e-06, "loss": 0.1582, "step": 2414 }, { "epoch": 0.5441180612273636, "grad_norm": 0.48880538543679386, "learning_rate": 2.341759449835297e-06, "loss": 0.156, "step": 2415 }, { "epoch": 0.544343368913172, "grad_norm": 0.46704869155160356, "learning_rate": 2.339900330585313e-06, "loss": 0.1546, "step": 2416 }, { "epoch": 0.5445686765989805, "grad_norm": 0.4270795493273556, "learning_rate": 2.338041300232708e-06, "loss": 0.1398, "step": 2417 }, { "epoch": 0.5447939842847889, "grad_norm": 0.4543419706489556, "learning_rate": 2.3361823598097316e-06, "loss": 0.148, "step": 2418 }, { "epoch": 0.5450192919705974, "grad_norm": 0.49543645533648134, "learning_rate": 2.334323510348585e-06, "loss": 0.1585, "step": 2419 }, { "epoch": 0.5452445996564058, "grad_norm": 0.48534446537771797, "learning_rate": 2.332464752881418e-06, "loss": 0.1581, "step": 2420 }, { "epoch": 0.5454699073422142, "grad_norm": 0.46715254152261837, "learning_rate": 2.330606088440329e-06, "loss": 0.1471, "step": 2421 }, { "epoch": 0.5456952150280227, "grad_norm": 0.49929947864432916, "learning_rate": 2.3287475180573653e-06, "loss": 0.1536, "step": 2422 }, { "epoch": 0.5459205227138311, "grad_norm": 0.45143500817783183, "learning_rate": 2.3268890427645213e-06, "loss": 0.1394, "step": 2423 }, { "epoch": 0.5461458303996395, "grad_norm": 0.47672250458243587, "learning_rate": 2.3250306635937385e-06, "loss": 0.1409, "step": 2424 }, { "epoch": 0.5463711380854479, "grad_norm": 0.48041838203924997, "learning_rate": 2.323172381576907e-06, "loss": 0.1516, "step": 2425 }, { "epoch": 0.5465964457712563, "grad_norm": 0.4613411287053727, "learning_rate": 2.3213141977458615e-06, "loss": 0.1415, "step": 2426 }, { "epoch": 0.5468217534570649, "grad_norm": 0.4860898207811665, "learning_rate": 2.3194561131323823e-06, "loss": 0.1548, "step": 2427 }, { "epoch": 0.5470470611428733, "grad_norm": 0.5168493415498433, "learning_rate": 2.3175981287681924e-06, "loss": 0.1627, "step": 2428 }, { "epoch": 0.5472723688286817, "grad_norm": 0.4950004563190919, "learning_rate": 2.3157402456849632e-06, "loss": 0.1559, "step": 2429 }, { "epoch": 0.5474976765144901, "grad_norm": 0.5077308120636856, "learning_rate": 2.3138824649143076e-06, "loss": 0.154, "step": 2430 }, { "epoch": 0.5477229842002985, "grad_norm": 0.4440272901386497, "learning_rate": 2.312024787487782e-06, "loss": 0.1498, "step": 2431 }, { "epoch": 0.5479482918861069, "grad_norm": 0.4799839361928191, "learning_rate": 2.310167214436885e-06, "loss": 0.1517, "step": 2432 }, { "epoch": 0.5481735995719154, "grad_norm": 0.49434889067719406, "learning_rate": 2.3083097467930583e-06, "loss": 0.1584, "step": 2433 }, { "epoch": 0.5483989072577238, "grad_norm": 0.485517757580691, "learning_rate": 2.306452385587683e-06, "loss": 0.1499, "step": 2434 }, { "epoch": 0.5486242149435323, "grad_norm": 0.4972321054541833, "learning_rate": 2.304595131852085e-06, "loss": 0.156, "step": 2435 }, { "epoch": 0.5488495226293407, "grad_norm": 0.48890762164687734, "learning_rate": 2.3027379866175263e-06, "loss": 0.1598, "step": 2436 }, { "epoch": 0.5490748303151491, "grad_norm": 0.4606463681481626, "learning_rate": 2.300880950915211e-06, "loss": 0.1497, "step": 2437 }, { "epoch": 0.5493001380009576, "grad_norm": 0.4696087783219515, "learning_rate": 2.2990240257762817e-06, "loss": 0.151, "step": 2438 }, { "epoch": 0.549525445686766, "grad_norm": 0.46354615609402217, "learning_rate": 2.2971672122318196e-06, "loss": 0.1296, "step": 2439 }, { "epoch": 0.5497507533725744, "grad_norm": 0.4538811159223632, "learning_rate": 2.295310511312844e-06, "loss": 0.1485, "step": 2440 }, { "epoch": 0.5499760610583828, "grad_norm": 0.5031836245171942, "learning_rate": 2.293453924050312e-06, "loss": 0.1542, "step": 2441 }, { "epoch": 0.5502013687441912, "grad_norm": 0.4862351352432136, "learning_rate": 2.2915974514751173e-06, "loss": 0.1607, "step": 2442 }, { "epoch": 0.5504266764299998, "grad_norm": 0.46367126664664826, "learning_rate": 2.2897410946180897e-06, "loss": 0.1558, "step": 2443 }, { "epoch": 0.5506519841158082, "grad_norm": 0.4954802060102958, "learning_rate": 2.287884854509995e-06, "loss": 0.1476, "step": 2444 }, { "epoch": 0.5508772918016166, "grad_norm": 0.4713809372504246, "learning_rate": 2.286028732181535e-06, "loss": 0.1584, "step": 2445 }, { "epoch": 0.551102599487425, "grad_norm": 0.4838974999326031, "learning_rate": 2.2841727286633444e-06, "loss": 0.1486, "step": 2446 }, { "epoch": 0.5513279071732334, "grad_norm": 0.4721516817905157, "learning_rate": 2.282316844985992e-06, "loss": 0.1453, "step": 2447 }, { "epoch": 0.5515532148590419, "grad_norm": 0.4846307488266691, "learning_rate": 2.280461082179982e-06, "loss": 0.1511, "step": 2448 }, { "epoch": 0.5517785225448503, "grad_norm": 0.4830182185008593, "learning_rate": 2.27860544127575e-06, "loss": 0.1617, "step": 2449 }, { "epoch": 0.5520038302306588, "grad_norm": 0.4733189597116919, "learning_rate": 2.2767499233036635e-06, "loss": 0.1486, "step": 2450 }, { "epoch": 0.5522291379164672, "grad_norm": 0.474466806300033, "learning_rate": 2.2748945292940237e-06, "loss": 0.1437, "step": 2451 }, { "epoch": 0.5524544456022756, "grad_norm": 0.47381411512019134, "learning_rate": 2.2730392602770617e-06, "loss": 0.1487, "step": 2452 }, { "epoch": 0.552679753288084, "grad_norm": 0.46505376816656113, "learning_rate": 2.271184117282938e-06, "loss": 0.1449, "step": 2453 }, { "epoch": 0.5529050609738925, "grad_norm": 0.4947855734546613, "learning_rate": 2.269329101341745e-06, "loss": 0.1599, "step": 2454 }, { "epoch": 0.5531303686597009, "grad_norm": 0.45848960827036234, "learning_rate": 2.267474213483505e-06, "loss": 0.14, "step": 2455 }, { "epoch": 0.5533556763455093, "grad_norm": 0.47049641172903567, "learning_rate": 2.265619454738166e-06, "loss": 0.1506, "step": 2456 }, { "epoch": 0.5535809840313177, "grad_norm": 0.4852049885196248, "learning_rate": 2.2637648261356078e-06, "loss": 0.1582, "step": 2457 }, { "epoch": 0.5538062917171263, "grad_norm": 0.469824519560325, "learning_rate": 2.2619103287056366e-06, "loss": 0.1514, "step": 2458 }, { "epoch": 0.5540315994029347, "grad_norm": 0.47447240106880856, "learning_rate": 2.260055963477985e-06, "loss": 0.1555, "step": 2459 }, { "epoch": 0.5542569070887431, "grad_norm": 0.473070074615829, "learning_rate": 2.2582017314823135e-06, "loss": 0.1534, "step": 2460 }, { "epoch": 0.5544822147745515, "grad_norm": 0.46559527370566156, "learning_rate": 2.2563476337482073e-06, "loss": 0.1543, "step": 2461 }, { "epoch": 0.5547075224603599, "grad_norm": 0.45421710995779324, "learning_rate": 2.254493671305179e-06, "loss": 0.1517, "step": 2462 }, { "epoch": 0.5549328301461683, "grad_norm": 0.4869363376822142, "learning_rate": 2.2526398451826638e-06, "loss": 0.1576, "step": 2463 }, { "epoch": 0.5551581378319768, "grad_norm": 0.46141489622645343, "learning_rate": 2.250786156410022e-06, "loss": 0.1536, "step": 2464 }, { "epoch": 0.5553834455177852, "grad_norm": 0.4943768008406088, "learning_rate": 2.2489326060165384e-06, "loss": 0.1626, "step": 2465 }, { "epoch": 0.5556087532035937, "grad_norm": 0.4709770438969968, "learning_rate": 2.24707919503142e-06, "loss": 0.1553, "step": 2466 }, { "epoch": 0.5558340608894021, "grad_norm": 0.44988307346347045, "learning_rate": 2.2452259244837974e-06, "loss": 0.1408, "step": 2467 }, { "epoch": 0.5560593685752105, "grad_norm": 0.44722088845974944, "learning_rate": 2.2433727954027227e-06, "loss": 0.1404, "step": 2468 }, { "epoch": 0.556284676261019, "grad_norm": 0.4842500435166934, "learning_rate": 2.24151980881717e-06, "loss": 0.1537, "step": 2469 }, { "epoch": 0.5565099839468274, "grad_norm": 0.471125979789008, "learning_rate": 2.239666965756032e-06, "loss": 0.1434, "step": 2470 }, { "epoch": 0.5567352916326358, "grad_norm": 0.45948464636699554, "learning_rate": 2.237814267248124e-06, "loss": 0.1442, "step": 2471 }, { "epoch": 0.5569605993184442, "grad_norm": 0.48146904571655486, "learning_rate": 2.2359617143221805e-06, "loss": 0.1557, "step": 2472 }, { "epoch": 0.5571859070042526, "grad_norm": 0.46022255259287337, "learning_rate": 2.234109308006856e-06, "loss": 0.1452, "step": 2473 }, { "epoch": 0.5574112146900612, "grad_norm": 0.4719622400103109, "learning_rate": 2.232257049330722e-06, "loss": 0.1368, "step": 2474 }, { "epoch": 0.5576365223758696, "grad_norm": 0.4698878977583087, "learning_rate": 2.2304049393222686e-06, "loss": 0.1494, "step": 2475 }, { "epoch": 0.557861830061678, "grad_norm": 0.4714898445421254, "learning_rate": 2.2285529790099034e-06, "loss": 0.1474, "step": 2476 }, { "epoch": 0.5580871377474864, "grad_norm": 0.47770540573100884, "learning_rate": 2.2267011694219513e-06, "loss": 0.1567, "step": 2477 }, { "epoch": 0.5583124454332948, "grad_norm": 0.4909679540323867, "learning_rate": 2.224849511586652e-06, "loss": 0.1474, "step": 2478 }, { "epoch": 0.5585377531191033, "grad_norm": 0.47098216682326777, "learning_rate": 2.2229980065321636e-06, "loss": 0.1364, "step": 2479 }, { "epoch": 0.5587630608049117, "grad_norm": 0.4768020498354351, "learning_rate": 2.221146655286558e-06, "loss": 0.1599, "step": 2480 }, { "epoch": 0.5589883684907201, "grad_norm": 0.4800137909514748, "learning_rate": 2.2192954588778195e-06, "loss": 0.157, "step": 2481 }, { "epoch": 0.5592136761765286, "grad_norm": 0.4677165542238797, "learning_rate": 2.21744441833385e-06, "loss": 0.1532, "step": 2482 }, { "epoch": 0.559438983862337, "grad_norm": 0.487362845686037, "learning_rate": 2.2155935346824634e-06, "loss": 0.1625, "step": 2483 }, { "epoch": 0.5596642915481455, "grad_norm": 0.4736455775888925, "learning_rate": 2.2137428089513857e-06, "loss": 0.1476, "step": 2484 }, { "epoch": 0.5598895992339539, "grad_norm": 0.5085683208858834, "learning_rate": 2.2118922421682563e-06, "loss": 0.1628, "step": 2485 }, { "epoch": 0.5601149069197623, "grad_norm": 0.4795859420103309, "learning_rate": 2.2100418353606262e-06, "loss": 0.1561, "step": 2486 }, { "epoch": 0.5603402146055707, "grad_norm": 0.4727352372361001, "learning_rate": 2.208191589555956e-06, "loss": 0.1433, "step": 2487 }, { "epoch": 0.5605655222913791, "grad_norm": 0.4978582284269819, "learning_rate": 2.20634150578162e-06, "loss": 0.1529, "step": 2488 }, { "epoch": 0.5607908299771875, "grad_norm": 0.4799519585661368, "learning_rate": 2.2044915850649e-06, "loss": 0.1495, "step": 2489 }, { "epoch": 0.5610161376629961, "grad_norm": 0.4882231226390542, "learning_rate": 2.202641828432988e-06, "loss": 0.1538, "step": 2490 }, { "epoch": 0.5612414453488045, "grad_norm": 0.49437833162263073, "learning_rate": 2.2007922369129854e-06, "loss": 0.1629, "step": 2491 }, { "epoch": 0.5614667530346129, "grad_norm": 0.48283606695640197, "learning_rate": 2.1989428115319005e-06, "loss": 0.1559, "step": 2492 }, { "epoch": 0.5616920607204213, "grad_norm": 0.4824869865618337, "learning_rate": 2.1970935533166505e-06, "loss": 0.1551, "step": 2493 }, { "epoch": 0.5619173684062297, "grad_norm": 0.47903485883095387, "learning_rate": 2.19524446329406e-06, "loss": 0.1464, "step": 2494 }, { "epoch": 0.5621426760920382, "grad_norm": 0.47432503827693573, "learning_rate": 2.1933955424908596e-06, "loss": 0.164, "step": 2495 }, { "epoch": 0.5623679837778466, "grad_norm": 0.44292504233636465, "learning_rate": 2.1915467919336862e-06, "loss": 0.1338, "step": 2496 }, { "epoch": 0.562593291463655, "grad_norm": 0.4519280759224281, "learning_rate": 2.1896982126490825e-06, "loss": 0.1498, "step": 2497 }, { "epoch": 0.5628185991494635, "grad_norm": 0.49827045995047065, "learning_rate": 2.1878498056634946e-06, "loss": 0.1618, "step": 2498 }, { "epoch": 0.563043906835272, "grad_norm": 0.4730814583558337, "learning_rate": 2.1860015720032747e-06, "loss": 0.1539, "step": 2499 }, { "epoch": 0.5632692145210804, "grad_norm": 0.4851839051181484, "learning_rate": 2.1841535126946777e-06, "loss": 0.1425, "step": 2500 }, { "epoch": 0.5632692145210804, "eval_loss": 0.15211372077465057, "eval_runtime": 56.972, "eval_samples_per_second": 50.376, "eval_steps_per_second": 6.301, "step": 2500 }, { "epoch": 0.5634945222068888, "grad_norm": 0.5224387250101071, "learning_rate": 2.1823056287638623e-06, "loss": 0.1621, "step": 2501 }, { "epoch": 0.5637198298926972, "grad_norm": 0.4877748448316099, "learning_rate": 2.180457921236889e-06, "loss": 0.1497, "step": 2502 }, { "epoch": 0.5639451375785056, "grad_norm": 0.4737044072894314, "learning_rate": 2.1786103911397218e-06, "loss": 0.1477, "step": 2503 }, { "epoch": 0.564170445264314, "grad_norm": 0.44795906173065436, "learning_rate": 2.176763039498225e-06, "loss": 0.1452, "step": 2504 }, { "epoch": 0.5643957529501226, "grad_norm": 0.48310843295660033, "learning_rate": 2.174915867338164e-06, "loss": 0.1458, "step": 2505 }, { "epoch": 0.564621060635931, "grad_norm": 0.47537092410064236, "learning_rate": 2.1730688756852046e-06, "loss": 0.1497, "step": 2506 }, { "epoch": 0.5648463683217394, "grad_norm": 0.4872498445237118, "learning_rate": 2.171222065564913e-06, "loss": 0.1656, "step": 2507 }, { "epoch": 0.5650716760075478, "grad_norm": 0.4905041228342476, "learning_rate": 2.1693754380027533e-06, "loss": 0.1563, "step": 2508 }, { "epoch": 0.5652969836933562, "grad_norm": 0.47516862562513035, "learning_rate": 2.1675289940240883e-06, "loss": 0.1424, "step": 2509 }, { "epoch": 0.5655222913791647, "grad_norm": 0.42823728751408857, "learning_rate": 2.165682734654181e-06, "loss": 0.1282, "step": 2510 }, { "epoch": 0.5657475990649731, "grad_norm": 0.5124273639117506, "learning_rate": 2.16383666091819e-06, "loss": 0.1566, "step": 2511 }, { "epoch": 0.5659729067507815, "grad_norm": 0.4662028274966955, "learning_rate": 2.161990773841171e-06, "loss": 0.1547, "step": 2512 }, { "epoch": 0.56619821443659, "grad_norm": 0.47300391524406277, "learning_rate": 2.1601450744480763e-06, "loss": 0.1499, "step": 2513 }, { "epoch": 0.5664235221223984, "grad_norm": 0.5055930403538434, "learning_rate": 2.1582995637637543e-06, "loss": 0.1602, "step": 2514 }, { "epoch": 0.5666488298082069, "grad_norm": 0.46885317458281295, "learning_rate": 2.156454242812948e-06, "loss": 0.1429, "step": 2515 }, { "epoch": 0.5668741374940153, "grad_norm": 0.4912017132958433, "learning_rate": 2.1546091126202955e-06, "loss": 0.1636, "step": 2516 }, { "epoch": 0.5670994451798237, "grad_norm": 0.4595886886992803, "learning_rate": 2.1527641742103282e-06, "loss": 0.1405, "step": 2517 }, { "epoch": 0.5673247528656321, "grad_norm": 0.4988690329908019, "learning_rate": 2.150919428607472e-06, "loss": 0.1501, "step": 2518 }, { "epoch": 0.5675500605514405, "grad_norm": 0.4762729304575605, "learning_rate": 2.149074876836045e-06, "loss": 0.1564, "step": 2519 }, { "epoch": 0.567775368237249, "grad_norm": 0.47204168342815955, "learning_rate": 2.147230519920259e-06, "loss": 0.1641, "step": 2520 }, { "epoch": 0.5680006759230575, "grad_norm": 0.46207272776568986, "learning_rate": 2.1453863588842165e-06, "loss": 0.1418, "step": 2521 }, { "epoch": 0.5682259836088659, "grad_norm": 0.4668825504120351, "learning_rate": 2.143542394751911e-06, "loss": 0.1538, "step": 2522 }, { "epoch": 0.5684512912946743, "grad_norm": 0.4837268539344098, "learning_rate": 2.1416986285472268e-06, "loss": 0.1541, "step": 2523 }, { "epoch": 0.5686765989804827, "grad_norm": 0.4624010885176109, "learning_rate": 2.139855061293939e-06, "loss": 0.1537, "step": 2524 }, { "epoch": 0.5689019066662911, "grad_norm": 0.4710709274529512, "learning_rate": 2.1380116940157107e-06, "loss": 0.155, "step": 2525 }, { "epoch": 0.5691272143520996, "grad_norm": 0.5156089057170854, "learning_rate": 2.1361685277360973e-06, "loss": 0.1608, "step": 2526 }, { "epoch": 0.569352522037908, "grad_norm": 0.48493200248859675, "learning_rate": 2.1343255634785386e-06, "loss": 0.1503, "step": 2527 }, { "epoch": 0.5695778297237164, "grad_norm": 0.4333440828139637, "learning_rate": 2.132482802266364e-06, "loss": 0.122, "step": 2528 }, { "epoch": 0.5698031374095249, "grad_norm": 0.4607815473590659, "learning_rate": 2.1306402451227907e-06, "loss": 0.1503, "step": 2529 }, { "epoch": 0.5700284450953333, "grad_norm": 0.46389727276493975, "learning_rate": 2.128797893070922e-06, "loss": 0.1513, "step": 2530 }, { "epoch": 0.5702537527811418, "grad_norm": 0.5132583394666452, "learning_rate": 2.1269557471337467e-06, "loss": 0.1515, "step": 2531 }, { "epoch": 0.5704790604669502, "grad_norm": 0.5026188029568095, "learning_rate": 2.1251138083341404e-06, "loss": 0.1528, "step": 2532 }, { "epoch": 0.5707043681527586, "grad_norm": 0.481269253549719, "learning_rate": 2.123272077694864e-06, "loss": 0.1453, "step": 2533 }, { "epoch": 0.570929675838567, "grad_norm": 0.46800101724365095, "learning_rate": 2.1214305562385592e-06, "loss": 0.1548, "step": 2534 }, { "epoch": 0.5711549835243754, "grad_norm": 0.4658194574312725, "learning_rate": 2.1195892449877556e-06, "loss": 0.1455, "step": 2535 }, { "epoch": 0.5713802912101839, "grad_norm": 0.4773496273400579, "learning_rate": 2.117748144964865e-06, "loss": 0.1571, "step": 2536 }, { "epoch": 0.5716055988959924, "grad_norm": 0.4741653276708234, "learning_rate": 2.115907257192182e-06, "loss": 0.1482, "step": 2537 }, { "epoch": 0.5718309065818008, "grad_norm": 0.4492929001791757, "learning_rate": 2.1140665826918823e-06, "loss": 0.138, "step": 2538 }, { "epoch": 0.5720562142676092, "grad_norm": 0.5064130427481476, "learning_rate": 2.1122261224860237e-06, "loss": 0.1645, "step": 2539 }, { "epoch": 0.5722815219534176, "grad_norm": 0.46568785233626575, "learning_rate": 2.1103858775965455e-06, "loss": 0.1564, "step": 2540 }, { "epoch": 0.5725068296392261, "grad_norm": 0.4824316713930859, "learning_rate": 2.1085458490452663e-06, "loss": 0.1539, "step": 2541 }, { "epoch": 0.5727321373250345, "grad_norm": 0.48853412799556045, "learning_rate": 2.106706037853887e-06, "loss": 0.1441, "step": 2542 }, { "epoch": 0.5729574450108429, "grad_norm": 0.46998661739099296, "learning_rate": 2.1048664450439853e-06, "loss": 0.1403, "step": 2543 }, { "epoch": 0.5731827526966513, "grad_norm": 0.4822964462853322, "learning_rate": 2.103027071637018e-06, "loss": 0.149, "step": 2544 }, { "epoch": 0.5734080603824598, "grad_norm": 0.4713654126005489, "learning_rate": 2.101187918654321e-06, "loss": 0.143, "step": 2545 }, { "epoch": 0.5736333680682683, "grad_norm": 0.5069590134179427, "learning_rate": 2.099348987117108e-06, "loss": 0.1608, "step": 2546 }, { "epoch": 0.5738586757540767, "grad_norm": 0.48913938668701046, "learning_rate": 2.0975102780464674e-06, "loss": 0.1542, "step": 2547 }, { "epoch": 0.5740839834398851, "grad_norm": 0.4917953755983581, "learning_rate": 2.095671792463368e-06, "loss": 0.1588, "step": 2548 }, { "epoch": 0.5743092911256935, "grad_norm": 0.47742831862757695, "learning_rate": 2.0938335313886513e-06, "loss": 0.1547, "step": 2549 }, { "epoch": 0.5745345988115019, "grad_norm": 0.4400050925635982, "learning_rate": 2.0919954958430357e-06, "loss": 0.1359, "step": 2550 }, { "epoch": 0.5747599064973103, "grad_norm": 0.5085400055192658, "learning_rate": 2.0901576868471125e-06, "loss": 0.1596, "step": 2551 }, { "epoch": 0.5749852141831189, "grad_norm": 0.44727232653419413, "learning_rate": 2.0883201054213493e-06, "loss": 0.1451, "step": 2552 }, { "epoch": 0.5752105218689273, "grad_norm": 0.46059727795786004, "learning_rate": 2.086482752586087e-06, "loss": 0.1487, "step": 2553 }, { "epoch": 0.5754358295547357, "grad_norm": 0.46831069725509195, "learning_rate": 2.0846456293615384e-06, "loss": 0.1468, "step": 2554 }, { "epoch": 0.5756611372405441, "grad_norm": 0.46378208101596174, "learning_rate": 2.0828087367677906e-06, "loss": 0.1518, "step": 2555 }, { "epoch": 0.5758864449263525, "grad_norm": 0.4756688058583516, "learning_rate": 2.0809720758247997e-06, "loss": 0.1604, "step": 2556 }, { "epoch": 0.576111752612161, "grad_norm": 0.5154499252124413, "learning_rate": 2.0791356475523967e-06, "loss": 0.1649, "step": 2557 }, { "epoch": 0.5763370602979694, "grad_norm": 0.49943906859891457, "learning_rate": 2.077299452970282e-06, "loss": 0.1584, "step": 2558 }, { "epoch": 0.5765623679837778, "grad_norm": 0.47353120279436367, "learning_rate": 2.0754634930980245e-06, "loss": 0.1526, "step": 2559 }, { "epoch": 0.5767876756695863, "grad_norm": 0.4764005539219673, "learning_rate": 2.0736277689550655e-06, "loss": 0.1521, "step": 2560 }, { "epoch": 0.5770129833553947, "grad_norm": 0.46881451194766166, "learning_rate": 2.0717922815607134e-06, "loss": 0.1569, "step": 2561 }, { "epoch": 0.5772382910412032, "grad_norm": 0.4827888344579709, "learning_rate": 2.069957031934147e-06, "loss": 0.1478, "step": 2562 }, { "epoch": 0.5774635987270116, "grad_norm": 0.5023148970873527, "learning_rate": 2.0681220210944106e-06, "loss": 0.1609, "step": 2563 }, { "epoch": 0.57768890641282, "grad_norm": 0.49917019654858047, "learning_rate": 2.066287250060418e-06, "loss": 0.1616, "step": 2564 }, { "epoch": 0.5779142140986284, "grad_norm": 0.4968052640321321, "learning_rate": 2.06445271985095e-06, "loss": 0.1575, "step": 2565 }, { "epoch": 0.5781395217844368, "grad_norm": 0.4487199967977616, "learning_rate": 2.062618431484652e-06, "loss": 0.1438, "step": 2566 }, { "epoch": 0.5783648294702453, "grad_norm": 0.4846379596071071, "learning_rate": 2.060784385980036e-06, "loss": 0.1493, "step": 2567 }, { "epoch": 0.5785901371560538, "grad_norm": 0.4792229250058271, "learning_rate": 2.05895058435548e-06, "loss": 0.138, "step": 2568 }, { "epoch": 0.5788154448418622, "grad_norm": 0.4901580327188909, "learning_rate": 2.0571170276292233e-06, "loss": 0.1469, "step": 2569 }, { "epoch": 0.5790407525276706, "grad_norm": 0.48749294511610947, "learning_rate": 2.0552837168193738e-06, "loss": 0.1497, "step": 2570 }, { "epoch": 0.579266060213479, "grad_norm": 0.4548388896156358, "learning_rate": 2.0534506529439e-06, "loss": 0.1419, "step": 2571 }, { "epoch": 0.5794913678992875, "grad_norm": 0.5154923361811012, "learning_rate": 2.051617837020633e-06, "loss": 0.1596, "step": 2572 }, { "epoch": 0.5797166755850959, "grad_norm": 0.4729595000797912, "learning_rate": 2.0497852700672692e-06, "loss": 0.14, "step": 2573 }, { "epoch": 0.5799419832709043, "grad_norm": 0.4959258355172917, "learning_rate": 2.047952953101363e-06, "loss": 0.1604, "step": 2574 }, { "epoch": 0.5801672909567127, "grad_norm": 0.45432381151595685, "learning_rate": 2.0461208871403333e-06, "loss": 0.1414, "step": 2575 }, { "epoch": 0.5803925986425212, "grad_norm": 0.47096658021491633, "learning_rate": 2.0442890732014563e-06, "loss": 0.1454, "step": 2576 }, { "epoch": 0.5806179063283297, "grad_norm": 0.4525308560866689, "learning_rate": 2.042457512301871e-06, "loss": 0.14, "step": 2577 }, { "epoch": 0.5808432140141381, "grad_norm": 0.5020390621894459, "learning_rate": 2.040626205458574e-06, "loss": 0.1537, "step": 2578 }, { "epoch": 0.5810685216999465, "grad_norm": 0.4954963757896994, "learning_rate": 2.038795153688423e-06, "loss": 0.1578, "step": 2579 }, { "epoch": 0.5812938293857549, "grad_norm": 0.4928423187106702, "learning_rate": 2.0369643580081326e-06, "loss": 0.1594, "step": 2580 }, { "epoch": 0.5815191370715633, "grad_norm": 0.4882262305198443, "learning_rate": 2.0351338194342744e-06, "loss": 0.1478, "step": 2581 }, { "epoch": 0.5817444447573717, "grad_norm": 0.4604675050885477, "learning_rate": 2.0333035389832795e-06, "loss": 0.1417, "step": 2582 }, { "epoch": 0.5819697524431802, "grad_norm": 0.5080053698936663, "learning_rate": 2.0314735176714336e-06, "loss": 0.1575, "step": 2583 }, { "epoch": 0.5821950601289887, "grad_norm": 0.5019888888860421, "learning_rate": 2.0296437565148786e-06, "loss": 0.161, "step": 2584 }, { "epoch": 0.5824203678147971, "grad_norm": 0.49531296644130557, "learning_rate": 2.0278142565296153e-06, "loss": 0.1429, "step": 2585 }, { "epoch": 0.5826456755006055, "grad_norm": 0.4866792190717066, "learning_rate": 2.025985018731494e-06, "loss": 0.1581, "step": 2586 }, { "epoch": 0.582870983186414, "grad_norm": 0.4894237643330584, "learning_rate": 2.0241560441362235e-06, "loss": 0.1524, "step": 2587 }, { "epoch": 0.5830962908722224, "grad_norm": 0.4945925080814945, "learning_rate": 2.0223273337593647e-06, "loss": 0.1572, "step": 2588 }, { "epoch": 0.5833215985580308, "grad_norm": 0.4973169711801826, "learning_rate": 2.020498888616333e-06, "loss": 0.1583, "step": 2589 }, { "epoch": 0.5835469062438392, "grad_norm": 0.5017905379704919, "learning_rate": 2.0186707097223952e-06, "loss": 0.1509, "step": 2590 }, { "epoch": 0.5837722139296476, "grad_norm": 0.48194812401972437, "learning_rate": 2.016842798092672e-06, "loss": 0.1458, "step": 2591 }, { "epoch": 0.5839975216154561, "grad_norm": 0.4790546754896114, "learning_rate": 2.0150151547421333e-06, "loss": 0.1524, "step": 2592 }, { "epoch": 0.5842228293012646, "grad_norm": 0.49411625346523946, "learning_rate": 2.013187780685602e-06, "loss": 0.1494, "step": 2593 }, { "epoch": 0.584448136987073, "grad_norm": 0.44879665522584505, "learning_rate": 2.0113606769377497e-06, "loss": 0.1469, "step": 2594 }, { "epoch": 0.5846734446728814, "grad_norm": 0.48272208567022384, "learning_rate": 2.009533844513101e-06, "loss": 0.1552, "step": 2595 }, { "epoch": 0.5848987523586898, "grad_norm": 0.4599871152837736, "learning_rate": 2.0077072844260267e-06, "loss": 0.1406, "step": 2596 }, { "epoch": 0.5851240600444982, "grad_norm": 0.5004012967022493, "learning_rate": 2.0058809976907475e-06, "loss": 0.1498, "step": 2597 }, { "epoch": 0.5853493677303067, "grad_norm": 0.46983282686813543, "learning_rate": 2.0040549853213326e-06, "loss": 0.1571, "step": 2598 }, { "epoch": 0.5855746754161152, "grad_norm": 0.46983737953382637, "learning_rate": 2.0022292483316984e-06, "loss": 0.1503, "step": 2599 }, { "epoch": 0.5857999831019236, "grad_norm": 0.4820623624485316, "learning_rate": 2.0004037877356085e-06, "loss": 0.1516, "step": 2600 }, { "epoch": 0.586025290787732, "grad_norm": 0.4855950313891801, "learning_rate": 1.998578604546674e-06, "loss": 0.1506, "step": 2601 }, { "epoch": 0.5862505984735404, "grad_norm": 0.4875750250191104, "learning_rate": 1.9967536997783495e-06, "loss": 0.1578, "step": 2602 }, { "epoch": 0.5864759061593489, "grad_norm": 0.4553425030090635, "learning_rate": 1.9949290744439392e-06, "loss": 0.1456, "step": 2603 }, { "epoch": 0.5867012138451573, "grad_norm": 0.4804593658936272, "learning_rate": 1.9931047295565863e-06, "loss": 0.1531, "step": 2604 }, { "epoch": 0.5869265215309657, "grad_norm": 0.45953845789303216, "learning_rate": 1.9912806661292838e-06, "loss": 0.1428, "step": 2605 }, { "epoch": 0.5871518292167741, "grad_norm": 0.49028482450191974, "learning_rate": 1.989456885174865e-06, "loss": 0.1575, "step": 2606 }, { "epoch": 0.5873771369025826, "grad_norm": 0.4458092542065396, "learning_rate": 1.987633387706008e-06, "loss": 0.1346, "step": 2607 }, { "epoch": 0.5876024445883911, "grad_norm": 0.4609231008990843, "learning_rate": 1.9858101747352326e-06, "loss": 0.1426, "step": 2608 }, { "epoch": 0.5878277522741995, "grad_norm": 0.49280751358160235, "learning_rate": 1.9839872472749016e-06, "loss": 0.1497, "step": 2609 }, { "epoch": 0.5880530599600079, "grad_norm": 0.4375963099756855, "learning_rate": 1.9821646063372174e-06, "loss": 0.1349, "step": 2610 }, { "epoch": 0.5882783676458163, "grad_norm": 0.4595108845111774, "learning_rate": 1.9803422529342264e-06, "loss": 0.1341, "step": 2611 }, { "epoch": 0.5885036753316247, "grad_norm": 0.4526724622007013, "learning_rate": 1.978520188077813e-06, "loss": 0.1384, "step": 2612 }, { "epoch": 0.5887289830174331, "grad_norm": 0.5053804717077324, "learning_rate": 1.976698412779701e-06, "loss": 0.1624, "step": 2613 }, { "epoch": 0.5889542907032416, "grad_norm": 0.45473150017860553, "learning_rate": 1.9748769280514544e-06, "loss": 0.1468, "step": 2614 }, { "epoch": 0.5891795983890501, "grad_norm": 0.4774154720585141, "learning_rate": 1.973055734904477e-06, "loss": 0.1521, "step": 2615 }, { "epoch": 0.5894049060748585, "grad_norm": 0.4993969480390666, "learning_rate": 1.971234834350008e-06, "loss": 0.1493, "step": 2616 }, { "epoch": 0.5896302137606669, "grad_norm": 0.4814371640595727, "learning_rate": 1.9694142273991264e-06, "loss": 0.15, "step": 2617 }, { "epoch": 0.5898555214464754, "grad_norm": 0.4811078134010406, "learning_rate": 1.967593915062748e-06, "loss": 0.1549, "step": 2618 }, { "epoch": 0.5900808291322838, "grad_norm": 0.49120183783803484, "learning_rate": 1.9657738983516227e-06, "loss": 0.1528, "step": 2619 }, { "epoch": 0.5903061368180922, "grad_norm": 0.5010617162358244, "learning_rate": 1.96395417827634e-06, "loss": 0.1643, "step": 2620 }, { "epoch": 0.5905314445039006, "grad_norm": 0.4265067214265054, "learning_rate": 1.9621347558473216e-06, "loss": 0.1257, "step": 2621 }, { "epoch": 0.590756752189709, "grad_norm": 0.48263433337715467, "learning_rate": 1.960315632074824e-06, "loss": 0.1532, "step": 2622 }, { "epoch": 0.5909820598755176, "grad_norm": 0.4806343374996929, "learning_rate": 1.95849680796894e-06, "loss": 0.1459, "step": 2623 }, { "epoch": 0.591207367561326, "grad_norm": 0.484164716047899, "learning_rate": 1.9566782845395945e-06, "loss": 0.1572, "step": 2624 }, { "epoch": 0.5914326752471344, "grad_norm": 0.4839391840783974, "learning_rate": 1.9548600627965454e-06, "loss": 0.1607, "step": 2625 }, { "epoch": 0.5916579829329428, "grad_norm": 0.49965378539140476, "learning_rate": 1.9530421437493843e-06, "loss": 0.1531, "step": 2626 }, { "epoch": 0.5918832906187512, "grad_norm": 0.4769090414318042, "learning_rate": 1.951224528407534e-06, "loss": 0.1454, "step": 2627 }, { "epoch": 0.5921085983045596, "grad_norm": 0.47164774135841486, "learning_rate": 1.949407217780247e-06, "loss": 0.1517, "step": 2628 }, { "epoch": 0.5923339059903681, "grad_norm": 0.4840071951346353, "learning_rate": 1.94759021287661e-06, "loss": 0.163, "step": 2629 }, { "epoch": 0.5925592136761765, "grad_norm": 0.4964969178311975, "learning_rate": 1.945773514705537e-06, "loss": 0.1517, "step": 2630 }, { "epoch": 0.592784521361985, "grad_norm": 0.49816050788736993, "learning_rate": 1.943957124275773e-06, "loss": 0.1557, "step": 2631 }, { "epoch": 0.5930098290477934, "grad_norm": 0.4413938462386378, "learning_rate": 1.9421410425958915e-06, "loss": 0.1296, "step": 2632 }, { "epoch": 0.5932351367336018, "grad_norm": 0.5117692482649495, "learning_rate": 1.9403252706742957e-06, "loss": 0.162, "step": 2633 }, { "epoch": 0.5934604444194103, "grad_norm": 0.4582300780902382, "learning_rate": 1.938509809519216e-06, "loss": 0.1419, "step": 2634 }, { "epoch": 0.5936857521052187, "grad_norm": 0.4811552477231402, "learning_rate": 1.9366946601387103e-06, "loss": 0.1483, "step": 2635 }, { "epoch": 0.5939110597910271, "grad_norm": 0.513375477344272, "learning_rate": 1.934879823540663e-06, "loss": 0.1699, "step": 2636 }, { "epoch": 0.5941363674768355, "grad_norm": 0.5216389103084176, "learning_rate": 1.9330653007327852e-06, "loss": 0.162, "step": 2637 }, { "epoch": 0.5943616751626439, "grad_norm": 0.4702534040240756, "learning_rate": 1.931251092722615e-06, "loss": 0.1518, "step": 2638 }, { "epoch": 0.5945869828484525, "grad_norm": 0.4873357715988879, "learning_rate": 1.9294372005175125e-06, "loss": 0.1514, "step": 2639 }, { "epoch": 0.5948122905342609, "grad_norm": 0.5050029824413044, "learning_rate": 1.9276236251246655e-06, "loss": 0.1496, "step": 2640 }, { "epoch": 0.5950375982200693, "grad_norm": 0.45566476221772784, "learning_rate": 1.9258103675510846e-06, "loss": 0.1444, "step": 2641 }, { "epoch": 0.5952629059058777, "grad_norm": 0.5160389746563608, "learning_rate": 1.9239974288036044e-06, "loss": 0.1655, "step": 2642 }, { "epoch": 0.5954882135916861, "grad_norm": 0.4321268048853623, "learning_rate": 1.9221848098888817e-06, "loss": 0.1282, "step": 2643 }, { "epoch": 0.5957135212774946, "grad_norm": 0.4705484540912023, "learning_rate": 1.920372511813397e-06, "loss": 0.1418, "step": 2644 }, { "epoch": 0.595938828963303, "grad_norm": 0.4727463487631332, "learning_rate": 1.9185605355834518e-06, "loss": 0.1495, "step": 2645 }, { "epoch": 0.5961641366491115, "grad_norm": 0.47996660778579237, "learning_rate": 1.916748882205168e-06, "loss": 0.1393, "step": 2646 }, { "epoch": 0.5963894443349199, "grad_norm": 0.47144733107356446, "learning_rate": 1.91493755268449e-06, "loss": 0.1427, "step": 2647 }, { "epoch": 0.5966147520207283, "grad_norm": 0.4486714323008961, "learning_rate": 1.913126548027181e-06, "loss": 0.1444, "step": 2648 }, { "epoch": 0.5968400597065368, "grad_norm": 0.46483001125109336, "learning_rate": 1.9113158692388253e-06, "loss": 0.1482, "step": 2649 }, { "epoch": 0.5970653673923452, "grad_norm": 0.4758845188227502, "learning_rate": 1.909505517324825e-06, "loss": 0.1507, "step": 2650 }, { "epoch": 0.5972906750781536, "grad_norm": 0.46282831309817246, "learning_rate": 1.9076954932904e-06, "loss": 0.1339, "step": 2651 }, { "epoch": 0.597515982763962, "grad_norm": 0.47494545960669815, "learning_rate": 1.905885798140591e-06, "loss": 0.1451, "step": 2652 }, { "epoch": 0.5977412904497704, "grad_norm": 0.4559165581505678, "learning_rate": 1.9040764328802523e-06, "loss": 0.1378, "step": 2653 }, { "epoch": 0.597966598135579, "grad_norm": 0.4564783058150478, "learning_rate": 1.9022673985140585e-06, "loss": 0.1356, "step": 2654 }, { "epoch": 0.5981919058213874, "grad_norm": 0.4860139197568067, "learning_rate": 1.9004586960464993e-06, "loss": 0.1581, "step": 2655 }, { "epoch": 0.5984172135071958, "grad_norm": 0.49231192056833295, "learning_rate": 1.8986503264818785e-06, "loss": 0.149, "step": 2656 }, { "epoch": 0.5986425211930042, "grad_norm": 0.46235777577164483, "learning_rate": 1.8968422908243156e-06, "loss": 0.1507, "step": 2657 }, { "epoch": 0.5988678288788126, "grad_norm": 0.4574777606001307, "learning_rate": 1.895034590077747e-06, "loss": 0.1447, "step": 2658 }, { "epoch": 0.599093136564621, "grad_norm": 0.518939972275897, "learning_rate": 1.8932272252459213e-06, "loss": 0.1661, "step": 2659 }, { "epoch": 0.5993184442504295, "grad_norm": 0.49871042299630464, "learning_rate": 1.8914201973324004e-06, "loss": 0.1624, "step": 2660 }, { "epoch": 0.5995437519362379, "grad_norm": 0.44078841524551204, "learning_rate": 1.88961350734056e-06, "loss": 0.1299, "step": 2661 }, { "epoch": 0.5997690596220464, "grad_norm": 0.4788882586858163, "learning_rate": 1.8878071562735873e-06, "loss": 0.1593, "step": 2662 }, { "epoch": 0.5999943673078548, "grad_norm": 0.4883771425411433, "learning_rate": 1.8860011451344811e-06, "loss": 0.153, "step": 2663 }, { "epoch": 0.6002196749936632, "grad_norm": 0.4863938817343778, "learning_rate": 1.8841954749260535e-06, "loss": 0.1536, "step": 2664 }, { "epoch": 0.6004449826794717, "grad_norm": 0.4956010517285103, "learning_rate": 1.8823901466509253e-06, "loss": 0.16, "step": 2665 }, { "epoch": 0.6006702903652801, "grad_norm": 0.48987831491655476, "learning_rate": 1.8805851613115278e-06, "loss": 0.1556, "step": 2666 }, { "epoch": 0.6008955980510885, "grad_norm": 0.45947037046050837, "learning_rate": 1.878780519910102e-06, "loss": 0.1343, "step": 2667 }, { "epoch": 0.6011209057368969, "grad_norm": 0.47137603246849347, "learning_rate": 1.8769762234486982e-06, "loss": 0.1396, "step": 2668 }, { "epoch": 0.6013462134227053, "grad_norm": 0.4610954891970303, "learning_rate": 1.8751722729291747e-06, "loss": 0.1509, "step": 2669 }, { "epoch": 0.6015715211085139, "grad_norm": 0.45814689046776785, "learning_rate": 1.8733686693531986e-06, "loss": 0.1342, "step": 2670 }, { "epoch": 0.6017968287943223, "grad_norm": 0.46749668348657086, "learning_rate": 1.8715654137222434e-06, "loss": 0.1385, "step": 2671 }, { "epoch": 0.6020221364801307, "grad_norm": 0.478938533355445, "learning_rate": 1.8697625070375893e-06, "loss": 0.1519, "step": 2672 }, { "epoch": 0.6022474441659391, "grad_norm": 0.47131761493341146, "learning_rate": 1.8679599503003246e-06, "loss": 0.1392, "step": 2673 }, { "epoch": 0.6024727518517475, "grad_norm": 0.4492544891627898, "learning_rate": 1.8661577445113399e-06, "loss": 0.146, "step": 2674 }, { "epoch": 0.602698059537556, "grad_norm": 0.49385705364348237, "learning_rate": 1.8643558906713344e-06, "loss": 0.1526, "step": 2675 }, { "epoch": 0.6029233672233644, "grad_norm": 0.45914439654005745, "learning_rate": 1.8625543897808094e-06, "loss": 0.1371, "step": 2676 }, { "epoch": 0.6031486749091728, "grad_norm": 0.4745967840339945, "learning_rate": 1.8607532428400714e-06, "loss": 0.146, "step": 2677 }, { "epoch": 0.6033739825949813, "grad_norm": 0.470332402788067, "learning_rate": 1.8589524508492308e-06, "loss": 0.1597, "step": 2678 }, { "epoch": 0.6035992902807897, "grad_norm": 0.46903431308321003, "learning_rate": 1.8571520148081992e-06, "loss": 0.1478, "step": 2679 }, { "epoch": 0.6038245979665982, "grad_norm": 0.4722535666341083, "learning_rate": 1.8553519357166927e-06, "loss": 0.1512, "step": 2680 }, { "epoch": 0.6040499056524066, "grad_norm": 0.47488229814225896, "learning_rate": 1.853552214574228e-06, "loss": 0.1517, "step": 2681 }, { "epoch": 0.604275213338215, "grad_norm": 0.44122280475937337, "learning_rate": 1.8517528523801226e-06, "loss": 0.1248, "step": 2682 }, { "epoch": 0.6045005210240234, "grad_norm": 0.4828760618283871, "learning_rate": 1.8499538501334955e-06, "loss": 0.1461, "step": 2683 }, { "epoch": 0.6047258287098318, "grad_norm": 0.5094872405551787, "learning_rate": 1.8481552088332656e-06, "loss": 0.1521, "step": 2684 }, { "epoch": 0.6049511363956402, "grad_norm": 0.4967269024341413, "learning_rate": 1.8463569294781509e-06, "loss": 0.1619, "step": 2685 }, { "epoch": 0.6051764440814488, "grad_norm": 0.4728795946203999, "learning_rate": 1.84455901306667e-06, "loss": 0.1457, "step": 2686 }, { "epoch": 0.6054017517672572, "grad_norm": 0.4499141690398027, "learning_rate": 1.842761460597138e-06, "loss": 0.1403, "step": 2687 }, { "epoch": 0.6056270594530656, "grad_norm": 0.4909515705165402, "learning_rate": 1.8409642730676692e-06, "loss": 0.1482, "step": 2688 }, { "epoch": 0.605852367138874, "grad_norm": 0.4877009904812424, "learning_rate": 1.8391674514761745e-06, "loss": 0.1478, "step": 2689 }, { "epoch": 0.6060776748246824, "grad_norm": 0.4750945191900288, "learning_rate": 1.8373709968203624e-06, "loss": 0.1523, "step": 2690 }, { "epoch": 0.6063029825104909, "grad_norm": 0.4508502573013392, "learning_rate": 1.8355749100977371e-06, "loss": 0.1374, "step": 2691 }, { "epoch": 0.6065282901962993, "grad_norm": 0.45921109689505535, "learning_rate": 1.8337791923055983e-06, "loss": 0.1427, "step": 2692 }, { "epoch": 0.6067535978821077, "grad_norm": 0.4453942067397089, "learning_rate": 1.8319838444410412e-06, "loss": 0.1296, "step": 2693 }, { "epoch": 0.6069789055679162, "grad_norm": 0.45520091195224116, "learning_rate": 1.8301888675009554e-06, "loss": 0.1401, "step": 2694 }, { "epoch": 0.6072042132537246, "grad_norm": 0.5085833788533062, "learning_rate": 1.8283942624820247e-06, "loss": 0.1666, "step": 2695 }, { "epoch": 0.6074295209395331, "grad_norm": 0.46436748594980676, "learning_rate": 1.8266000303807272e-06, "loss": 0.142, "step": 2696 }, { "epoch": 0.6076548286253415, "grad_norm": 0.46302456475407644, "learning_rate": 1.8248061721933325e-06, "loss": 0.1397, "step": 2697 }, { "epoch": 0.6078801363111499, "grad_norm": 0.49167502175273, "learning_rate": 1.8230126889159027e-06, "loss": 0.1543, "step": 2698 }, { "epoch": 0.6081054439969583, "grad_norm": 0.48436061466263614, "learning_rate": 1.8212195815442934e-06, "loss": 0.1502, "step": 2699 }, { "epoch": 0.6083307516827667, "grad_norm": 0.46125404579447626, "learning_rate": 1.8194268510741493e-06, "loss": 0.1372, "step": 2700 }, { "epoch": 0.6085560593685753, "grad_norm": 0.515519249077565, "learning_rate": 1.8176344985009064e-06, "loss": 0.1482, "step": 2701 }, { "epoch": 0.6087813670543837, "grad_norm": 0.49611842782376675, "learning_rate": 1.8158425248197931e-06, "loss": 0.1566, "step": 2702 }, { "epoch": 0.6090066747401921, "grad_norm": 0.4544133455659541, "learning_rate": 1.8140509310258238e-06, "loss": 0.1419, "step": 2703 }, { "epoch": 0.6092319824260005, "grad_norm": 0.4897278959152588, "learning_rate": 1.812259718113805e-06, "loss": 0.1553, "step": 2704 }, { "epoch": 0.6094572901118089, "grad_norm": 0.4550888635947903, "learning_rate": 1.8104688870783296e-06, "loss": 0.1421, "step": 2705 }, { "epoch": 0.6096825977976174, "grad_norm": 0.5439820043723024, "learning_rate": 1.8086784389137796e-06, "loss": 0.1675, "step": 2706 }, { "epoch": 0.6099079054834258, "grad_norm": 0.469320115995091, "learning_rate": 1.806888374614324e-06, "loss": 0.1489, "step": 2707 }, { "epoch": 0.6101332131692342, "grad_norm": 0.4764614698414576, "learning_rate": 1.8050986951739201e-06, "loss": 0.1529, "step": 2708 }, { "epoch": 0.6103585208550427, "grad_norm": 0.4683849207784724, "learning_rate": 1.8033094015863082e-06, "loss": 0.1447, "step": 2709 }, { "epoch": 0.6105838285408511, "grad_norm": 0.47474751534392706, "learning_rate": 1.8015204948450166e-06, "loss": 0.142, "step": 2710 }, { "epoch": 0.6108091362266596, "grad_norm": 0.4394569635606037, "learning_rate": 1.7997319759433596e-06, "loss": 0.1418, "step": 2711 }, { "epoch": 0.611034443912468, "grad_norm": 0.48160970529582176, "learning_rate": 1.7979438458744343e-06, "loss": 0.1532, "step": 2712 }, { "epoch": 0.6112597515982764, "grad_norm": 0.4645507221905752, "learning_rate": 1.7961561056311234e-06, "loss": 0.1527, "step": 2713 }, { "epoch": 0.6114850592840848, "grad_norm": 0.46326252743568164, "learning_rate": 1.7943687562060919e-06, "loss": 0.1552, "step": 2714 }, { "epoch": 0.6117103669698932, "grad_norm": 0.4860638067814299, "learning_rate": 1.792581798591788e-06, "loss": 0.1563, "step": 2715 }, { "epoch": 0.6119356746557016, "grad_norm": 0.4748546487696476, "learning_rate": 1.7907952337804429e-06, "loss": 0.1459, "step": 2716 }, { "epoch": 0.6121609823415102, "grad_norm": 0.4936458628920855, "learning_rate": 1.7890090627640699e-06, "loss": 0.1623, "step": 2717 }, { "epoch": 0.6123862900273186, "grad_norm": 0.47973768416221685, "learning_rate": 1.787223286534463e-06, "loss": 0.1557, "step": 2718 }, { "epoch": 0.612611597713127, "grad_norm": 0.5113779313428518, "learning_rate": 1.785437906083197e-06, "loss": 0.156, "step": 2719 }, { "epoch": 0.6128369053989354, "grad_norm": 0.4957171390447654, "learning_rate": 1.783652922401627e-06, "loss": 0.1584, "step": 2720 }, { "epoch": 0.6130622130847438, "grad_norm": 0.4508990357285264, "learning_rate": 1.7818683364808883e-06, "loss": 0.1393, "step": 2721 }, { "epoch": 0.6132875207705523, "grad_norm": 0.46535715982732545, "learning_rate": 1.7800841493118942e-06, "loss": 0.1433, "step": 2722 }, { "epoch": 0.6135128284563607, "grad_norm": 0.48346208257731116, "learning_rate": 1.7783003618853384e-06, "loss": 0.1511, "step": 2723 }, { "epoch": 0.6137381361421691, "grad_norm": 0.4838514534977142, "learning_rate": 1.776516975191691e-06, "loss": 0.147, "step": 2724 }, { "epoch": 0.6139634438279776, "grad_norm": 0.4768191513891604, "learning_rate": 1.7747339902212e-06, "loss": 0.1378, "step": 2725 }, { "epoch": 0.614188751513786, "grad_norm": 0.4575459272190301, "learning_rate": 1.7729514079638915e-06, "loss": 0.1375, "step": 2726 }, { "epoch": 0.6144140591995945, "grad_norm": 0.47887891978460684, "learning_rate": 1.7711692294095654e-06, "loss": 0.1575, "step": 2727 }, { "epoch": 0.6146393668854029, "grad_norm": 0.5050928760519903, "learning_rate": 1.7693874555477996e-06, "loss": 0.1607, "step": 2728 }, { "epoch": 0.6148646745712113, "grad_norm": 0.4787168044851947, "learning_rate": 1.7676060873679473e-06, "loss": 0.1469, "step": 2729 }, { "epoch": 0.6150899822570197, "grad_norm": 0.48281978453942287, "learning_rate": 1.7658251258591352e-06, "loss": 0.1522, "step": 2730 }, { "epoch": 0.6153152899428281, "grad_norm": 0.5109361705865251, "learning_rate": 1.764044572010265e-06, "loss": 0.1529, "step": 2731 }, { "epoch": 0.6155405976286366, "grad_norm": 0.49247695655010393, "learning_rate": 1.7622644268100116e-06, "loss": 0.1634, "step": 2732 }, { "epoch": 0.6157659053144451, "grad_norm": 0.5121770325492478, "learning_rate": 1.7604846912468243e-06, "loss": 0.1658, "step": 2733 }, { "epoch": 0.6159912130002535, "grad_norm": 0.4749585350535519, "learning_rate": 1.7587053663089233e-06, "loss": 0.1457, "step": 2734 }, { "epoch": 0.6162165206860619, "grad_norm": 0.4709334071762967, "learning_rate": 1.7569264529843009e-06, "loss": 0.1529, "step": 2735 }, { "epoch": 0.6164418283718703, "grad_norm": 0.47968940655553516, "learning_rate": 1.755147952260722e-06, "loss": 0.1485, "step": 2736 }, { "epoch": 0.6166671360576788, "grad_norm": 0.48261957330261573, "learning_rate": 1.753369865125722e-06, "loss": 0.1581, "step": 2737 }, { "epoch": 0.6168924437434872, "grad_norm": 0.4795332092297908, "learning_rate": 1.7515921925666053e-06, "loss": 0.1568, "step": 2738 }, { "epoch": 0.6171177514292956, "grad_norm": 0.4717642359972822, "learning_rate": 1.749814935570448e-06, "loss": 0.1558, "step": 2739 }, { "epoch": 0.617343059115104, "grad_norm": 0.4840128981249241, "learning_rate": 1.748038095124095e-06, "loss": 0.1455, "step": 2740 }, { "epoch": 0.6175683668009125, "grad_norm": 0.465278403538817, "learning_rate": 1.746261672214159e-06, "loss": 0.1433, "step": 2741 }, { "epoch": 0.617793674486721, "grad_norm": 0.48162713449264705, "learning_rate": 1.7444856678270218e-06, "loss": 0.1569, "step": 2742 }, { "epoch": 0.6180189821725294, "grad_norm": 0.4832915581984926, "learning_rate": 1.7427100829488325e-06, "loss": 0.1487, "step": 2743 }, { "epoch": 0.6182442898583378, "grad_norm": 0.4805571768792867, "learning_rate": 1.7409349185655067e-06, "loss": 0.1532, "step": 2744 }, { "epoch": 0.6184695975441462, "grad_norm": 0.45648256848154184, "learning_rate": 1.739160175662727e-06, "loss": 0.1424, "step": 2745 }, { "epoch": 0.6186949052299546, "grad_norm": 0.4588732490898067, "learning_rate": 1.7373858552259421e-06, "loss": 0.1522, "step": 2746 }, { "epoch": 0.618920212915763, "grad_norm": 0.47375321309353036, "learning_rate": 1.7356119582403663e-06, "loss": 0.1474, "step": 2747 }, { "epoch": 0.6191455206015716, "grad_norm": 0.5007531820480831, "learning_rate": 1.733838485690978e-06, "loss": 0.1501, "step": 2748 }, { "epoch": 0.61937082828738, "grad_norm": 0.480719639702643, "learning_rate": 1.7320654385625208e-06, "loss": 0.141, "step": 2749 }, { "epoch": 0.6195961359731884, "grad_norm": 0.4587097913676663, "learning_rate": 1.7302928178395018e-06, "loss": 0.1388, "step": 2750 }, { "epoch": 0.6198214436589968, "grad_norm": 0.4958585682683732, "learning_rate": 1.7285206245061908e-06, "loss": 0.1503, "step": 2751 }, { "epoch": 0.6200467513448052, "grad_norm": 0.4550933127673, "learning_rate": 1.726748859546621e-06, "loss": 0.1391, "step": 2752 }, { "epoch": 0.6202720590306137, "grad_norm": 0.44816444530545974, "learning_rate": 1.7249775239445875e-06, "loss": 0.1307, "step": 2753 }, { "epoch": 0.6204973667164221, "grad_norm": 0.4431721028100357, "learning_rate": 1.723206618683646e-06, "loss": 0.1331, "step": 2754 }, { "epoch": 0.6207226744022305, "grad_norm": 0.511980981222847, "learning_rate": 1.7214361447471156e-06, "loss": 0.1493, "step": 2755 }, { "epoch": 0.620947982088039, "grad_norm": 0.47289755552146395, "learning_rate": 1.7196661031180738e-06, "loss": 0.1585, "step": 2756 }, { "epoch": 0.6211732897738474, "grad_norm": 0.47535833672053196, "learning_rate": 1.7178964947793591e-06, "loss": 0.1437, "step": 2757 }, { "epoch": 0.6213985974596559, "grad_norm": 0.48646685714584237, "learning_rate": 1.716127320713568e-06, "loss": 0.146, "step": 2758 }, { "epoch": 0.6216239051454643, "grad_norm": 0.4634324355114312, "learning_rate": 1.7143585819030583e-06, "loss": 0.143, "step": 2759 }, { "epoch": 0.6218492128312727, "grad_norm": 0.4576000720937714, "learning_rate": 1.7125902793299434e-06, "loss": 0.1371, "step": 2760 }, { "epoch": 0.6220745205170811, "grad_norm": 0.4438078503440971, "learning_rate": 1.7108224139760982e-06, "loss": 0.1249, "step": 2761 }, { "epoch": 0.6222998282028895, "grad_norm": 0.4471311093079337, "learning_rate": 1.7090549868231492e-06, "loss": 0.1305, "step": 2762 }, { "epoch": 0.622525135888698, "grad_norm": 0.4808441910375895, "learning_rate": 1.707287998852485e-06, "loss": 0.1548, "step": 2763 }, { "epoch": 0.6227504435745065, "grad_norm": 0.4401254562234489, "learning_rate": 1.7055214510452462e-06, "loss": 0.1387, "step": 2764 }, { "epoch": 0.6229757512603149, "grad_norm": 0.4879263554502538, "learning_rate": 1.7037553443823332e-06, "loss": 0.1465, "step": 2765 }, { "epoch": 0.6232010589461233, "grad_norm": 0.4472646910236901, "learning_rate": 1.7019896798443984e-06, "loss": 0.1373, "step": 2766 }, { "epoch": 0.6234263666319317, "grad_norm": 0.48360400598865216, "learning_rate": 1.700224458411849e-06, "loss": 0.1559, "step": 2767 }, { "epoch": 0.6236516743177402, "grad_norm": 0.4830733201104498, "learning_rate": 1.6984596810648475e-06, "loss": 0.1579, "step": 2768 }, { "epoch": 0.6238769820035486, "grad_norm": 0.48335608809827474, "learning_rate": 1.6966953487833078e-06, "loss": 0.1596, "step": 2769 }, { "epoch": 0.624102289689357, "grad_norm": 0.4823123539206247, "learning_rate": 1.6949314625468985e-06, "loss": 0.1513, "step": 2770 }, { "epoch": 0.6243275973751654, "grad_norm": 0.4938808073292687, "learning_rate": 1.6931680233350404e-06, "loss": 0.151, "step": 2771 }, { "epoch": 0.6245529050609739, "grad_norm": 0.47738363812247797, "learning_rate": 1.6914050321269049e-06, "loss": 0.1473, "step": 2772 }, { "epoch": 0.6247782127467824, "grad_norm": 0.4889654971871984, "learning_rate": 1.6896424899014158e-06, "loss": 0.1413, "step": 2773 }, { "epoch": 0.6250035204325908, "grad_norm": 0.4903261247950262, "learning_rate": 1.6878803976372465e-06, "loss": 0.1663, "step": 2774 }, { "epoch": 0.6252288281183992, "grad_norm": 0.48272666070040504, "learning_rate": 1.6861187563128217e-06, "loss": 0.1535, "step": 2775 }, { "epoch": 0.6254541358042076, "grad_norm": 0.46179189709660806, "learning_rate": 1.6843575669063142e-06, "loss": 0.1375, "step": 2776 }, { "epoch": 0.625679443490016, "grad_norm": 0.46497370122587345, "learning_rate": 1.682596830395648e-06, "loss": 0.1407, "step": 2777 }, { "epoch": 0.6259047511758244, "grad_norm": 0.4702044055000179, "learning_rate": 1.6808365477584953e-06, "loss": 0.1534, "step": 2778 }, { "epoch": 0.6261300588616329, "grad_norm": 0.5071413974321057, "learning_rate": 1.6790767199722724e-06, "loss": 0.1626, "step": 2779 }, { "epoch": 0.6263553665474414, "grad_norm": 0.4805694941313356, "learning_rate": 1.6773173480141487e-06, "loss": 0.1482, "step": 2780 }, { "epoch": 0.6265806742332498, "grad_norm": 0.46670874376304294, "learning_rate": 1.6755584328610364e-06, "loss": 0.1446, "step": 2781 }, { "epoch": 0.6268059819190582, "grad_norm": 0.48126522962899204, "learning_rate": 1.6737999754895965e-06, "loss": 0.1433, "step": 2782 }, { "epoch": 0.6270312896048666, "grad_norm": 0.47189768674803717, "learning_rate": 1.6720419768762343e-06, "loss": 0.1515, "step": 2783 }, { "epoch": 0.6272565972906751, "grad_norm": 0.4721444472897269, "learning_rate": 1.6702844379971012e-06, "loss": 0.1427, "step": 2784 }, { "epoch": 0.6274819049764835, "grad_norm": 0.4876297281006269, "learning_rate": 1.668527359828092e-06, "loss": 0.1504, "step": 2785 }, { "epoch": 0.6277072126622919, "grad_norm": 0.4767240074099445, "learning_rate": 1.6667707433448482e-06, "loss": 0.1532, "step": 2786 }, { "epoch": 0.6279325203481003, "grad_norm": 0.47858125114578143, "learning_rate": 1.6650145895227532e-06, "loss": 0.1551, "step": 2787 }, { "epoch": 0.6281578280339088, "grad_norm": 0.4859216333502786, "learning_rate": 1.663258899336933e-06, "loss": 0.159, "step": 2788 }, { "epoch": 0.6283831357197173, "grad_norm": 0.46248161988504, "learning_rate": 1.6615036737622574e-06, "loss": 0.1421, "step": 2789 }, { "epoch": 0.6286084434055257, "grad_norm": 0.5055365786287488, "learning_rate": 1.6597489137733377e-06, "loss": 0.1597, "step": 2790 }, { "epoch": 0.6288337510913341, "grad_norm": 0.4743015397097211, "learning_rate": 1.6579946203445269e-06, "loss": 0.1489, "step": 2791 }, { "epoch": 0.6290590587771425, "grad_norm": 0.46321209388814183, "learning_rate": 1.6562407944499175e-06, "loss": 0.1405, "step": 2792 }, { "epoch": 0.6292843664629509, "grad_norm": 0.47427596876506406, "learning_rate": 1.6544874370633456e-06, "loss": 0.1389, "step": 2793 }, { "epoch": 0.6295096741487594, "grad_norm": 0.48628130797575075, "learning_rate": 1.652734549158384e-06, "loss": 0.1556, "step": 2794 }, { "epoch": 0.6297349818345679, "grad_norm": 0.48296863875610585, "learning_rate": 1.6509821317083466e-06, "loss": 0.1471, "step": 2795 }, { "epoch": 0.6299602895203763, "grad_norm": 0.44435202435685495, "learning_rate": 1.6492301856862855e-06, "loss": 0.1316, "step": 2796 }, { "epoch": 0.6301855972061847, "grad_norm": 0.4522766714459569, "learning_rate": 1.6474787120649903e-06, "loss": 0.1491, "step": 2797 }, { "epoch": 0.6304109048919931, "grad_norm": 0.476592061403675, "learning_rate": 1.6457277118169893e-06, "loss": 0.1495, "step": 2798 }, { "epoch": 0.6306362125778016, "grad_norm": 0.49291809206381026, "learning_rate": 1.6439771859145476e-06, "loss": 0.1477, "step": 2799 }, { "epoch": 0.63086152026361, "grad_norm": 0.4712511414943249, "learning_rate": 1.6422271353296675e-06, "loss": 0.1457, "step": 2800 }, { "epoch": 0.6310868279494184, "grad_norm": 0.5061894262618007, "learning_rate": 1.640477561034086e-06, "loss": 0.1494, "step": 2801 }, { "epoch": 0.6313121356352268, "grad_norm": 0.45287458848331624, "learning_rate": 1.6387284639992773e-06, "loss": 0.139, "step": 2802 }, { "epoch": 0.6315374433210353, "grad_norm": 0.4723200403220778, "learning_rate": 1.6369798451964496e-06, "loss": 0.1472, "step": 2803 }, { "epoch": 0.6317627510068438, "grad_norm": 0.5103378496664759, "learning_rate": 1.6352317055965458e-06, "loss": 0.159, "step": 2804 }, { "epoch": 0.6319880586926522, "grad_norm": 0.480394644135576, "learning_rate": 1.6334840461702422e-06, "loss": 0.1516, "step": 2805 }, { "epoch": 0.6322133663784606, "grad_norm": 0.47523540206244513, "learning_rate": 1.6317368678879497e-06, "loss": 0.1389, "step": 2806 }, { "epoch": 0.632438674064269, "grad_norm": 0.4558136082094893, "learning_rate": 1.6299901717198102e-06, "loss": 0.1382, "step": 2807 }, { "epoch": 0.6326639817500774, "grad_norm": 0.48295797707491017, "learning_rate": 1.6282439586356999e-06, "loss": 0.151, "step": 2808 }, { "epoch": 0.6328892894358858, "grad_norm": 0.4872663984317607, "learning_rate": 1.6264982296052256e-06, "loss": 0.1483, "step": 2809 }, { "epoch": 0.6331145971216943, "grad_norm": 0.44741252541960735, "learning_rate": 1.6247529855977256e-06, "loss": 0.1342, "step": 2810 }, { "epoch": 0.6333399048075028, "grad_norm": 0.4985585348392438, "learning_rate": 1.6230082275822687e-06, "loss": 0.1531, "step": 2811 }, { "epoch": 0.6335652124933112, "grad_norm": 0.4638990385262181, "learning_rate": 1.6212639565276538e-06, "loss": 0.136, "step": 2812 }, { "epoch": 0.6337905201791196, "grad_norm": 0.4650018045050846, "learning_rate": 1.6195201734024096e-06, "loss": 0.1381, "step": 2813 }, { "epoch": 0.634015827864928, "grad_norm": 0.4965714175065691, "learning_rate": 1.6177768791747957e-06, "loss": 0.1496, "step": 2814 }, { "epoch": 0.6342411355507365, "grad_norm": 0.47979460480702335, "learning_rate": 1.6160340748127959e-06, "loss": 0.1513, "step": 2815 }, { "epoch": 0.6344664432365449, "grad_norm": 0.442911411378338, "learning_rate": 1.6142917612841252e-06, "loss": 0.1299, "step": 2816 }, { "epoch": 0.6346917509223533, "grad_norm": 0.48375134793054897, "learning_rate": 1.612549939556225e-06, "loss": 0.1397, "step": 2817 }, { "epoch": 0.6349170586081617, "grad_norm": 0.4646454924086842, "learning_rate": 1.610808610596265e-06, "loss": 0.1407, "step": 2818 }, { "epoch": 0.6351423662939703, "grad_norm": 0.47665971541619717, "learning_rate": 1.6090677753711403e-06, "loss": 0.1385, "step": 2819 }, { "epoch": 0.6353676739797787, "grad_norm": 0.48639642703567026, "learning_rate": 1.607327434847471e-06, "loss": 0.1545, "step": 2820 }, { "epoch": 0.6355929816655871, "grad_norm": 0.5270081816426536, "learning_rate": 1.6055875899916034e-06, "loss": 0.1552, "step": 2821 }, { "epoch": 0.6358182893513955, "grad_norm": 0.47067747824845496, "learning_rate": 1.6038482417696095e-06, "loss": 0.1516, "step": 2822 }, { "epoch": 0.6360435970372039, "grad_norm": 0.4850765057650214, "learning_rate": 1.6021093911472825e-06, "loss": 0.1534, "step": 2823 }, { "epoch": 0.6362689047230123, "grad_norm": 0.47449309561057146, "learning_rate": 1.6003710390901434e-06, "loss": 0.1542, "step": 2824 }, { "epoch": 0.6364942124088208, "grad_norm": 0.5126023870225591, "learning_rate": 1.5986331865634335e-06, "loss": 0.1659, "step": 2825 }, { "epoch": 0.6367195200946292, "grad_norm": 0.5052408111958812, "learning_rate": 1.5968958345321178e-06, "loss": 0.1526, "step": 2826 }, { "epoch": 0.6369448277804377, "grad_norm": 0.47297868340618493, "learning_rate": 1.5951589839608828e-06, "loss": 0.1466, "step": 2827 }, { "epoch": 0.6371701354662461, "grad_norm": 0.46382611744276403, "learning_rate": 1.5934226358141368e-06, "loss": 0.1391, "step": 2828 }, { "epoch": 0.6373954431520545, "grad_norm": 0.49119112893116423, "learning_rate": 1.5916867910560092e-06, "loss": 0.1491, "step": 2829 }, { "epoch": 0.637620750837863, "grad_norm": 0.5016831914604146, "learning_rate": 1.5899514506503499e-06, "loss": 0.1649, "step": 2830 }, { "epoch": 0.6378460585236714, "grad_norm": 0.4817486742658559, "learning_rate": 1.5882166155607306e-06, "loss": 0.1592, "step": 2831 }, { "epoch": 0.6380713662094798, "grad_norm": 0.46605829216160005, "learning_rate": 1.5864822867504376e-06, "loss": 0.1467, "step": 2832 }, { "epoch": 0.6382966738952882, "grad_norm": 0.4804265967852996, "learning_rate": 1.58474846518248e-06, "loss": 0.152, "step": 2833 }, { "epoch": 0.6385219815810966, "grad_norm": 0.4457918826886218, "learning_rate": 1.5830151518195846e-06, "loss": 0.1393, "step": 2834 }, { "epoch": 0.6387472892669052, "grad_norm": 0.4545444717911307, "learning_rate": 1.5812823476241962e-06, "loss": 0.1403, "step": 2835 }, { "epoch": 0.6389725969527136, "grad_norm": 0.4756372945724833, "learning_rate": 1.5795500535584758e-06, "loss": 0.1511, "step": 2836 }, { "epoch": 0.639197904638522, "grad_norm": 0.4780977819516656, "learning_rate": 1.5778182705843017e-06, "loss": 0.1546, "step": 2837 }, { "epoch": 0.6394232123243304, "grad_norm": 0.47229095632094537, "learning_rate": 1.5760869996632685e-06, "loss": 0.1507, "step": 2838 }, { "epoch": 0.6396485200101388, "grad_norm": 0.47035365018536524, "learning_rate": 1.574356241756686e-06, "loss": 0.1427, "step": 2839 }, { "epoch": 0.6398738276959473, "grad_norm": 0.48265274771659217, "learning_rate": 1.572625997825581e-06, "loss": 0.151, "step": 2840 }, { "epoch": 0.6400991353817557, "grad_norm": 0.4813791272462431, "learning_rate": 1.5708962688306916e-06, "loss": 0.1607, "step": 2841 }, { "epoch": 0.6403244430675642, "grad_norm": 0.4931264969520297, "learning_rate": 1.5691670557324734e-06, "loss": 0.1507, "step": 2842 }, { "epoch": 0.6405497507533726, "grad_norm": 0.4891345812437931, "learning_rate": 1.5674383594910931e-06, "loss": 0.1528, "step": 2843 }, { "epoch": 0.640775058439181, "grad_norm": 0.4734338447627687, "learning_rate": 1.5657101810664314e-06, "loss": 0.1533, "step": 2844 }, { "epoch": 0.6410003661249895, "grad_norm": 0.4523902167413687, "learning_rate": 1.5639825214180808e-06, "loss": 0.1333, "step": 2845 }, { "epoch": 0.6412256738107979, "grad_norm": 0.5081364752489811, "learning_rate": 1.5622553815053476e-06, "loss": 0.1551, "step": 2846 }, { "epoch": 0.6414509814966063, "grad_norm": 0.46507190532041826, "learning_rate": 1.5605287622872478e-06, "loss": 0.1504, "step": 2847 }, { "epoch": 0.6416762891824147, "grad_norm": 0.4641771875874691, "learning_rate": 1.558802664722508e-06, "loss": 0.1371, "step": 2848 }, { "epoch": 0.6419015968682231, "grad_norm": 0.45632753268823106, "learning_rate": 1.5570770897695672e-06, "loss": 0.1445, "step": 2849 }, { "epoch": 0.6421269045540317, "grad_norm": 0.49783070157795684, "learning_rate": 1.555352038386571e-06, "loss": 0.1503, "step": 2850 }, { "epoch": 0.6423522122398401, "grad_norm": 0.46436518926277925, "learning_rate": 1.5536275115313776e-06, "loss": 0.1424, "step": 2851 }, { "epoch": 0.6425775199256485, "grad_norm": 0.48717850193393475, "learning_rate": 1.5519035101615518e-06, "loss": 0.152, "step": 2852 }, { "epoch": 0.6428028276114569, "grad_norm": 0.4910056281463092, "learning_rate": 1.5501800352343673e-06, "loss": 0.1539, "step": 2853 }, { "epoch": 0.6430281352972653, "grad_norm": 0.5118422933613982, "learning_rate": 1.5484570877068055e-06, "loss": 0.155, "step": 2854 }, { "epoch": 0.6432534429830737, "grad_norm": 0.5310399027828411, "learning_rate": 1.5467346685355553e-06, "loss": 0.1683, "step": 2855 }, { "epoch": 0.6434787506688822, "grad_norm": 0.506021853271099, "learning_rate": 1.5450127786770116e-06, "loss": 0.1571, "step": 2856 }, { "epoch": 0.6437040583546906, "grad_norm": 0.49174135641662325, "learning_rate": 1.5432914190872757e-06, "loss": 0.1518, "step": 2857 }, { "epoch": 0.6439293660404991, "grad_norm": 0.46911119250398337, "learning_rate": 1.5415705907221545e-06, "loss": 0.1364, "step": 2858 }, { "epoch": 0.6441546737263075, "grad_norm": 0.4843368641508339, "learning_rate": 1.53985029453716e-06, "loss": 0.1421, "step": 2859 }, { "epoch": 0.6443799814121159, "grad_norm": 0.5137282334737647, "learning_rate": 1.5381305314875084e-06, "loss": 0.1565, "step": 2860 }, { "epoch": 0.6446052890979244, "grad_norm": 0.4661382667029591, "learning_rate": 1.536411302528119e-06, "loss": 0.1484, "step": 2861 }, { "epoch": 0.6448305967837328, "grad_norm": 0.4698458690541328, "learning_rate": 1.5346926086136171e-06, "loss": 0.1496, "step": 2862 }, { "epoch": 0.6450559044695412, "grad_norm": 0.4734423472032637, "learning_rate": 1.5329744506983292e-06, "loss": 0.1532, "step": 2863 }, { "epoch": 0.6452812121553496, "grad_norm": 0.4793175583549265, "learning_rate": 1.5312568297362834e-06, "loss": 0.137, "step": 2864 }, { "epoch": 0.645506519841158, "grad_norm": 0.5120639200872323, "learning_rate": 1.5295397466812115e-06, "loss": 0.1515, "step": 2865 }, { "epoch": 0.6457318275269666, "grad_norm": 0.48304485914928136, "learning_rate": 1.5278232024865458e-06, "loss": 0.1449, "step": 2866 }, { "epoch": 0.645957135212775, "grad_norm": 0.478700554504123, "learning_rate": 1.5261071981054183e-06, "loss": 0.1512, "step": 2867 }, { "epoch": 0.6461824428985834, "grad_norm": 0.4884664261324441, "learning_rate": 1.5243917344906625e-06, "loss": 0.1459, "step": 2868 }, { "epoch": 0.6464077505843918, "grad_norm": 0.45259907738132826, "learning_rate": 1.5226768125948122e-06, "loss": 0.135, "step": 2869 }, { "epoch": 0.6466330582702002, "grad_norm": 0.43908078665268413, "learning_rate": 1.5209624333700985e-06, "loss": 0.1315, "step": 2870 }, { "epoch": 0.6468583659560087, "grad_norm": 0.4545357145159809, "learning_rate": 1.5192485977684528e-06, "loss": 0.1501, "step": 2871 }, { "epoch": 0.6470836736418171, "grad_norm": 0.49335283447767514, "learning_rate": 1.517535306741505e-06, "loss": 0.1559, "step": 2872 }, { "epoch": 0.6473089813276255, "grad_norm": 0.48260902139598805, "learning_rate": 1.5158225612405808e-06, "loss": 0.1402, "step": 2873 }, { "epoch": 0.647534289013434, "grad_norm": 0.4544937631611182, "learning_rate": 1.5141103622167042e-06, "loss": 0.1384, "step": 2874 }, { "epoch": 0.6477595966992424, "grad_norm": 0.4501153858395984, "learning_rate": 1.512398710620595e-06, "loss": 0.1427, "step": 2875 }, { "epoch": 0.6479849043850509, "grad_norm": 0.44247881743042544, "learning_rate": 1.51068760740267e-06, "loss": 0.1346, "step": 2876 }, { "epoch": 0.6482102120708593, "grad_norm": 0.4548380734877828, "learning_rate": 1.508977053513041e-06, "loss": 0.1371, "step": 2877 }, { "epoch": 0.6484355197566677, "grad_norm": 0.45590988633602597, "learning_rate": 1.5072670499015151e-06, "loss": 0.1413, "step": 2878 }, { "epoch": 0.6486608274424761, "grad_norm": 0.4894275363782382, "learning_rate": 1.5055575975175929e-06, "loss": 0.1607, "step": 2879 }, { "epoch": 0.6488861351282845, "grad_norm": 0.5019183155077958, "learning_rate": 1.5038486973104704e-06, "loss": 0.1625, "step": 2880 }, { "epoch": 0.6491114428140929, "grad_norm": 0.46667254314094453, "learning_rate": 1.5021403502290354e-06, "loss": 0.1344, "step": 2881 }, { "epoch": 0.6493367504999015, "grad_norm": 0.47319219186852757, "learning_rate": 1.5004325572218698e-06, "loss": 0.142, "step": 2882 }, { "epoch": 0.6495620581857099, "grad_norm": 0.4960962366132025, "learning_rate": 1.4987253192372471e-06, "loss": 0.157, "step": 2883 }, { "epoch": 0.6497873658715183, "grad_norm": 0.4787973264187811, "learning_rate": 1.4970186372231347e-06, "loss": 0.1518, "step": 2884 }, { "epoch": 0.6500126735573267, "grad_norm": 0.5159643516295653, "learning_rate": 1.4953125121271866e-06, "loss": 0.1581, "step": 2885 }, { "epoch": 0.6502379812431351, "grad_norm": 0.4665006065685688, "learning_rate": 1.493606944896751e-06, "loss": 0.1404, "step": 2886 }, { "epoch": 0.6504632889289436, "grad_norm": 0.45934804469692997, "learning_rate": 1.4919019364788678e-06, "loss": 0.1351, "step": 2887 }, { "epoch": 0.650688596614752, "grad_norm": 0.4900662596708013, "learning_rate": 1.490197487820263e-06, "loss": 0.1548, "step": 2888 }, { "epoch": 0.6509139043005604, "grad_norm": 0.4609436190944118, "learning_rate": 1.4884935998673539e-06, "loss": 0.1472, "step": 2889 }, { "epoch": 0.6511392119863689, "grad_norm": 0.4762971414293838, "learning_rate": 1.486790273566246e-06, "loss": 0.1546, "step": 2890 }, { "epoch": 0.6513645196721773, "grad_norm": 0.4556504565623819, "learning_rate": 1.4850875098627326e-06, "loss": 0.1453, "step": 2891 }, { "epoch": 0.6515898273579858, "grad_norm": 0.49763192413888446, "learning_rate": 1.483385309702295e-06, "loss": 0.1561, "step": 2892 }, { "epoch": 0.6518151350437942, "grad_norm": 0.4766316614497349, "learning_rate": 1.4816836740301019e-06, "loss": 0.1513, "step": 2893 }, { "epoch": 0.6520404427296026, "grad_norm": 0.5184596612142391, "learning_rate": 1.4799826037910082e-06, "loss": 0.1699, "step": 2894 }, { "epoch": 0.652265750415411, "grad_norm": 0.4536514939994665, "learning_rate": 1.478282099929554e-06, "loss": 0.1354, "step": 2895 }, { "epoch": 0.6524910581012194, "grad_norm": 0.4974095639499098, "learning_rate": 1.4765821633899663e-06, "loss": 0.151, "step": 2896 }, { "epoch": 0.652716365787028, "grad_norm": 0.4712272360337234, "learning_rate": 1.4748827951161566e-06, "loss": 0.1417, "step": 2897 }, { "epoch": 0.6529416734728364, "grad_norm": 0.475826250525577, "learning_rate": 1.4731839960517202e-06, "loss": 0.1504, "step": 2898 }, { "epoch": 0.6531669811586448, "grad_norm": 0.5037284144728293, "learning_rate": 1.4714857671399374e-06, "loss": 0.1651, "step": 2899 }, { "epoch": 0.6533922888444532, "grad_norm": 0.45473879817414803, "learning_rate": 1.4697881093237714e-06, "loss": 0.144, "step": 2900 }, { "epoch": 0.6536175965302616, "grad_norm": 0.47770176405517745, "learning_rate": 1.4680910235458692e-06, "loss": 0.1471, "step": 2901 }, { "epoch": 0.65384290421607, "grad_norm": 0.4723218687376834, "learning_rate": 1.4663945107485567e-06, "loss": 0.1431, "step": 2902 }, { "epoch": 0.6540682119018785, "grad_norm": 0.47416198265004644, "learning_rate": 1.4646985718738466e-06, "loss": 0.1491, "step": 2903 }, { "epoch": 0.6542935195876869, "grad_norm": 0.47272637335722917, "learning_rate": 1.4630032078634293e-06, "loss": 0.1442, "step": 2904 }, { "epoch": 0.6545188272734954, "grad_norm": 0.47902953399381804, "learning_rate": 1.461308419658678e-06, "loss": 0.1468, "step": 2905 }, { "epoch": 0.6547441349593038, "grad_norm": 0.4592522287454078, "learning_rate": 1.4596142082006448e-06, "loss": 0.1387, "step": 2906 }, { "epoch": 0.6549694426451123, "grad_norm": 0.5219859102858139, "learning_rate": 1.457920574430062e-06, "loss": 0.1742, "step": 2907 }, { "epoch": 0.6551947503309207, "grad_norm": 0.485989238913835, "learning_rate": 1.456227519287343e-06, "loss": 0.1563, "step": 2908 }, { "epoch": 0.6554200580167291, "grad_norm": 0.4718340205076326, "learning_rate": 1.4545350437125755e-06, "loss": 0.1429, "step": 2909 }, { "epoch": 0.6556453657025375, "grad_norm": 0.4990272336147076, "learning_rate": 1.4528431486455311e-06, "loss": 0.1543, "step": 2910 }, { "epoch": 0.6558706733883459, "grad_norm": 0.48413560475749645, "learning_rate": 1.451151835025653e-06, "loss": 0.1483, "step": 2911 }, { "epoch": 0.6560959810741543, "grad_norm": 0.47270868455230225, "learning_rate": 1.4494611037920667e-06, "loss": 0.1396, "step": 2912 }, { "epoch": 0.6563212887599629, "grad_norm": 0.47527159314782774, "learning_rate": 1.4477709558835724e-06, "loss": 0.1485, "step": 2913 }, { "epoch": 0.6565465964457713, "grad_norm": 0.4760435028537632, "learning_rate": 1.4460813922386446e-06, "loss": 0.1474, "step": 2914 }, { "epoch": 0.6567719041315797, "grad_norm": 0.4593543242080812, "learning_rate": 1.4443924137954368e-06, "loss": 0.1367, "step": 2915 }, { "epoch": 0.6569972118173881, "grad_norm": 0.4850288721265088, "learning_rate": 1.4427040214917742e-06, "loss": 0.1472, "step": 2916 }, { "epoch": 0.6572225195031965, "grad_norm": 0.4735458603071915, "learning_rate": 1.4410162162651586e-06, "loss": 0.1457, "step": 2917 }, { "epoch": 0.657447827189005, "grad_norm": 0.4919094654596746, "learning_rate": 1.4393289990527665e-06, "loss": 0.1353, "step": 2918 }, { "epoch": 0.6576731348748134, "grad_norm": 0.4858859607661872, "learning_rate": 1.4376423707914462e-06, "loss": 0.1494, "step": 2919 }, { "epoch": 0.6578984425606218, "grad_norm": 0.4690348062287447, "learning_rate": 1.4359563324177176e-06, "loss": 0.138, "step": 2920 }, { "epoch": 0.6581237502464303, "grad_norm": 0.46664358476670137, "learning_rate": 1.4342708848677774e-06, "loss": 0.1413, "step": 2921 }, { "epoch": 0.6583490579322387, "grad_norm": 0.46423462010928873, "learning_rate": 1.43258602907749e-06, "loss": 0.1447, "step": 2922 }, { "epoch": 0.6585743656180472, "grad_norm": 0.4702745421807713, "learning_rate": 1.430901765982395e-06, "loss": 0.1524, "step": 2923 }, { "epoch": 0.6587996733038556, "grad_norm": 0.48845161374721735, "learning_rate": 1.429218096517699e-06, "loss": 0.1502, "step": 2924 }, { "epoch": 0.659024980989664, "grad_norm": 0.4176507440012528, "learning_rate": 1.4275350216182824e-06, "loss": 0.1187, "step": 2925 }, { "epoch": 0.6592502886754724, "grad_norm": 0.4989305833263455, "learning_rate": 1.425852542218692e-06, "loss": 0.1602, "step": 2926 }, { "epoch": 0.6594755963612808, "grad_norm": 0.47470946640752715, "learning_rate": 1.4241706592531473e-06, "loss": 0.1491, "step": 2927 }, { "epoch": 0.6597009040470893, "grad_norm": 0.4600229064740794, "learning_rate": 1.4224893736555364e-06, "loss": 0.1499, "step": 2928 }, { "epoch": 0.6599262117328978, "grad_norm": 0.48995371068382887, "learning_rate": 1.420808686359412e-06, "loss": 0.1559, "step": 2929 }, { "epoch": 0.6601515194187062, "grad_norm": 0.4815411645705524, "learning_rate": 1.4191285982979992e-06, "loss": 0.1381, "step": 2930 }, { "epoch": 0.6603768271045146, "grad_norm": 0.47921268405426853, "learning_rate": 1.4174491104041866e-06, "loss": 0.1555, "step": 2931 }, { "epoch": 0.660602134790323, "grad_norm": 0.48304499533202994, "learning_rate": 1.4157702236105326e-06, "loss": 0.1555, "step": 2932 }, { "epoch": 0.6608274424761315, "grad_norm": 0.48077288960623543, "learning_rate": 1.414091938849259e-06, "loss": 0.1466, "step": 2933 }, { "epoch": 0.6610527501619399, "grad_norm": 0.46750831606409243, "learning_rate": 1.412414257052256e-06, "loss": 0.1424, "step": 2934 }, { "epoch": 0.6612780578477483, "grad_norm": 0.4999024192245585, "learning_rate": 1.410737179151078e-06, "loss": 0.1558, "step": 2935 }, { "epoch": 0.6615033655335567, "grad_norm": 0.45918538077085974, "learning_rate": 1.4090607060769423e-06, "loss": 0.1368, "step": 2936 }, { "epoch": 0.6617286732193652, "grad_norm": 0.46695971977298123, "learning_rate": 1.407384838760734e-06, "loss": 0.1516, "step": 2937 }, { "epoch": 0.6619539809051737, "grad_norm": 0.46204373726139225, "learning_rate": 1.4057095781329983e-06, "loss": 0.1374, "step": 2938 }, { "epoch": 0.6621792885909821, "grad_norm": 0.4575466919551392, "learning_rate": 1.4040349251239444e-06, "loss": 0.1268, "step": 2939 }, { "epoch": 0.6624045962767905, "grad_norm": 0.45667917133040004, "learning_rate": 1.402360880663447e-06, "loss": 0.1446, "step": 2940 }, { "epoch": 0.6626299039625989, "grad_norm": 0.47617239978779796, "learning_rate": 1.4006874456810377e-06, "loss": 0.1508, "step": 2941 }, { "epoch": 0.6628552116484073, "grad_norm": 0.47224186074775903, "learning_rate": 1.3990146211059141e-06, "loss": 0.1518, "step": 2942 }, { "epoch": 0.6630805193342157, "grad_norm": 0.4780137911390611, "learning_rate": 1.3973424078669346e-06, "loss": 0.1491, "step": 2943 }, { "epoch": 0.6633058270200243, "grad_norm": 0.5216227820812853, "learning_rate": 1.3956708068926141e-06, "loss": 0.1639, "step": 2944 }, { "epoch": 0.6635311347058327, "grad_norm": 0.5137910334803849, "learning_rate": 1.393999819111133e-06, "loss": 0.1488, "step": 2945 }, { "epoch": 0.6637564423916411, "grad_norm": 0.47392884476866215, "learning_rate": 1.3923294454503263e-06, "loss": 0.1434, "step": 2946 }, { "epoch": 0.6639817500774495, "grad_norm": 0.46492626782696317, "learning_rate": 1.3906596868376923e-06, "loss": 0.1491, "step": 2947 }, { "epoch": 0.664207057763258, "grad_norm": 0.44234123269093023, "learning_rate": 1.3889905442003836e-06, "loss": 0.128, "step": 2948 }, { "epoch": 0.6644323654490664, "grad_norm": 0.48399930599781904, "learning_rate": 1.3873220184652143e-06, "loss": 0.1558, "step": 2949 }, { "epoch": 0.6646576731348748, "grad_norm": 0.47807200291396135, "learning_rate": 1.3856541105586545e-06, "loss": 0.1526, "step": 2950 }, { "epoch": 0.6648829808206832, "grad_norm": 0.488387437019316, "learning_rate": 1.3839868214068303e-06, "loss": 0.1495, "step": 2951 }, { "epoch": 0.6651082885064917, "grad_norm": 0.47430374069411185, "learning_rate": 1.382320151935527e-06, "loss": 0.1384, "step": 2952 }, { "epoch": 0.6653335961923001, "grad_norm": 0.4753402488559886, "learning_rate": 1.380654103070182e-06, "loss": 0.1364, "step": 2953 }, { "epoch": 0.6655589038781086, "grad_norm": 0.47795924208162, "learning_rate": 1.3789886757358916e-06, "loss": 0.15, "step": 2954 }, { "epoch": 0.665784211563917, "grad_norm": 0.49872767832251386, "learning_rate": 1.3773238708574054e-06, "loss": 0.1553, "step": 2955 }, { "epoch": 0.6660095192497254, "grad_norm": 0.46166735586442514, "learning_rate": 1.375659689359126e-06, "loss": 0.1467, "step": 2956 }, { "epoch": 0.6662348269355338, "grad_norm": 0.4504446375247205, "learning_rate": 1.3739961321651139e-06, "loss": 0.1413, "step": 2957 }, { "epoch": 0.6664601346213422, "grad_norm": 0.4831570270140901, "learning_rate": 1.3723332001990774e-06, "loss": 0.1349, "step": 2958 }, { "epoch": 0.6666854423071507, "grad_norm": 0.43536534788662207, "learning_rate": 1.3706708943843822e-06, "loss": 0.1254, "step": 2959 }, { "epoch": 0.6669107499929592, "grad_norm": 0.4635897581330502, "learning_rate": 1.369009215644046e-06, "loss": 0.1408, "step": 2960 }, { "epoch": 0.6671360576787676, "grad_norm": 0.4607848426627536, "learning_rate": 1.3673481649007347e-06, "loss": 0.1413, "step": 2961 }, { "epoch": 0.667361365364576, "grad_norm": 0.45287392564560036, "learning_rate": 1.36568774307677e-06, "loss": 0.1339, "step": 2962 }, { "epoch": 0.6675866730503844, "grad_norm": 0.4653937508230895, "learning_rate": 1.36402795109412e-06, "loss": 0.142, "step": 2963 }, { "epoch": 0.6678119807361929, "grad_norm": 0.4456093510289949, "learning_rate": 1.362368789874407e-06, "loss": 0.133, "step": 2964 }, { "epoch": 0.6680372884220013, "grad_norm": 0.47497169797980815, "learning_rate": 1.3607102603389016e-06, "loss": 0.1445, "step": 2965 }, { "epoch": 0.6682625961078097, "grad_norm": 0.4808704744452402, "learning_rate": 1.3590523634085218e-06, "loss": 0.1464, "step": 2966 }, { "epoch": 0.6684879037936181, "grad_norm": 0.46680295816534506, "learning_rate": 1.3573951000038376e-06, "loss": 0.1351, "step": 2967 }, { "epoch": 0.6687132114794266, "grad_norm": 0.46521906793161333, "learning_rate": 1.3557384710450644e-06, "loss": 0.159, "step": 2968 }, { "epoch": 0.6689385191652351, "grad_norm": 0.46029143620950225, "learning_rate": 1.3540824774520678e-06, "loss": 0.1437, "step": 2969 }, { "epoch": 0.6691638268510435, "grad_norm": 0.4681229343815473, "learning_rate": 1.3524271201443578e-06, "loss": 0.1382, "step": 2970 }, { "epoch": 0.6693891345368519, "grad_norm": 0.5077166184117899, "learning_rate": 1.3507724000410933e-06, "loss": 0.1567, "step": 2971 }, { "epoch": 0.6696144422226603, "grad_norm": 0.43495619042875877, "learning_rate": 1.3491183180610807e-06, "loss": 0.1383, "step": 2972 }, { "epoch": 0.6698397499084687, "grad_norm": 0.45458783126562835, "learning_rate": 1.347464875122766e-06, "loss": 0.134, "step": 2973 }, { "epoch": 0.6700650575942771, "grad_norm": 0.44400295352887237, "learning_rate": 1.3458120721442464e-06, "loss": 0.1316, "step": 2974 }, { "epoch": 0.6702903652800856, "grad_norm": 0.45680543296708814, "learning_rate": 1.3441599100432635e-06, "loss": 0.135, "step": 2975 }, { "epoch": 0.6705156729658941, "grad_norm": 0.46350076737877277, "learning_rate": 1.3425083897371983e-06, "loss": 0.1365, "step": 2976 }, { "epoch": 0.6707409806517025, "grad_norm": 0.4925752036708255, "learning_rate": 1.3408575121430812e-06, "loss": 0.1456, "step": 2977 }, { "epoch": 0.6709662883375109, "grad_norm": 0.48149834877938713, "learning_rate": 1.3392072781775806e-06, "loss": 0.1448, "step": 2978 }, { "epoch": 0.6711915960233193, "grad_norm": 0.44479976304247376, "learning_rate": 1.337557688757012e-06, "loss": 0.1298, "step": 2979 }, { "epoch": 0.6714169037091278, "grad_norm": 0.49638144928389527, "learning_rate": 1.335908744797329e-06, "loss": 0.1478, "step": 2980 }, { "epoch": 0.6716422113949362, "grad_norm": 0.496661796314984, "learning_rate": 1.3342604472141296e-06, "loss": 0.154, "step": 2981 }, { "epoch": 0.6718675190807446, "grad_norm": 0.48523105822115803, "learning_rate": 1.3326127969226535e-06, "loss": 0.1545, "step": 2982 }, { "epoch": 0.672092826766553, "grad_norm": 0.4706474816730592, "learning_rate": 1.3309657948377768e-06, "loss": 0.1444, "step": 2983 }, { "epoch": 0.6723181344523615, "grad_norm": 0.46988659672089167, "learning_rate": 1.3293194418740207e-06, "loss": 0.146, "step": 2984 }, { "epoch": 0.67254344213817, "grad_norm": 0.47750222447061125, "learning_rate": 1.3276737389455416e-06, "loss": 0.1449, "step": 2985 }, { "epoch": 0.6727687498239784, "grad_norm": 0.4842288786003118, "learning_rate": 1.3260286869661378e-06, "loss": 0.1449, "step": 2986 }, { "epoch": 0.6729940575097868, "grad_norm": 0.47033341161419007, "learning_rate": 1.3243842868492468e-06, "loss": 0.1364, "step": 2987 }, { "epoch": 0.6732193651955952, "grad_norm": 0.47443391802769397, "learning_rate": 1.32274053950794e-06, "loss": 0.1436, "step": 2988 }, { "epoch": 0.6734446728814036, "grad_norm": 0.4731098263985566, "learning_rate": 1.3210974458549318e-06, "loss": 0.1454, "step": 2989 }, { "epoch": 0.6736699805672121, "grad_norm": 0.46654736287810944, "learning_rate": 1.3194550068025697e-06, "loss": 0.1466, "step": 2990 }, { "epoch": 0.6738952882530206, "grad_norm": 0.49532064733223036, "learning_rate": 1.3178132232628374e-06, "loss": 0.1478, "step": 2991 }, { "epoch": 0.674120595938829, "grad_norm": 0.4895887904560521, "learning_rate": 1.3161720961473583e-06, "loss": 0.1505, "step": 2992 }, { "epoch": 0.6743459036246374, "grad_norm": 0.47395701914670035, "learning_rate": 1.3145316263673874e-06, "loss": 0.1354, "step": 2993 }, { "epoch": 0.6745712113104458, "grad_norm": 0.49688525999940936, "learning_rate": 1.3128918148338183e-06, "loss": 0.1556, "step": 2994 }, { "epoch": 0.6747965189962543, "grad_norm": 0.48482779160388284, "learning_rate": 1.3112526624571753e-06, "loss": 0.1534, "step": 2995 }, { "epoch": 0.6750218266820627, "grad_norm": 0.4970336841742754, "learning_rate": 1.3096141701476189e-06, "loss": 0.1544, "step": 2996 }, { "epoch": 0.6752471343678711, "grad_norm": 0.49387939090629773, "learning_rate": 1.307976338814945e-06, "loss": 0.1533, "step": 2997 }, { "epoch": 0.6754724420536795, "grad_norm": 0.4988557664085388, "learning_rate": 1.3063391693685773e-06, "loss": 0.1691, "step": 2998 }, { "epoch": 0.675697749739488, "grad_norm": 0.45405104965930987, "learning_rate": 1.3047026627175774e-06, "loss": 0.1387, "step": 2999 }, { "epoch": 0.6759230574252965, "grad_norm": 0.47423882703768094, "learning_rate": 1.3030668197706347e-06, "loss": 0.1417, "step": 3000 }, { "epoch": 0.6759230574252965, "eval_loss": 0.1473056972026825, "eval_runtime": 56.9201, "eval_samples_per_second": 50.422, "eval_steps_per_second": 6.307, "step": 3000 }, { "epoch": 0.6761483651111049, "grad_norm": 0.4505313679594971, "learning_rate": 1.3014316414360732e-06, "loss": 0.1344, "step": 3001 }, { "epoch": 0.6763736727969133, "grad_norm": 0.4559618012667919, "learning_rate": 1.2997971286218448e-06, "loss": 0.1314, "step": 3002 }, { "epoch": 0.6765989804827217, "grad_norm": 0.47844832165319234, "learning_rate": 1.2981632822355344e-06, "loss": 0.1422, "step": 3003 }, { "epoch": 0.6768242881685301, "grad_norm": 0.48766877285251875, "learning_rate": 1.2965301031843574e-06, "loss": 0.1468, "step": 3004 }, { "epoch": 0.6770495958543385, "grad_norm": 0.458622595710425, "learning_rate": 1.294897592375155e-06, "loss": 0.1309, "step": 3005 }, { "epoch": 0.677274903540147, "grad_norm": 0.46647292917179217, "learning_rate": 1.2932657507144014e-06, "loss": 0.147, "step": 3006 }, { "epoch": 0.6775002112259555, "grad_norm": 0.4655457149851092, "learning_rate": 1.2916345791081964e-06, "loss": 0.1458, "step": 3007 }, { "epoch": 0.6777255189117639, "grad_norm": 0.4510539918644756, "learning_rate": 1.2900040784622686e-06, "loss": 0.1437, "step": 3008 }, { "epoch": 0.6779508265975723, "grad_norm": 0.4524162591560385, "learning_rate": 1.2883742496819751e-06, "loss": 0.1357, "step": 3009 }, { "epoch": 0.6781761342833807, "grad_norm": 0.4821849125579185, "learning_rate": 1.286745093672298e-06, "loss": 0.1477, "step": 3010 }, { "epoch": 0.6784014419691892, "grad_norm": 0.49276174937941813, "learning_rate": 1.2851166113378471e-06, "loss": 0.1514, "step": 3011 }, { "epoch": 0.6786267496549976, "grad_norm": 0.45060096939936306, "learning_rate": 1.2834888035828597e-06, "loss": 0.1368, "step": 3012 }, { "epoch": 0.678852057340806, "grad_norm": 0.46775710048149644, "learning_rate": 1.2818616713111945e-06, "loss": 0.1487, "step": 3013 }, { "epoch": 0.6790773650266144, "grad_norm": 0.4624662532836971, "learning_rate": 1.2802352154263392e-06, "loss": 0.1385, "step": 3014 }, { "epoch": 0.679302672712423, "grad_norm": 0.4537436062939665, "learning_rate": 1.2786094368314023e-06, "loss": 0.1345, "step": 3015 }, { "epoch": 0.6795279803982314, "grad_norm": 0.508926898966882, "learning_rate": 1.2769843364291202e-06, "loss": 0.1535, "step": 3016 }, { "epoch": 0.6797532880840398, "grad_norm": 0.4896959769941762, "learning_rate": 1.2753599151218483e-06, "loss": 0.1456, "step": 3017 }, { "epoch": 0.6799785957698482, "grad_norm": 0.44335947567847883, "learning_rate": 1.2737361738115681e-06, "loss": 0.1262, "step": 3018 }, { "epoch": 0.6802039034556566, "grad_norm": 0.49082267300728155, "learning_rate": 1.2721131133998837e-06, "loss": 0.1467, "step": 3019 }, { "epoch": 0.680429211141465, "grad_norm": 0.49221069818725716, "learning_rate": 1.2704907347880185e-06, "loss": 0.1507, "step": 3020 }, { "epoch": 0.6806545188272735, "grad_norm": 0.48681513514678987, "learning_rate": 1.2688690388768205e-06, "loss": 0.1538, "step": 3021 }, { "epoch": 0.6808798265130819, "grad_norm": 0.5121121495883596, "learning_rate": 1.2672480265667553e-06, "loss": 0.1539, "step": 3022 }, { "epoch": 0.6811051341988904, "grad_norm": 0.4926215441022388, "learning_rate": 1.2656276987579118e-06, "loss": 0.1554, "step": 3023 }, { "epoch": 0.6813304418846988, "grad_norm": 0.48046985165609934, "learning_rate": 1.2640080563499977e-06, "loss": 0.1402, "step": 3024 }, { "epoch": 0.6815557495705072, "grad_norm": 0.46929362908973327, "learning_rate": 1.2623891002423383e-06, "loss": 0.1438, "step": 3025 }, { "epoch": 0.6817810572563157, "grad_norm": 0.48803921565185054, "learning_rate": 1.2607708313338818e-06, "loss": 0.1614, "step": 3026 }, { "epoch": 0.6820063649421241, "grad_norm": 0.5093866941830406, "learning_rate": 1.2591532505231906e-06, "loss": 0.1593, "step": 3027 }, { "epoch": 0.6822316726279325, "grad_norm": 0.4876402897754427, "learning_rate": 1.2575363587084486e-06, "loss": 0.1444, "step": 3028 }, { "epoch": 0.6824569803137409, "grad_norm": 0.45939270218193157, "learning_rate": 1.2559201567874554e-06, "loss": 0.1439, "step": 3029 }, { "epoch": 0.6826822879995493, "grad_norm": 0.4936028113453703, "learning_rate": 1.2543046456576267e-06, "loss": 0.1494, "step": 3030 }, { "epoch": 0.6829075956853579, "grad_norm": 0.5003307934170121, "learning_rate": 1.252689826215997e-06, "loss": 0.1364, "step": 3031 }, { "epoch": 0.6831329033711663, "grad_norm": 0.47887429632024237, "learning_rate": 1.2510756993592138e-06, "loss": 0.1411, "step": 3032 }, { "epoch": 0.6833582110569747, "grad_norm": 0.4563775894109953, "learning_rate": 1.2494622659835421e-06, "loss": 0.1361, "step": 3033 }, { "epoch": 0.6835835187427831, "grad_norm": 0.4534763623106835, "learning_rate": 1.2478495269848626e-06, "loss": 0.1395, "step": 3034 }, { "epoch": 0.6838088264285915, "grad_norm": 0.5089995426109665, "learning_rate": 1.246237483258667e-06, "loss": 0.1511, "step": 3035 }, { "epoch": 0.6840341341144, "grad_norm": 0.47351700691400833, "learning_rate": 1.2446261357000655e-06, "loss": 0.1314, "step": 3036 }, { "epoch": 0.6842594418002084, "grad_norm": 0.4435009128269627, "learning_rate": 1.243015485203777e-06, "loss": 0.1355, "step": 3037 }, { "epoch": 0.6844847494860169, "grad_norm": 0.4581160172261627, "learning_rate": 1.2414055326641378e-06, "loss": 0.1481, "step": 3038 }, { "epoch": 0.6847100571718253, "grad_norm": 0.45155658956140055, "learning_rate": 1.2397962789750923e-06, "loss": 0.1355, "step": 3039 }, { "epoch": 0.6849353648576337, "grad_norm": 0.4907567897160798, "learning_rate": 1.2381877250302002e-06, "loss": 0.1566, "step": 3040 }, { "epoch": 0.6851606725434422, "grad_norm": 0.4899167405839578, "learning_rate": 1.236579871722633e-06, "loss": 0.1546, "step": 3041 }, { "epoch": 0.6853859802292506, "grad_norm": 0.5133340760228167, "learning_rate": 1.2349727199451696e-06, "loss": 0.16, "step": 3042 }, { "epoch": 0.685611287915059, "grad_norm": 0.45618746757252454, "learning_rate": 1.233366270590202e-06, "loss": 0.1421, "step": 3043 }, { "epoch": 0.6858365956008674, "grad_norm": 0.5079427568416787, "learning_rate": 1.2317605245497324e-06, "loss": 0.1442, "step": 3044 }, { "epoch": 0.6860619032866758, "grad_norm": 0.4735908283197607, "learning_rate": 1.2301554827153703e-06, "loss": 0.1444, "step": 3045 }, { "epoch": 0.6862872109724844, "grad_norm": 0.509591432751062, "learning_rate": 1.2285511459783373e-06, "loss": 0.1619, "step": 3046 }, { "epoch": 0.6865125186582928, "grad_norm": 0.4331611286622087, "learning_rate": 1.2269475152294601e-06, "loss": 0.1287, "step": 3047 }, { "epoch": 0.6867378263441012, "grad_norm": 0.4767098179638202, "learning_rate": 1.225344591359177e-06, "loss": 0.1425, "step": 3048 }, { "epoch": 0.6869631340299096, "grad_norm": 0.4870537149018392, "learning_rate": 1.2237423752575297e-06, "loss": 0.1539, "step": 3049 }, { "epoch": 0.687188441715718, "grad_norm": 0.4686958612332258, "learning_rate": 1.2221408678141702e-06, "loss": 0.1475, "step": 3050 }, { "epoch": 0.6874137494015264, "grad_norm": 0.4866315832405193, "learning_rate": 1.220540069918357e-06, "loss": 0.1494, "step": 3051 }, { "epoch": 0.6876390570873349, "grad_norm": 0.44919615760515846, "learning_rate": 1.2189399824589513e-06, "loss": 0.1344, "step": 3052 }, { "epoch": 0.6878643647731433, "grad_norm": 0.49054924470874584, "learning_rate": 1.217340606324424e-06, "loss": 0.1619, "step": 3053 }, { "epoch": 0.6880896724589518, "grad_norm": 0.47122438915104853, "learning_rate": 1.2157419424028473e-06, "loss": 0.1525, "step": 3054 }, { "epoch": 0.6883149801447602, "grad_norm": 0.47019681817702275, "learning_rate": 1.2141439915819008e-06, "loss": 0.1462, "step": 3055 }, { "epoch": 0.6885402878305686, "grad_norm": 0.5068235152597098, "learning_rate": 1.2125467547488676e-06, "loss": 0.1657, "step": 3056 }, { "epoch": 0.6887655955163771, "grad_norm": 0.48158637474706467, "learning_rate": 1.210950232790632e-06, "loss": 0.1539, "step": 3057 }, { "epoch": 0.6889909032021855, "grad_norm": 0.47591504746128516, "learning_rate": 1.2093544265936848e-06, "loss": 0.1462, "step": 3058 }, { "epoch": 0.6892162108879939, "grad_norm": 0.48149830718783326, "learning_rate": 1.2077593370441165e-06, "loss": 0.1446, "step": 3059 }, { "epoch": 0.6894415185738023, "grad_norm": 0.47270146200564156, "learning_rate": 1.206164965027622e-06, "loss": 0.1472, "step": 3060 }, { "epoch": 0.6896668262596107, "grad_norm": 0.46228603096579945, "learning_rate": 1.204571311429496e-06, "loss": 0.1417, "step": 3061 }, { "epoch": 0.6898921339454193, "grad_norm": 0.46404709896039853, "learning_rate": 1.2029783771346344e-06, "loss": 0.1498, "step": 3062 }, { "epoch": 0.6901174416312277, "grad_norm": 0.4831439997979854, "learning_rate": 1.2013861630275353e-06, "loss": 0.1541, "step": 3063 }, { "epoch": 0.6903427493170361, "grad_norm": 0.4737086389514101, "learning_rate": 1.1997946699922946e-06, "loss": 0.1419, "step": 3064 }, { "epoch": 0.6905680570028445, "grad_norm": 0.4690439737437469, "learning_rate": 1.1982038989126096e-06, "loss": 0.1466, "step": 3065 }, { "epoch": 0.6907933646886529, "grad_norm": 0.4594926738760316, "learning_rate": 1.1966138506717776e-06, "loss": 0.1465, "step": 3066 }, { "epoch": 0.6910186723744614, "grad_norm": 0.4779588042627649, "learning_rate": 1.195024526152691e-06, "loss": 0.1525, "step": 3067 }, { "epoch": 0.6912439800602698, "grad_norm": 0.4929196065929435, "learning_rate": 1.1934359262378443e-06, "loss": 0.1551, "step": 3068 }, { "epoch": 0.6914692877460782, "grad_norm": 0.47498771172548415, "learning_rate": 1.1918480518093259e-06, "loss": 0.1418, "step": 3069 }, { "epoch": 0.6916945954318867, "grad_norm": 0.46425031665508576, "learning_rate": 1.190260903748825e-06, "loss": 0.1239, "step": 3070 }, { "epoch": 0.6919199031176951, "grad_norm": 0.47318505042928816, "learning_rate": 1.1886744829376243e-06, "loss": 0.1431, "step": 3071 }, { "epoch": 0.6921452108035036, "grad_norm": 0.4795022202216388, "learning_rate": 1.187088790256605e-06, "loss": 0.1509, "step": 3072 }, { "epoch": 0.692370518489312, "grad_norm": 0.4894364245128437, "learning_rate": 1.185503826586244e-06, "loss": 0.163, "step": 3073 }, { "epoch": 0.6925958261751204, "grad_norm": 0.4731165025423434, "learning_rate": 1.1839195928066101e-06, "loss": 0.1475, "step": 3074 }, { "epoch": 0.6928211338609288, "grad_norm": 0.5098560337269166, "learning_rate": 1.1823360897973723e-06, "loss": 0.166, "step": 3075 }, { "epoch": 0.6930464415467372, "grad_norm": 0.4565925376100047, "learning_rate": 1.1807533184377882e-06, "loss": 0.1389, "step": 3076 }, { "epoch": 0.6932717492325456, "grad_norm": 0.4405166749911481, "learning_rate": 1.1791712796067134e-06, "loss": 0.1278, "step": 3077 }, { "epoch": 0.6934970569183542, "grad_norm": 0.4963434131636802, "learning_rate": 1.1775899741825947e-06, "loss": 0.1474, "step": 3078 }, { "epoch": 0.6937223646041626, "grad_norm": 0.48453922212336464, "learning_rate": 1.176009403043471e-06, "loss": 0.1501, "step": 3079 }, { "epoch": 0.693947672289971, "grad_norm": 0.5067028827209051, "learning_rate": 1.1744295670669752e-06, "loss": 0.1496, "step": 3080 }, { "epoch": 0.6941729799757794, "grad_norm": 0.5177069478578376, "learning_rate": 1.1728504671303326e-06, "loss": 0.1523, "step": 3081 }, { "epoch": 0.6943982876615878, "grad_norm": 0.4742299577477372, "learning_rate": 1.171272104110356e-06, "loss": 0.1474, "step": 3082 }, { "epoch": 0.6946235953473963, "grad_norm": 0.4576067845006196, "learning_rate": 1.1696944788834546e-06, "loss": 0.1444, "step": 3083 }, { "epoch": 0.6948489030332047, "grad_norm": 0.4799393814344733, "learning_rate": 1.168117592325622e-06, "loss": 0.1508, "step": 3084 }, { "epoch": 0.6950742107190131, "grad_norm": 0.4659138517296435, "learning_rate": 1.1665414453124468e-06, "loss": 0.1438, "step": 3085 }, { "epoch": 0.6952995184048216, "grad_norm": 0.48305159441188866, "learning_rate": 1.1649660387191027e-06, "loss": 0.1427, "step": 3086 }, { "epoch": 0.69552482609063, "grad_norm": 0.4772218284362722, "learning_rate": 1.1633913734203552e-06, "loss": 0.149, "step": 3087 }, { "epoch": 0.6957501337764385, "grad_norm": 0.45069555823830265, "learning_rate": 1.1618174502905586e-06, "loss": 0.1402, "step": 3088 }, { "epoch": 0.6959754414622469, "grad_norm": 0.5075563908277877, "learning_rate": 1.1602442702036513e-06, "loss": 0.1573, "step": 3089 }, { "epoch": 0.6962007491480553, "grad_norm": 0.49676498026711446, "learning_rate": 1.1586718340331634e-06, "loss": 0.1502, "step": 3090 }, { "epoch": 0.6964260568338637, "grad_norm": 0.4874275007939701, "learning_rate": 1.1571001426522088e-06, "loss": 0.1445, "step": 3091 }, { "epoch": 0.6966513645196721, "grad_norm": 0.48490688683211536, "learning_rate": 1.1555291969334907e-06, "loss": 0.1508, "step": 3092 }, { "epoch": 0.6968766722054807, "grad_norm": 0.4492305244596637, "learning_rate": 1.1539589977492946e-06, "loss": 0.1227, "step": 3093 }, { "epoch": 0.6971019798912891, "grad_norm": 0.5051266177903154, "learning_rate": 1.1523895459714948e-06, "loss": 0.1545, "step": 3094 }, { "epoch": 0.6973272875770975, "grad_norm": 0.4926487663638566, "learning_rate": 1.1508208424715511e-06, "loss": 0.153, "step": 3095 }, { "epoch": 0.6975525952629059, "grad_norm": 0.4848838310155643, "learning_rate": 1.1492528881205027e-06, "loss": 0.1451, "step": 3096 }, { "epoch": 0.6977779029487143, "grad_norm": 0.46075829468760326, "learning_rate": 1.1476856837889774e-06, "loss": 0.1488, "step": 3097 }, { "epoch": 0.6980032106345228, "grad_norm": 0.49275519224627967, "learning_rate": 1.146119230347187e-06, "loss": 0.15, "step": 3098 }, { "epoch": 0.6982285183203312, "grad_norm": 0.4629973565314605, "learning_rate": 1.1445535286649223e-06, "loss": 0.1393, "step": 3099 }, { "epoch": 0.6984538260061396, "grad_norm": 0.45565503946869035, "learning_rate": 1.142988579611561e-06, "loss": 0.1418, "step": 3100 }, { "epoch": 0.6986791336919481, "grad_norm": 0.4760860016759647, "learning_rate": 1.1414243840560595e-06, "loss": 0.1445, "step": 3101 }, { "epoch": 0.6989044413777565, "grad_norm": 0.46767206653780474, "learning_rate": 1.1398609428669582e-06, "loss": 0.1498, "step": 3102 }, { "epoch": 0.699129749063565, "grad_norm": 0.4712615428373321, "learning_rate": 1.1382982569123781e-06, "loss": 0.1312, "step": 3103 }, { "epoch": 0.6993550567493734, "grad_norm": 0.4831477550551603, "learning_rate": 1.136736327060019e-06, "loss": 0.1421, "step": 3104 }, { "epoch": 0.6995803644351818, "grad_norm": 0.49741492983541774, "learning_rate": 1.1351751541771644e-06, "loss": 0.1535, "step": 3105 }, { "epoch": 0.6998056721209902, "grad_norm": 0.48827851063729416, "learning_rate": 1.133614739130673e-06, "loss": 0.1436, "step": 3106 }, { "epoch": 0.7000309798067986, "grad_norm": 0.49902832291234195, "learning_rate": 1.1320550827869875e-06, "loss": 0.1606, "step": 3107 }, { "epoch": 0.700256287492607, "grad_norm": 0.5116087479087541, "learning_rate": 1.1304961860121246e-06, "loss": 0.1691, "step": 3108 }, { "epoch": 0.7004815951784156, "grad_norm": 0.510097472402678, "learning_rate": 1.128938049671683e-06, "loss": 0.1616, "step": 3109 }, { "epoch": 0.700706902864224, "grad_norm": 0.4833713433695349, "learning_rate": 1.127380674630838e-06, "loss": 0.152, "step": 3110 }, { "epoch": 0.7009322105500324, "grad_norm": 0.47423236577069905, "learning_rate": 1.1258240617543407e-06, "loss": 0.1496, "step": 3111 }, { "epoch": 0.7011575182358408, "grad_norm": 0.4613882318970388, "learning_rate": 1.1242682119065217e-06, "loss": 0.14, "step": 3112 }, { "epoch": 0.7013828259216492, "grad_norm": 0.5157137088816274, "learning_rate": 1.1227131259512857e-06, "loss": 0.1586, "step": 3113 }, { "epoch": 0.7016081336074577, "grad_norm": 0.49516447557609644, "learning_rate": 1.121158804752113e-06, "loss": 0.1527, "step": 3114 }, { "epoch": 0.7018334412932661, "grad_norm": 0.4952901654808642, "learning_rate": 1.119605249172062e-06, "loss": 0.1513, "step": 3115 }, { "epoch": 0.7020587489790745, "grad_norm": 0.4724530964207107, "learning_rate": 1.1180524600737624e-06, "loss": 0.1443, "step": 3116 }, { "epoch": 0.702284056664883, "grad_norm": 0.4810529386464414, "learning_rate": 1.1165004383194218e-06, "loss": 0.1541, "step": 3117 }, { "epoch": 0.7025093643506914, "grad_norm": 0.49948631908338825, "learning_rate": 1.1149491847708186e-06, "loss": 0.1446, "step": 3118 }, { "epoch": 0.7027346720364999, "grad_norm": 0.46435650648448656, "learning_rate": 1.1133987002893062e-06, "loss": 0.1402, "step": 3119 }, { "epoch": 0.7029599797223083, "grad_norm": 0.4748936522784897, "learning_rate": 1.1118489857358129e-06, "loss": 0.1434, "step": 3120 }, { "epoch": 0.7031852874081167, "grad_norm": 0.4742386565536852, "learning_rate": 1.1103000419708347e-06, "loss": 0.1406, "step": 3121 }, { "epoch": 0.7034105950939251, "grad_norm": 0.47132136003416986, "learning_rate": 1.1087518698544444e-06, "loss": 0.145, "step": 3122 }, { "epoch": 0.7036359027797335, "grad_norm": 0.48013507541614425, "learning_rate": 1.1072044702462825e-06, "loss": 0.1581, "step": 3123 }, { "epoch": 0.703861210465542, "grad_norm": 0.49721536280253587, "learning_rate": 1.1056578440055631e-06, "loss": 0.153, "step": 3124 }, { "epoch": 0.7040865181513505, "grad_norm": 0.49307643608270285, "learning_rate": 1.1041119919910715e-06, "loss": 0.148, "step": 3125 }, { "epoch": 0.7043118258371589, "grad_norm": 0.47170175416938803, "learning_rate": 1.1025669150611594e-06, "loss": 0.1481, "step": 3126 }, { "epoch": 0.7045371335229673, "grad_norm": 0.48379362267533554, "learning_rate": 1.101022614073752e-06, "loss": 0.1413, "step": 3127 }, { "epoch": 0.7047624412087757, "grad_norm": 0.4795946087501419, "learning_rate": 1.0994790898863409e-06, "loss": 0.1521, "step": 3128 }, { "epoch": 0.7049877488945842, "grad_norm": 0.4634515948168377, "learning_rate": 1.0979363433559892e-06, "loss": 0.1386, "step": 3129 }, { "epoch": 0.7052130565803926, "grad_norm": 0.4671318265320663, "learning_rate": 1.0963943753393252e-06, "loss": 0.1414, "step": 3130 }, { "epoch": 0.705438364266201, "grad_norm": 0.46605061932981395, "learning_rate": 1.094853186692546e-06, "loss": 0.1313, "step": 3131 }, { "epoch": 0.7056636719520094, "grad_norm": 0.445158905701931, "learning_rate": 1.0933127782714175e-06, "loss": 0.1295, "step": 3132 }, { "epoch": 0.7058889796378179, "grad_norm": 0.46717957236013813, "learning_rate": 1.0917731509312696e-06, "loss": 0.1419, "step": 3133 }, { "epoch": 0.7061142873236264, "grad_norm": 0.48689326140673095, "learning_rate": 1.0902343055270006e-06, "loss": 0.1519, "step": 3134 }, { "epoch": 0.7063395950094348, "grad_norm": 0.46153458245075457, "learning_rate": 1.0886962429130754e-06, "loss": 0.1368, "step": 3135 }, { "epoch": 0.7065649026952432, "grad_norm": 0.4859316993075991, "learning_rate": 1.0871589639435204e-06, "loss": 0.1458, "step": 3136 }, { "epoch": 0.7067902103810516, "grad_norm": 0.481466474298156, "learning_rate": 1.0856224694719313e-06, "loss": 0.1458, "step": 3137 }, { "epoch": 0.70701551806686, "grad_norm": 0.5198663823698088, "learning_rate": 1.0840867603514648e-06, "loss": 0.1662, "step": 3138 }, { "epoch": 0.7072408257526684, "grad_norm": 0.5043701860625105, "learning_rate": 1.0825518374348442e-06, "loss": 0.1592, "step": 3139 }, { "epoch": 0.707466133438477, "grad_norm": 0.4683284619656246, "learning_rate": 1.0810177015743536e-06, "loss": 0.141, "step": 3140 }, { "epoch": 0.7076914411242854, "grad_norm": 0.4772402808000686, "learning_rate": 1.079484353621842e-06, "loss": 0.1471, "step": 3141 }, { "epoch": 0.7079167488100938, "grad_norm": 0.4826855257837391, "learning_rate": 1.0779517944287216e-06, "loss": 0.1441, "step": 3142 }, { "epoch": 0.7081420564959022, "grad_norm": 0.4435103670159692, "learning_rate": 1.0764200248459633e-06, "loss": 0.1283, "step": 3143 }, { "epoch": 0.7083673641817106, "grad_norm": 0.4574956644323345, "learning_rate": 1.0748890457241037e-06, "loss": 0.1398, "step": 3144 }, { "epoch": 0.7085926718675191, "grad_norm": 0.5063357865724047, "learning_rate": 1.0733588579132365e-06, "loss": 0.1504, "step": 3145 }, { "epoch": 0.7088179795533275, "grad_norm": 0.47469368780329463, "learning_rate": 1.0718294622630188e-06, "loss": 0.1462, "step": 3146 }, { "epoch": 0.7090432872391359, "grad_norm": 0.4680882762833536, "learning_rate": 1.0703008596226692e-06, "loss": 0.1352, "step": 3147 }, { "epoch": 0.7092685949249444, "grad_norm": 0.47703364881096866, "learning_rate": 1.0687730508409594e-06, "loss": 0.157, "step": 3148 }, { "epoch": 0.7094939026107528, "grad_norm": 0.4786367298568394, "learning_rate": 1.0672460367662271e-06, "loss": 0.1422, "step": 3149 }, { "epoch": 0.7097192102965613, "grad_norm": 0.4744643950982306, "learning_rate": 1.065719818246367e-06, "loss": 0.146, "step": 3150 }, { "epoch": 0.7099445179823697, "grad_norm": 0.5116813633903049, "learning_rate": 1.0641943961288298e-06, "loss": 0.1512, "step": 3151 }, { "epoch": 0.7101698256681781, "grad_norm": 0.48955853146110767, "learning_rate": 1.062669771260627e-06, "loss": 0.1552, "step": 3152 }, { "epoch": 0.7103951333539865, "grad_norm": 0.4676410023274031, "learning_rate": 1.0611459444883243e-06, "loss": 0.1434, "step": 3153 }, { "epoch": 0.7106204410397949, "grad_norm": 0.47628413236442235, "learning_rate": 1.0596229166580477e-06, "loss": 0.149, "step": 3154 }, { "epoch": 0.7108457487256034, "grad_norm": 0.4838018330638107, "learning_rate": 1.0581006886154758e-06, "loss": 0.1413, "step": 3155 }, { "epoch": 0.7110710564114119, "grad_norm": 0.4772983055827193, "learning_rate": 1.0565792612058462e-06, "loss": 0.148, "step": 3156 }, { "epoch": 0.7112963640972203, "grad_norm": 0.4804248864966532, "learning_rate": 1.0550586352739519e-06, "loss": 0.147, "step": 3157 }, { "epoch": 0.7115216717830287, "grad_norm": 0.4494515513060731, "learning_rate": 1.0535388116641376e-06, "loss": 0.1391, "step": 3158 }, { "epoch": 0.7117469794688371, "grad_norm": 0.4538366196412275, "learning_rate": 1.0520197912203067e-06, "loss": 0.1226, "step": 3159 }, { "epoch": 0.7119722871546456, "grad_norm": 0.4981133094727563, "learning_rate": 1.050501574785913e-06, "loss": 0.1569, "step": 3160 }, { "epoch": 0.712197594840454, "grad_norm": 0.4939787669612392, "learning_rate": 1.048984163203967e-06, "loss": 0.1529, "step": 3161 }, { "epoch": 0.7124229025262624, "grad_norm": 0.480626233851763, "learning_rate": 1.0474675573170293e-06, "loss": 0.1436, "step": 3162 }, { "epoch": 0.7126482102120708, "grad_norm": 0.47851850089311315, "learning_rate": 1.045951757967215e-06, "loss": 0.1511, "step": 3163 }, { "epoch": 0.7128735178978793, "grad_norm": 0.4821007601563941, "learning_rate": 1.0444367659961927e-06, "loss": 0.1517, "step": 3164 }, { "epoch": 0.7130988255836878, "grad_norm": 0.4508947909552097, "learning_rate": 1.0429225822451792e-06, "loss": 0.1427, "step": 3165 }, { "epoch": 0.7133241332694962, "grad_norm": 0.4908658910207534, "learning_rate": 1.041409207554944e-06, "loss": 0.1469, "step": 3166 }, { "epoch": 0.7135494409553046, "grad_norm": 0.48281352061819466, "learning_rate": 1.0398966427658091e-06, "loss": 0.1463, "step": 3167 }, { "epoch": 0.713774748641113, "grad_norm": 0.5024966516256968, "learning_rate": 1.0383848887176437e-06, "loss": 0.155, "step": 3168 }, { "epoch": 0.7140000563269214, "grad_norm": 0.46731125814676394, "learning_rate": 1.0368739462498704e-06, "loss": 0.1354, "step": 3169 }, { "epoch": 0.7142253640127298, "grad_norm": 0.48861777845760407, "learning_rate": 1.035363816201457e-06, "loss": 0.1386, "step": 3170 }, { "epoch": 0.7144506716985383, "grad_norm": 0.4798593003406232, "learning_rate": 1.033854499410924e-06, "loss": 0.141, "step": 3171 }, { "epoch": 0.7146759793843468, "grad_norm": 0.4951005837529604, "learning_rate": 1.032345996716339e-06, "loss": 0.1376, "step": 3172 }, { "epoch": 0.7149012870701552, "grad_norm": 0.4649460338111168, "learning_rate": 1.030838308955316e-06, "loss": 0.1302, "step": 3173 }, { "epoch": 0.7151265947559636, "grad_norm": 0.5208561892477044, "learning_rate": 1.0293314369650193e-06, "loss": 0.1605, "step": 3174 }, { "epoch": 0.715351902441772, "grad_norm": 0.4446055841305565, "learning_rate": 1.027825381582157e-06, "loss": 0.1237, "step": 3175 }, { "epoch": 0.7155772101275805, "grad_norm": 0.4780739117334455, "learning_rate": 1.0263201436429873e-06, "loss": 0.1475, "step": 3176 }, { "epoch": 0.7158025178133889, "grad_norm": 0.45659031396739413, "learning_rate": 1.0248157239833111e-06, "loss": 0.1351, "step": 3177 }, { "epoch": 0.7160278254991973, "grad_norm": 0.49335546136403297, "learning_rate": 1.0233121234384777e-06, "loss": 0.1482, "step": 3178 }, { "epoch": 0.7162531331850057, "grad_norm": 0.48359513736268783, "learning_rate": 1.0218093428433807e-06, "loss": 0.1496, "step": 3179 }, { "epoch": 0.7164784408708142, "grad_norm": 0.47609665715625343, "learning_rate": 1.0203073830324566e-06, "loss": 0.1498, "step": 3180 }, { "epoch": 0.7167037485566227, "grad_norm": 0.4929326238522347, "learning_rate": 1.0188062448396897e-06, "loss": 0.1454, "step": 3181 }, { "epoch": 0.7169290562424311, "grad_norm": 0.502710848277265, "learning_rate": 1.0173059290986048e-06, "loss": 0.1369, "step": 3182 }, { "epoch": 0.7171543639282395, "grad_norm": 0.5268512619847401, "learning_rate": 1.015806436642271e-06, "loss": 0.1569, "step": 3183 }, { "epoch": 0.7173796716140479, "grad_norm": 0.4880551435141198, "learning_rate": 1.0143077683033017e-06, "loss": 0.1442, "step": 3184 }, { "epoch": 0.7176049792998563, "grad_norm": 0.5003825145945356, "learning_rate": 1.0128099249138502e-06, "loss": 0.1599, "step": 3185 }, { "epoch": 0.7178302869856648, "grad_norm": 0.4862755540205847, "learning_rate": 1.0113129073056149e-06, "loss": 0.1574, "step": 3186 }, { "epoch": 0.7180555946714733, "grad_norm": 0.4802506303500911, "learning_rate": 1.0098167163098319e-06, "loss": 0.1495, "step": 3187 }, { "epoch": 0.7182809023572817, "grad_norm": 0.499625726077395, "learning_rate": 1.008321352757281e-06, "loss": 0.1525, "step": 3188 }, { "epoch": 0.7185062100430901, "grad_norm": 0.48435055671852, "learning_rate": 1.0068268174782833e-06, "loss": 0.1593, "step": 3189 }, { "epoch": 0.7187315177288985, "grad_norm": 0.4921873230317497, "learning_rate": 1.0053331113026962e-06, "loss": 0.1482, "step": 3190 }, { "epoch": 0.718956825414707, "grad_norm": 0.4612564545959869, "learning_rate": 1.0038402350599214e-06, "loss": 0.1421, "step": 3191 }, { "epoch": 0.7191821331005154, "grad_norm": 0.5063759140786678, "learning_rate": 1.002348189578895e-06, "loss": 0.1563, "step": 3192 }, { "epoch": 0.7194074407863238, "grad_norm": 0.46213153113966987, "learning_rate": 1.0008569756880956e-06, "loss": 0.1378, "step": 3193 }, { "epoch": 0.7196327484721322, "grad_norm": 0.4691935929001728, "learning_rate": 9.993665942155395e-07, "loss": 0.1405, "step": 3194 }, { "epoch": 0.7198580561579407, "grad_norm": 0.47725314891400694, "learning_rate": 9.978770459887778e-07, "loss": 0.1449, "step": 3195 }, { "epoch": 0.7200833638437492, "grad_norm": 0.4993885829727644, "learning_rate": 9.963883318349039e-07, "loss": 0.1589, "step": 3196 }, { "epoch": 0.7203086715295576, "grad_norm": 0.47496255926004993, "learning_rate": 9.949004525805423e-07, "loss": 0.1372, "step": 3197 }, { "epoch": 0.720533979215366, "grad_norm": 0.42669134354141913, "learning_rate": 9.934134090518593e-07, "loss": 0.1199, "step": 3198 }, { "epoch": 0.7207592869011744, "grad_norm": 0.5020968589992666, "learning_rate": 9.919272020745529e-07, "loss": 0.1463, "step": 3199 }, { "epoch": 0.7209845945869828, "grad_norm": 0.4591744042346271, "learning_rate": 9.904418324738605e-07, "loss": 0.1406, "step": 3200 }, { "epoch": 0.7212099022727912, "grad_norm": 0.47887507892203496, "learning_rate": 9.889573010745507e-07, "loss": 0.1482, "step": 3201 }, { "epoch": 0.7214352099585997, "grad_norm": 0.4844452159745273, "learning_rate": 9.874736087009285e-07, "loss": 0.1431, "step": 3202 }, { "epoch": 0.7216605176444082, "grad_norm": 0.4680725094888077, "learning_rate": 9.859907561768335e-07, "loss": 0.1436, "step": 3203 }, { "epoch": 0.7218858253302166, "grad_norm": 0.4610780009513288, "learning_rate": 9.84508744325639e-07, "loss": 0.136, "step": 3204 }, { "epoch": 0.722111133016025, "grad_norm": 0.45728936356077726, "learning_rate": 9.830275739702497e-07, "loss": 0.1354, "step": 3205 }, { "epoch": 0.7223364407018334, "grad_norm": 0.4844421080714476, "learning_rate": 9.815472459331061e-07, "loss": 0.1456, "step": 3206 }, { "epoch": 0.7225617483876419, "grad_norm": 0.4486188705271111, "learning_rate": 9.800677610361768e-07, "loss": 0.1382, "step": 3207 }, { "epoch": 0.7227870560734503, "grad_norm": 0.461905229367304, "learning_rate": 9.785891201009667e-07, "loss": 0.1348, "step": 3208 }, { "epoch": 0.7230123637592587, "grad_norm": 0.4851114349292712, "learning_rate": 9.771113239485084e-07, "loss": 0.154, "step": 3209 }, { "epoch": 0.7232376714450671, "grad_norm": 0.5133236201683371, "learning_rate": 9.756343733993679e-07, "loss": 0.1604, "step": 3210 }, { "epoch": 0.7234629791308756, "grad_norm": 0.48957254607104733, "learning_rate": 9.741582692736412e-07, "loss": 0.1438, "step": 3211 }, { "epoch": 0.7236882868166841, "grad_norm": 0.5149125999254558, "learning_rate": 9.726830123909527e-07, "loss": 0.1542, "step": 3212 }, { "epoch": 0.7239135945024925, "grad_norm": 0.4900800650716252, "learning_rate": 9.71208603570459e-07, "loss": 0.1476, "step": 3213 }, { "epoch": 0.7241389021883009, "grad_norm": 0.4618249551389436, "learning_rate": 9.697350436308428e-07, "loss": 0.1448, "step": 3214 }, { "epoch": 0.7243642098741093, "grad_norm": 0.47297321499334205, "learning_rate": 9.68262333390318e-07, "loss": 0.138, "step": 3215 }, { "epoch": 0.7245895175599177, "grad_norm": 0.5044794941673408, "learning_rate": 9.667904736666258e-07, "loss": 0.1594, "step": 3216 }, { "epoch": 0.7248148252457262, "grad_norm": 0.4703026023000923, "learning_rate": 9.653194652770343e-07, "loss": 0.1399, "step": 3217 }, { "epoch": 0.7250401329315346, "grad_norm": 0.45240034715113475, "learning_rate": 9.638493090383408e-07, "loss": 0.1356, "step": 3218 }, { "epoch": 0.7252654406173431, "grad_norm": 0.492964125740421, "learning_rate": 9.623800057668675e-07, "loss": 0.1478, "step": 3219 }, { "epoch": 0.7254907483031515, "grad_norm": 0.4937862053665514, "learning_rate": 9.60911556278463e-07, "loss": 0.1457, "step": 3220 }, { "epoch": 0.7257160559889599, "grad_norm": 0.4745847894532621, "learning_rate": 9.594439613885044e-07, "loss": 0.1424, "step": 3221 }, { "epoch": 0.7259413636747684, "grad_norm": 0.47954034109239907, "learning_rate": 9.579772219118899e-07, "loss": 0.1316, "step": 3222 }, { "epoch": 0.7261666713605768, "grad_norm": 0.5004712882029183, "learning_rate": 9.565113386630482e-07, "loss": 0.1442, "step": 3223 }, { "epoch": 0.7263919790463852, "grad_norm": 0.5055191565679837, "learning_rate": 9.550463124559267e-07, "loss": 0.1499, "step": 3224 }, { "epoch": 0.7266172867321936, "grad_norm": 0.4833525032509096, "learning_rate": 9.535821441040017e-07, "loss": 0.1517, "step": 3225 }, { "epoch": 0.726842594418002, "grad_norm": 0.5025779445176393, "learning_rate": 9.521188344202717e-07, "loss": 0.1567, "step": 3226 }, { "epoch": 0.7270679021038106, "grad_norm": 0.47225884260255974, "learning_rate": 9.506563842172565e-07, "loss": 0.1418, "step": 3227 }, { "epoch": 0.727293209789619, "grad_norm": 0.4845795579559558, "learning_rate": 9.491947943070015e-07, "loss": 0.149, "step": 3228 }, { "epoch": 0.7275185174754274, "grad_norm": 0.47410389383751206, "learning_rate": 9.477340655010717e-07, "loss": 0.1405, "step": 3229 }, { "epoch": 0.7277438251612358, "grad_norm": 0.48915856141275144, "learning_rate": 9.462741986105573e-07, "loss": 0.1544, "step": 3230 }, { "epoch": 0.7279691328470442, "grad_norm": 0.4894249277430562, "learning_rate": 9.448151944460657e-07, "loss": 0.1431, "step": 3231 }, { "epoch": 0.7281944405328526, "grad_norm": 0.45973320661181566, "learning_rate": 9.433570538177289e-07, "loss": 0.1311, "step": 3232 }, { "epoch": 0.7284197482186611, "grad_norm": 0.4927844223585169, "learning_rate": 9.418997775351985e-07, "loss": 0.1597, "step": 3233 }, { "epoch": 0.7286450559044696, "grad_norm": 0.4916102224236364, "learning_rate": 9.404433664076442e-07, "loss": 0.148, "step": 3234 }, { "epoch": 0.728870363590278, "grad_norm": 0.4896074070395624, "learning_rate": 9.389878212437586e-07, "loss": 0.135, "step": 3235 }, { "epoch": 0.7290956712760864, "grad_norm": 0.49522223597253284, "learning_rate": 9.375331428517506e-07, "loss": 0.1433, "step": 3236 }, { "epoch": 0.7293209789618949, "grad_norm": 0.49628918046133014, "learning_rate": 9.360793320393483e-07, "loss": 0.1514, "step": 3237 }, { "epoch": 0.7295462866477033, "grad_norm": 0.4872571475285826, "learning_rate": 9.346263896138e-07, "loss": 0.1531, "step": 3238 }, { "epoch": 0.7297715943335117, "grad_norm": 0.49925623005616143, "learning_rate": 9.33174316381869e-07, "loss": 0.1521, "step": 3239 }, { "epoch": 0.7299969020193201, "grad_norm": 0.47482589032112493, "learning_rate": 9.317231131498383e-07, "loss": 0.1381, "step": 3240 }, { "epoch": 0.7302222097051285, "grad_norm": 0.47642699027713736, "learning_rate": 9.302727807235079e-07, "loss": 0.1583, "step": 3241 }, { "epoch": 0.730447517390937, "grad_norm": 0.5044203986372681, "learning_rate": 9.288233199081914e-07, "loss": 0.1646, "step": 3242 }, { "epoch": 0.7306728250767455, "grad_norm": 0.5059155038015059, "learning_rate": 9.273747315087223e-07, "loss": 0.1577, "step": 3243 }, { "epoch": 0.7308981327625539, "grad_norm": 0.4891834523919038, "learning_rate": 9.259270163294457e-07, "loss": 0.1558, "step": 3244 }, { "epoch": 0.7311234404483623, "grad_norm": 0.4633981735737557, "learning_rate": 9.244801751742258e-07, "loss": 0.1365, "step": 3245 }, { "epoch": 0.7313487481341707, "grad_norm": 0.4900172468624769, "learning_rate": 9.230342088464381e-07, "loss": 0.1487, "step": 3246 }, { "epoch": 0.7315740558199791, "grad_norm": 0.5164669604278073, "learning_rate": 9.215891181489742e-07, "loss": 0.1608, "step": 3247 }, { "epoch": 0.7317993635057876, "grad_norm": 0.47610476192112533, "learning_rate": 9.201449038842403e-07, "loss": 0.1542, "step": 3248 }, { "epoch": 0.732024671191596, "grad_norm": 0.4549458362963086, "learning_rate": 9.187015668541526e-07, "loss": 0.1473, "step": 3249 }, { "epoch": 0.7322499788774045, "grad_norm": 0.48625908990100747, "learning_rate": 9.172591078601448e-07, "loss": 0.1504, "step": 3250 }, { "epoch": 0.7324752865632129, "grad_norm": 0.4926262365479395, "learning_rate": 9.158175277031584e-07, "loss": 0.1512, "step": 3251 }, { "epoch": 0.7327005942490213, "grad_norm": 0.5063579543255717, "learning_rate": 9.143768271836506e-07, "loss": 0.1553, "step": 3252 }, { "epoch": 0.7329259019348298, "grad_norm": 0.4633422630917174, "learning_rate": 9.129370071015886e-07, "loss": 0.1361, "step": 3253 }, { "epoch": 0.7331512096206382, "grad_norm": 0.4408105126180263, "learning_rate": 9.114980682564492e-07, "loss": 0.1259, "step": 3254 }, { "epoch": 0.7333765173064466, "grad_norm": 0.5207729527814229, "learning_rate": 9.100600114472238e-07, "loss": 0.1589, "step": 3255 }, { "epoch": 0.733601824992255, "grad_norm": 0.49598935768061253, "learning_rate": 9.086228374724096e-07, "loss": 0.1538, "step": 3256 }, { "epoch": 0.7338271326780634, "grad_norm": 0.46713653693216217, "learning_rate": 9.071865471300168e-07, "loss": 0.1513, "step": 3257 }, { "epoch": 0.734052440363872, "grad_norm": 0.4755809875962297, "learning_rate": 9.057511412175646e-07, "loss": 0.1422, "step": 3258 }, { "epoch": 0.7342777480496804, "grad_norm": 0.44528902453284236, "learning_rate": 9.043166205320789e-07, "loss": 0.1357, "step": 3259 }, { "epoch": 0.7345030557354888, "grad_norm": 0.48993203656862394, "learning_rate": 9.028829858700974e-07, "loss": 0.1447, "step": 3260 }, { "epoch": 0.7347283634212972, "grad_norm": 0.47710624030120524, "learning_rate": 9.014502380276619e-07, "loss": 0.1413, "step": 3261 }, { "epoch": 0.7349536711071056, "grad_norm": 0.4592113859329267, "learning_rate": 9.000183778003246e-07, "loss": 0.1397, "step": 3262 }, { "epoch": 0.735178978792914, "grad_norm": 0.48027592981079564, "learning_rate": 8.985874059831456e-07, "loss": 0.1465, "step": 3263 }, { "epoch": 0.7354042864787225, "grad_norm": 0.4814714122399688, "learning_rate": 8.971573233706881e-07, "loss": 0.1491, "step": 3264 }, { "epoch": 0.7356295941645309, "grad_norm": 0.4758653935521041, "learning_rate": 8.957281307570254e-07, "loss": 0.1434, "step": 3265 }, { "epoch": 0.7358549018503394, "grad_norm": 0.4637943976042971, "learning_rate": 8.942998289357333e-07, "loss": 0.1425, "step": 3266 }, { "epoch": 0.7360802095361478, "grad_norm": 0.44714252120008274, "learning_rate": 8.928724186998961e-07, "loss": 0.1323, "step": 3267 }, { "epoch": 0.7363055172219563, "grad_norm": 0.5010997935498333, "learning_rate": 8.914459008421e-07, "loss": 0.1555, "step": 3268 }, { "epoch": 0.7365308249077647, "grad_norm": 0.49517423617843487, "learning_rate": 8.900202761544377e-07, "loss": 0.1466, "step": 3269 }, { "epoch": 0.7367561325935731, "grad_norm": 0.48746031060642947, "learning_rate": 8.885955454285078e-07, "loss": 0.1463, "step": 3270 }, { "epoch": 0.7369814402793815, "grad_norm": 0.48248001589906175, "learning_rate": 8.871717094554058e-07, "loss": 0.1444, "step": 3271 }, { "epoch": 0.7372067479651899, "grad_norm": 0.46149440281114745, "learning_rate": 8.857487690257374e-07, "loss": 0.1314, "step": 3272 }, { "epoch": 0.7374320556509983, "grad_norm": 0.5005341710829557, "learning_rate": 8.843267249296086e-07, "loss": 0.1482, "step": 3273 }, { "epoch": 0.7376573633368069, "grad_norm": 0.5015151392851019, "learning_rate": 8.829055779566262e-07, "loss": 0.1512, "step": 3274 }, { "epoch": 0.7378826710226153, "grad_norm": 0.47516804630917214, "learning_rate": 8.814853288959016e-07, "loss": 0.1488, "step": 3275 }, { "epoch": 0.7381079787084237, "grad_norm": 0.47292919626104574, "learning_rate": 8.800659785360444e-07, "loss": 0.1503, "step": 3276 }, { "epoch": 0.7383332863942321, "grad_norm": 0.4899251652160515, "learning_rate": 8.786475276651688e-07, "loss": 0.1421, "step": 3277 }, { "epoch": 0.7385585940800405, "grad_norm": 0.4691813049653209, "learning_rate": 8.772299770708859e-07, "loss": 0.1251, "step": 3278 }, { "epoch": 0.738783901765849, "grad_norm": 0.4690066583371361, "learning_rate": 8.758133275403097e-07, "loss": 0.1537, "step": 3279 }, { "epoch": 0.7390092094516574, "grad_norm": 0.4737472270670779, "learning_rate": 8.743975798600535e-07, "loss": 0.1556, "step": 3280 }, { "epoch": 0.7392345171374658, "grad_norm": 0.5017685432422245, "learning_rate": 8.729827348162278e-07, "loss": 0.1564, "step": 3281 }, { "epoch": 0.7394598248232743, "grad_norm": 0.4872577372479285, "learning_rate": 8.71568793194445e-07, "loss": 0.1489, "step": 3282 }, { "epoch": 0.7396851325090827, "grad_norm": 0.4535495009195591, "learning_rate": 8.701557557798121e-07, "loss": 0.1298, "step": 3283 }, { "epoch": 0.7399104401948912, "grad_norm": 0.4955801827364163, "learning_rate": 8.687436233569375e-07, "loss": 0.1605, "step": 3284 }, { "epoch": 0.7401357478806996, "grad_norm": 0.46483112374305074, "learning_rate": 8.673323967099259e-07, "loss": 0.1441, "step": 3285 }, { "epoch": 0.740361055566508, "grad_norm": 0.48141735604131547, "learning_rate": 8.659220766223778e-07, "loss": 0.1462, "step": 3286 }, { "epoch": 0.7405863632523164, "grad_norm": 0.4862752455378802, "learning_rate": 8.645126638773926e-07, "loss": 0.153, "step": 3287 }, { "epoch": 0.7408116709381248, "grad_norm": 0.46082477554977386, "learning_rate": 8.631041592575643e-07, "loss": 0.1339, "step": 3288 }, { "epoch": 0.7410369786239334, "grad_norm": 0.47467778641399166, "learning_rate": 8.616965635449814e-07, "loss": 0.1545, "step": 3289 }, { "epoch": 0.7412622863097418, "grad_norm": 0.45990095150736854, "learning_rate": 8.602898775212317e-07, "loss": 0.1356, "step": 3290 }, { "epoch": 0.7414875939955502, "grad_norm": 0.48722496822548544, "learning_rate": 8.588841019673938e-07, "loss": 0.1455, "step": 3291 }, { "epoch": 0.7417129016813586, "grad_norm": 0.47337013865038563, "learning_rate": 8.57479237664044e-07, "loss": 0.142, "step": 3292 }, { "epoch": 0.741938209367167, "grad_norm": 0.5024652414346156, "learning_rate": 8.560752853912494e-07, "loss": 0.1448, "step": 3293 }, { "epoch": 0.7421635170529755, "grad_norm": 0.4809913420160176, "learning_rate": 8.546722459285727e-07, "loss": 0.15, "step": 3294 }, { "epoch": 0.7423888247387839, "grad_norm": 0.4999499323219179, "learning_rate": 8.532701200550714e-07, "loss": 0.1481, "step": 3295 }, { "epoch": 0.7426141324245923, "grad_norm": 0.4689592016416926, "learning_rate": 8.518689085492909e-07, "loss": 0.1441, "step": 3296 }, { "epoch": 0.7428394401104008, "grad_norm": 0.46670042652287225, "learning_rate": 8.504686121892741e-07, "loss": 0.15, "step": 3297 }, { "epoch": 0.7430647477962092, "grad_norm": 0.4846860503801346, "learning_rate": 8.490692317525514e-07, "loss": 0.1426, "step": 3298 }, { "epoch": 0.7432900554820177, "grad_norm": 0.4821622091862189, "learning_rate": 8.476707680161486e-07, "loss": 0.1478, "step": 3299 }, { "epoch": 0.7435153631678261, "grad_norm": 0.47393658159990576, "learning_rate": 8.462732217565783e-07, "loss": 0.142, "step": 3300 }, { "epoch": 0.7437406708536345, "grad_norm": 0.4800748838632915, "learning_rate": 8.448765937498471e-07, "loss": 0.147, "step": 3301 }, { "epoch": 0.7439659785394429, "grad_norm": 0.507382353930732, "learning_rate": 8.434808847714512e-07, "loss": 0.1501, "step": 3302 }, { "epoch": 0.7441912862252513, "grad_norm": 0.47714205294667555, "learning_rate": 8.420860955963739e-07, "loss": 0.1503, "step": 3303 }, { "epoch": 0.7444165939110597, "grad_norm": 0.5072068700018229, "learning_rate": 8.406922269990917e-07, "loss": 0.1615, "step": 3304 }, { "epoch": 0.7446419015968683, "grad_norm": 0.4982128659213873, "learning_rate": 8.392992797535666e-07, "loss": 0.1512, "step": 3305 }, { "epoch": 0.7448672092826767, "grad_norm": 0.46834497278773835, "learning_rate": 8.379072546332498e-07, "loss": 0.1386, "step": 3306 }, { "epoch": 0.7450925169684851, "grad_norm": 0.472453143243406, "learning_rate": 8.365161524110823e-07, "loss": 0.1431, "step": 3307 }, { "epoch": 0.7453178246542935, "grad_norm": 0.46279948878360144, "learning_rate": 8.351259738594902e-07, "loss": 0.1479, "step": 3308 }, { "epoch": 0.7455431323401019, "grad_norm": 0.4627044576661592, "learning_rate": 8.337367197503881e-07, "loss": 0.14, "step": 3309 }, { "epoch": 0.7457684400259104, "grad_norm": 0.4819754464823401, "learning_rate": 8.323483908551783e-07, "loss": 0.1452, "step": 3310 }, { "epoch": 0.7459937477117188, "grad_norm": 0.4716373648789507, "learning_rate": 8.309609879447458e-07, "loss": 0.1439, "step": 3311 }, { "epoch": 0.7462190553975272, "grad_norm": 0.43775940967744453, "learning_rate": 8.29574511789466e-07, "loss": 0.1339, "step": 3312 }, { "epoch": 0.7464443630833357, "grad_norm": 0.4797136707395238, "learning_rate": 8.281889631591955e-07, "loss": 0.154, "step": 3313 }, { "epoch": 0.7466696707691441, "grad_norm": 0.4896839562641403, "learning_rate": 8.268043428232798e-07, "loss": 0.162, "step": 3314 }, { "epoch": 0.7468949784549526, "grad_norm": 0.46071254712821796, "learning_rate": 8.254206515505444e-07, "loss": 0.1453, "step": 3315 }, { "epoch": 0.747120286140761, "grad_norm": 0.45393470832573807, "learning_rate": 8.240378901093035e-07, "loss": 0.1347, "step": 3316 }, { "epoch": 0.7473455938265694, "grad_norm": 0.4647022595151276, "learning_rate": 8.22656059267353e-07, "loss": 0.1367, "step": 3317 }, { "epoch": 0.7475709015123778, "grad_norm": 0.4894365793407213, "learning_rate": 8.212751597919708e-07, "loss": 0.1463, "step": 3318 }, { "epoch": 0.7477962091981862, "grad_norm": 0.4874094265856785, "learning_rate": 8.198951924499202e-07, "loss": 0.1533, "step": 3319 }, { "epoch": 0.7480215168839947, "grad_norm": 0.4552551653740835, "learning_rate": 8.185161580074444e-07, "loss": 0.1367, "step": 3320 }, { "epoch": 0.7482468245698032, "grad_norm": 0.4676081851330181, "learning_rate": 8.171380572302712e-07, "loss": 0.1378, "step": 3321 }, { "epoch": 0.7484721322556116, "grad_norm": 0.4982658730940427, "learning_rate": 8.157608908836071e-07, "loss": 0.1536, "step": 3322 }, { "epoch": 0.74869743994142, "grad_norm": 0.4569590223747696, "learning_rate": 8.143846597321425e-07, "loss": 0.1341, "step": 3323 }, { "epoch": 0.7489227476272284, "grad_norm": 0.507077702702224, "learning_rate": 8.130093645400469e-07, "loss": 0.1631, "step": 3324 }, { "epoch": 0.7491480553130369, "grad_norm": 0.4800642788710456, "learning_rate": 8.116350060709696e-07, "loss": 0.1461, "step": 3325 }, { "epoch": 0.7493733629988453, "grad_norm": 0.4446086202012074, "learning_rate": 8.102615850880413e-07, "loss": 0.1246, "step": 3326 }, { "epoch": 0.7495986706846537, "grad_norm": 0.4562022923162509, "learning_rate": 8.088891023538722e-07, "loss": 0.1303, "step": 3327 }, { "epoch": 0.7498239783704621, "grad_norm": 0.5165662372340282, "learning_rate": 8.075175586305492e-07, "loss": 0.1656, "step": 3328 }, { "epoch": 0.7500492860562706, "grad_norm": 0.5027656419852117, "learning_rate": 8.061469546796413e-07, "loss": 0.1646, "step": 3329 }, { "epoch": 0.750274593742079, "grad_norm": 0.47424795152236715, "learning_rate": 8.047772912621921e-07, "loss": 0.147, "step": 3330 }, { "epoch": 0.7504999014278875, "grad_norm": 0.4799229558407996, "learning_rate": 8.034085691387253e-07, "loss": 0.1554, "step": 3331 }, { "epoch": 0.7507252091136959, "grad_norm": 0.44539800876812674, "learning_rate": 8.020407890692419e-07, "loss": 0.1284, "step": 3332 }, { "epoch": 0.7509505167995043, "grad_norm": 0.46596737623628415, "learning_rate": 8.006739518132179e-07, "loss": 0.1373, "step": 3333 }, { "epoch": 0.7511758244853127, "grad_norm": 0.47852726088194664, "learning_rate": 7.993080581296087e-07, "loss": 0.1469, "step": 3334 }, { "epoch": 0.7514011321711211, "grad_norm": 0.46838224015083574, "learning_rate": 7.979431087768424e-07, "loss": 0.1377, "step": 3335 }, { "epoch": 0.7516264398569297, "grad_norm": 0.487296225417356, "learning_rate": 7.96579104512826e-07, "loss": 0.1427, "step": 3336 }, { "epoch": 0.7518517475427381, "grad_norm": 0.46682500251151077, "learning_rate": 7.95216046094939e-07, "loss": 0.1379, "step": 3337 }, { "epoch": 0.7520770552285465, "grad_norm": 0.48515341220916963, "learning_rate": 7.938539342800373e-07, "loss": 0.1438, "step": 3338 }, { "epoch": 0.7523023629143549, "grad_norm": 0.46473632519103775, "learning_rate": 7.92492769824452e-07, "loss": 0.1334, "step": 3339 }, { "epoch": 0.7525276706001633, "grad_norm": 0.4719673542349233, "learning_rate": 7.911325534839851e-07, "loss": 0.1368, "step": 3340 }, { "epoch": 0.7527529782859718, "grad_norm": 0.47032858945280154, "learning_rate": 7.897732860139157e-07, "loss": 0.1367, "step": 3341 }, { "epoch": 0.7529782859717802, "grad_norm": 0.45765166719265943, "learning_rate": 7.884149681689937e-07, "loss": 0.1376, "step": 3342 }, { "epoch": 0.7532035936575886, "grad_norm": 0.47212795628146054, "learning_rate": 7.870576007034414e-07, "loss": 0.1424, "step": 3343 }, { "epoch": 0.7534289013433971, "grad_norm": 0.47987368156781673, "learning_rate": 7.857011843709559e-07, "loss": 0.1429, "step": 3344 }, { "epoch": 0.7536542090292055, "grad_norm": 0.5245624120355394, "learning_rate": 7.843457199247034e-07, "loss": 0.1566, "step": 3345 }, { "epoch": 0.753879516715014, "grad_norm": 0.4965488224022442, "learning_rate": 7.829912081173238e-07, "loss": 0.1427, "step": 3346 }, { "epoch": 0.7541048244008224, "grad_norm": 0.47706440489835394, "learning_rate": 7.816376497009262e-07, "loss": 0.1335, "step": 3347 }, { "epoch": 0.7543301320866308, "grad_norm": 0.48299077652397493, "learning_rate": 7.802850454270913e-07, "loss": 0.1449, "step": 3348 }, { "epoch": 0.7545554397724392, "grad_norm": 0.4741612790797712, "learning_rate": 7.789333960468707e-07, "loss": 0.1445, "step": 3349 }, { "epoch": 0.7547807474582476, "grad_norm": 0.49498587339603634, "learning_rate": 7.775827023107835e-07, "loss": 0.154, "step": 3350 }, { "epoch": 0.755006055144056, "grad_norm": 0.46834623356511995, "learning_rate": 7.762329649688214e-07, "loss": 0.1353, "step": 3351 }, { "epoch": 0.7552313628298646, "grad_norm": 0.4661163814611361, "learning_rate": 7.74884184770441e-07, "loss": 0.1292, "step": 3352 }, { "epoch": 0.755456670515673, "grad_norm": 0.4981050040486079, "learning_rate": 7.735363624645712e-07, "loss": 0.1477, "step": 3353 }, { "epoch": 0.7556819782014814, "grad_norm": 0.4480961010367873, "learning_rate": 7.721894987996076e-07, "loss": 0.1363, "step": 3354 }, { "epoch": 0.7559072858872898, "grad_norm": 0.47967498498134536, "learning_rate": 7.708435945234124e-07, "loss": 0.1431, "step": 3355 }, { "epoch": 0.7561325935730983, "grad_norm": 0.4670216916507628, "learning_rate": 7.694986503833171e-07, "loss": 0.1479, "step": 3356 }, { "epoch": 0.7563579012589067, "grad_norm": 0.515124251928348, "learning_rate": 7.681546671261181e-07, "loss": 0.1624, "step": 3357 }, { "epoch": 0.7565832089447151, "grad_norm": 0.5047300539541966, "learning_rate": 7.668116454980804e-07, "loss": 0.1455, "step": 3358 }, { "epoch": 0.7568085166305235, "grad_norm": 0.5022582248887345, "learning_rate": 7.654695862449327e-07, "loss": 0.1563, "step": 3359 }, { "epoch": 0.757033824316332, "grad_norm": 0.4403365656908437, "learning_rate": 7.641284901118703e-07, "loss": 0.1241, "step": 3360 }, { "epoch": 0.7572591320021405, "grad_norm": 0.46478175233204677, "learning_rate": 7.627883578435555e-07, "loss": 0.1495, "step": 3361 }, { "epoch": 0.7574844396879489, "grad_norm": 0.4680079929371967, "learning_rate": 7.614491901841118e-07, "loss": 0.1385, "step": 3362 }, { "epoch": 0.7577097473737573, "grad_norm": 0.45630374777367133, "learning_rate": 7.601109878771301e-07, "loss": 0.1404, "step": 3363 }, { "epoch": 0.7579350550595657, "grad_norm": 0.48047432753197256, "learning_rate": 7.587737516656651e-07, "loss": 0.1523, "step": 3364 }, { "epoch": 0.7581603627453741, "grad_norm": 0.4754340021655358, "learning_rate": 7.574374822922323e-07, "loss": 0.146, "step": 3365 }, { "epoch": 0.7583856704311825, "grad_norm": 0.4623116504838362, "learning_rate": 7.561021804988141e-07, "loss": 0.1314, "step": 3366 }, { "epoch": 0.758610978116991, "grad_norm": 0.4804463957409756, "learning_rate": 7.547678470268526e-07, "loss": 0.1458, "step": 3367 }, { "epoch": 0.7588362858027995, "grad_norm": 0.4629048572883021, "learning_rate": 7.534344826172546e-07, "loss": 0.1378, "step": 3368 }, { "epoch": 0.7590615934886079, "grad_norm": 0.48475687389213085, "learning_rate": 7.52102088010386e-07, "loss": 0.1471, "step": 3369 }, { "epoch": 0.7592869011744163, "grad_norm": 0.43776274622028544, "learning_rate": 7.507706639460768e-07, "loss": 0.1194, "step": 3370 }, { "epoch": 0.7595122088602247, "grad_norm": 0.49795081371303823, "learning_rate": 7.494402111636179e-07, "loss": 0.1586, "step": 3371 }, { "epoch": 0.7597375165460332, "grad_norm": 0.4772160878473002, "learning_rate": 7.481107304017588e-07, "loss": 0.1391, "step": 3372 }, { "epoch": 0.7599628242318416, "grad_norm": 0.4881633011030423, "learning_rate": 7.467822223987117e-07, "loss": 0.1468, "step": 3373 }, { "epoch": 0.76018813191765, "grad_norm": 0.4832816618224351, "learning_rate": 7.454546878921465e-07, "loss": 0.1383, "step": 3374 }, { "epoch": 0.7604134396034584, "grad_norm": 0.4553465039777067, "learning_rate": 7.441281276191939e-07, "loss": 0.1299, "step": 3375 }, { "epoch": 0.760638747289267, "grad_norm": 0.4563435010027247, "learning_rate": 7.428025423164456e-07, "loss": 0.1317, "step": 3376 }, { "epoch": 0.7608640549750754, "grad_norm": 0.49239236621925697, "learning_rate": 7.414779327199461e-07, "loss": 0.1547, "step": 3377 }, { "epoch": 0.7610893626608838, "grad_norm": 0.501508390358165, "learning_rate": 7.401542995652033e-07, "loss": 0.1506, "step": 3378 }, { "epoch": 0.7613146703466922, "grad_norm": 0.48179929137762706, "learning_rate": 7.388316435871825e-07, "loss": 0.1404, "step": 3379 }, { "epoch": 0.7615399780325006, "grad_norm": 0.4628844742100713, "learning_rate": 7.375099655203033e-07, "loss": 0.1369, "step": 3380 }, { "epoch": 0.761765285718309, "grad_norm": 0.4575415724895742, "learning_rate": 7.361892660984459e-07, "loss": 0.134, "step": 3381 }, { "epoch": 0.7619905934041175, "grad_norm": 0.4445676062794546, "learning_rate": 7.348695460549443e-07, "loss": 0.132, "step": 3382 }, { "epoch": 0.762215901089926, "grad_norm": 0.48156865787255837, "learning_rate": 7.335508061225907e-07, "loss": 0.1482, "step": 3383 }, { "epoch": 0.7624412087757344, "grad_norm": 0.4722120592143978, "learning_rate": 7.322330470336314e-07, "loss": 0.1487, "step": 3384 }, { "epoch": 0.7626665164615428, "grad_norm": 0.45923459135561195, "learning_rate": 7.309162695197692e-07, "loss": 0.1401, "step": 3385 }, { "epoch": 0.7628918241473512, "grad_norm": 0.49513812336272167, "learning_rate": 7.296004743121627e-07, "loss": 0.1434, "step": 3386 }, { "epoch": 0.7631171318331597, "grad_norm": 0.47090809838121395, "learning_rate": 7.28285662141422e-07, "loss": 0.137, "step": 3387 }, { "epoch": 0.7633424395189681, "grad_norm": 0.48067072748376605, "learning_rate": 7.26971833737615e-07, "loss": 0.1552, "step": 3388 }, { "epoch": 0.7635677472047765, "grad_norm": 0.49326359809106446, "learning_rate": 7.256589898302599e-07, "loss": 0.1421, "step": 3389 }, { "epoch": 0.7637930548905849, "grad_norm": 0.4709260975136618, "learning_rate": 7.243471311483322e-07, "loss": 0.1373, "step": 3390 }, { "epoch": 0.7640183625763934, "grad_norm": 0.46602218525293615, "learning_rate": 7.230362584202557e-07, "loss": 0.1375, "step": 3391 }, { "epoch": 0.7642436702622019, "grad_norm": 0.488504880542589, "learning_rate": 7.217263723739107e-07, "loss": 0.1434, "step": 3392 }, { "epoch": 0.7644689779480103, "grad_norm": 0.45585994244897526, "learning_rate": 7.204174737366293e-07, "loss": 0.1358, "step": 3393 }, { "epoch": 0.7646942856338187, "grad_norm": 0.4584522893893621, "learning_rate": 7.191095632351908e-07, "loss": 0.1428, "step": 3394 }, { "epoch": 0.7649195933196271, "grad_norm": 0.4968819934814164, "learning_rate": 7.178026415958311e-07, "loss": 0.1487, "step": 3395 }, { "epoch": 0.7651449010054355, "grad_norm": 0.5065663037710753, "learning_rate": 7.164967095442357e-07, "loss": 0.163, "step": 3396 }, { "epoch": 0.765370208691244, "grad_norm": 0.487232527516781, "learning_rate": 7.151917678055384e-07, "loss": 0.1492, "step": 3397 }, { "epoch": 0.7655955163770524, "grad_norm": 0.4678617657264611, "learning_rate": 7.138878171043262e-07, "loss": 0.1381, "step": 3398 }, { "epoch": 0.7658208240628609, "grad_norm": 0.49173600814669405, "learning_rate": 7.125848581646327e-07, "loss": 0.1566, "step": 3399 }, { "epoch": 0.7660461317486693, "grad_norm": 0.4726487112124558, "learning_rate": 7.112828917099438e-07, "loss": 0.137, "step": 3400 }, { "epoch": 0.7662714394344777, "grad_norm": 0.4680839672704657, "learning_rate": 7.099819184631929e-07, "loss": 0.1407, "step": 3401 }, { "epoch": 0.7664967471202861, "grad_norm": 0.4714623274461953, "learning_rate": 7.086819391467612e-07, "loss": 0.1445, "step": 3402 }, { "epoch": 0.7667220548060946, "grad_norm": 0.4985792687519659, "learning_rate": 7.073829544824795e-07, "loss": 0.1473, "step": 3403 }, { "epoch": 0.766947362491903, "grad_norm": 0.46973950135993436, "learning_rate": 7.060849651916244e-07, "loss": 0.1389, "step": 3404 }, { "epoch": 0.7671726701777114, "grad_norm": 0.48334204954281296, "learning_rate": 7.047879719949227e-07, "loss": 0.1414, "step": 3405 }, { "epoch": 0.7673979778635198, "grad_norm": 0.46175218020844494, "learning_rate": 7.034919756125447e-07, "loss": 0.1394, "step": 3406 }, { "epoch": 0.7676232855493283, "grad_norm": 0.49410571707078754, "learning_rate": 7.021969767641096e-07, "loss": 0.1423, "step": 3407 }, { "epoch": 0.7678485932351368, "grad_norm": 0.501188825151817, "learning_rate": 7.009029761686825e-07, "loss": 0.1496, "step": 3408 }, { "epoch": 0.7680739009209452, "grad_norm": 0.4793266257229126, "learning_rate": 6.996099745447726e-07, "loss": 0.1407, "step": 3409 }, { "epoch": 0.7682992086067536, "grad_norm": 0.5061394233782135, "learning_rate": 6.98317972610337e-07, "loss": 0.1578, "step": 3410 }, { "epoch": 0.768524516292562, "grad_norm": 0.4338884896826393, "learning_rate": 6.970269710827754e-07, "loss": 0.1235, "step": 3411 }, { "epoch": 0.7687498239783704, "grad_norm": 0.45937831755042746, "learning_rate": 6.957369706789319e-07, "loss": 0.143, "step": 3412 }, { "epoch": 0.7689751316641789, "grad_norm": 0.47034142663055073, "learning_rate": 6.944479721150971e-07, "loss": 0.1359, "step": 3413 }, { "epoch": 0.7692004393499873, "grad_norm": 0.4960066484738008, "learning_rate": 6.931599761070027e-07, "loss": 0.1539, "step": 3414 }, { "epoch": 0.7694257470357958, "grad_norm": 0.4709523524894798, "learning_rate": 6.91872983369826e-07, "loss": 0.1504, "step": 3415 }, { "epoch": 0.7696510547216042, "grad_norm": 0.47377291373340263, "learning_rate": 6.905869946181848e-07, "loss": 0.1488, "step": 3416 }, { "epoch": 0.7698763624074126, "grad_norm": 0.47037382567724695, "learning_rate": 6.893020105661416e-07, "loss": 0.1397, "step": 3417 }, { "epoch": 0.7701016700932211, "grad_norm": 0.4685633998550794, "learning_rate": 6.880180319272006e-07, "loss": 0.1419, "step": 3418 }, { "epoch": 0.7703269777790295, "grad_norm": 0.4907543802869892, "learning_rate": 6.867350594143058e-07, "loss": 0.1568, "step": 3419 }, { "epoch": 0.7705522854648379, "grad_norm": 0.49679497792993677, "learning_rate": 6.854530937398459e-07, "loss": 0.1487, "step": 3420 }, { "epoch": 0.7707775931506463, "grad_norm": 0.4654299885121034, "learning_rate": 6.841721356156466e-07, "loss": 0.1495, "step": 3421 }, { "epoch": 0.7710029008364547, "grad_norm": 0.4374517267903433, "learning_rate": 6.828921857529774e-07, "loss": 0.1311, "step": 3422 }, { "epoch": 0.7712282085222633, "grad_norm": 0.4597646748074688, "learning_rate": 6.816132448625474e-07, "loss": 0.1373, "step": 3423 }, { "epoch": 0.7714535162080717, "grad_norm": 0.47486822261268397, "learning_rate": 6.803353136545033e-07, "loss": 0.1399, "step": 3424 }, { "epoch": 0.7716788238938801, "grad_norm": 0.48137920510488824, "learning_rate": 6.790583928384339e-07, "loss": 0.1492, "step": 3425 }, { "epoch": 0.7719041315796885, "grad_norm": 0.5092249833274699, "learning_rate": 6.777824831233645e-07, "loss": 0.156, "step": 3426 }, { "epoch": 0.7721294392654969, "grad_norm": 0.4791475636794947, "learning_rate": 6.765075852177619e-07, "loss": 0.1417, "step": 3427 }, { "epoch": 0.7723547469513053, "grad_norm": 0.49452899112181636, "learning_rate": 6.752336998295281e-07, "loss": 0.1488, "step": 3428 }, { "epoch": 0.7725800546371138, "grad_norm": 0.48047913950608484, "learning_rate": 6.739608276660037e-07, "loss": 0.147, "step": 3429 }, { "epoch": 0.7728053623229223, "grad_norm": 0.4724754669670758, "learning_rate": 6.726889694339689e-07, "loss": 0.1433, "step": 3430 }, { "epoch": 0.7730306700087307, "grad_norm": 0.503636873558274, "learning_rate": 6.714181258396371e-07, "loss": 0.1652, "step": 3431 }, { "epoch": 0.7732559776945391, "grad_norm": 0.50474994731612, "learning_rate": 6.701482975886617e-07, "loss": 0.1597, "step": 3432 }, { "epoch": 0.7734812853803475, "grad_norm": 0.5076282017497796, "learning_rate": 6.688794853861316e-07, "loss": 0.1538, "step": 3433 }, { "epoch": 0.773706593066156, "grad_norm": 0.48011437833845344, "learning_rate": 6.676116899365692e-07, "loss": 0.1463, "step": 3434 }, { "epoch": 0.7739319007519644, "grad_norm": 0.4666723449374404, "learning_rate": 6.663449119439358e-07, "loss": 0.1304, "step": 3435 }, { "epoch": 0.7741572084377728, "grad_norm": 0.47606931232813704, "learning_rate": 6.650791521116243e-07, "loss": 0.1344, "step": 3436 }, { "epoch": 0.7743825161235812, "grad_norm": 0.4792947664321174, "learning_rate": 6.638144111424655e-07, "loss": 0.1448, "step": 3437 }, { "epoch": 0.7746078238093898, "grad_norm": 0.483992588918863, "learning_rate": 6.625506897387215e-07, "loss": 0.1347, "step": 3438 }, { "epoch": 0.7748331314951982, "grad_norm": 0.49226446774811605, "learning_rate": 6.612879886020907e-07, "loss": 0.1524, "step": 3439 }, { "epoch": 0.7750584391810066, "grad_norm": 0.4876555810986814, "learning_rate": 6.600263084337041e-07, "loss": 0.1416, "step": 3440 }, { "epoch": 0.775283746866815, "grad_norm": 0.4879932793568683, "learning_rate": 6.587656499341247e-07, "loss": 0.1485, "step": 3441 }, { "epoch": 0.7755090545526234, "grad_norm": 0.4637289855714098, "learning_rate": 6.575060138033504e-07, "loss": 0.1369, "step": 3442 }, { "epoch": 0.7757343622384318, "grad_norm": 0.462783115914039, "learning_rate": 6.562474007408087e-07, "loss": 0.1403, "step": 3443 }, { "epoch": 0.7759596699242403, "grad_norm": 0.45701063708272344, "learning_rate": 6.549898114453615e-07, "loss": 0.1356, "step": 3444 }, { "epoch": 0.7761849776100487, "grad_norm": 0.4885466428022282, "learning_rate": 6.537332466153018e-07, "loss": 0.1458, "step": 3445 }, { "epoch": 0.7764102852958572, "grad_norm": 0.4629424193330636, "learning_rate": 6.524777069483526e-07, "loss": 0.1437, "step": 3446 }, { "epoch": 0.7766355929816656, "grad_norm": 0.5012797628936799, "learning_rate": 6.512231931416674e-07, "loss": 0.1646, "step": 3447 }, { "epoch": 0.776860900667474, "grad_norm": 0.4904692323067029, "learning_rate": 6.499697058918326e-07, "loss": 0.1428, "step": 3448 }, { "epoch": 0.7770862083532825, "grad_norm": 0.4813223757776377, "learning_rate": 6.487172458948612e-07, "loss": 0.1489, "step": 3449 }, { "epoch": 0.7773115160390909, "grad_norm": 0.49212730316398134, "learning_rate": 6.474658138461992e-07, "loss": 0.1523, "step": 3450 }, { "epoch": 0.7775368237248993, "grad_norm": 0.4834210288352534, "learning_rate": 6.462154104407187e-07, "loss": 0.1464, "step": 3451 }, { "epoch": 0.7777621314107077, "grad_norm": 0.44844795640795554, "learning_rate": 6.449660363727236e-07, "loss": 0.1278, "step": 3452 }, { "epoch": 0.7779874390965161, "grad_norm": 0.4713579351940081, "learning_rate": 6.437176923359434e-07, "loss": 0.1378, "step": 3453 }, { "epoch": 0.7782127467823247, "grad_norm": 0.5120717655518697, "learning_rate": 6.424703790235374e-07, "loss": 0.1464, "step": 3454 }, { "epoch": 0.7784380544681331, "grad_norm": 0.48508407218751737, "learning_rate": 6.41224097128093e-07, "loss": 0.146, "step": 3455 }, { "epoch": 0.7786633621539415, "grad_norm": 0.5042237917237723, "learning_rate": 6.399788473416229e-07, "loss": 0.1628, "step": 3456 }, { "epoch": 0.7788886698397499, "grad_norm": 0.491435891395647, "learning_rate": 6.387346303555691e-07, "loss": 0.1456, "step": 3457 }, { "epoch": 0.7791139775255583, "grad_norm": 0.5030006112904724, "learning_rate": 6.374914468607976e-07, "loss": 0.1483, "step": 3458 }, { "epoch": 0.7793392852113668, "grad_norm": 0.4831057771694969, "learning_rate": 6.362492975476033e-07, "loss": 0.1546, "step": 3459 }, { "epoch": 0.7795645928971752, "grad_norm": 0.48384751710637675, "learning_rate": 6.35008183105704e-07, "loss": 0.1525, "step": 3460 }, { "epoch": 0.7797899005829836, "grad_norm": 0.4526387079979141, "learning_rate": 6.337681042242447e-07, "loss": 0.1344, "step": 3461 }, { "epoch": 0.7800152082687921, "grad_norm": 0.4617928977357517, "learning_rate": 6.325290615917961e-07, "loss": 0.1385, "step": 3462 }, { "epoch": 0.7802405159546005, "grad_norm": 0.5171721464243675, "learning_rate": 6.312910558963505e-07, "loss": 0.1577, "step": 3463 }, { "epoch": 0.780465823640409, "grad_norm": 0.4784660703129677, "learning_rate": 6.300540878253286e-07, "loss": 0.1531, "step": 3464 }, { "epoch": 0.7806911313262174, "grad_norm": 0.4828939193700581, "learning_rate": 6.288181580655709e-07, "loss": 0.1401, "step": 3465 }, { "epoch": 0.7809164390120258, "grad_norm": 0.47170142591052516, "learning_rate": 6.27583267303343e-07, "loss": 0.1431, "step": 3466 }, { "epoch": 0.7811417466978342, "grad_norm": 0.4754041163728465, "learning_rate": 6.263494162243352e-07, "loss": 0.1525, "step": 3467 }, { "epoch": 0.7813670543836426, "grad_norm": 0.5184205396388687, "learning_rate": 6.251166055136573e-07, "loss": 0.1569, "step": 3468 }, { "epoch": 0.781592362069451, "grad_norm": 0.44971934179254847, "learning_rate": 6.238848358558439e-07, "loss": 0.1377, "step": 3469 }, { "epoch": 0.7818176697552596, "grad_norm": 0.4819646264780796, "learning_rate": 6.226541079348517e-07, "loss": 0.147, "step": 3470 }, { "epoch": 0.782042977441068, "grad_norm": 0.48149851981924063, "learning_rate": 6.214244224340563e-07, "loss": 0.1371, "step": 3471 }, { "epoch": 0.7822682851268764, "grad_norm": 0.475720575596046, "learning_rate": 6.201957800362579e-07, "loss": 0.141, "step": 3472 }, { "epoch": 0.7824935928126848, "grad_norm": 0.47487728061467577, "learning_rate": 6.189681814236742e-07, "loss": 0.1431, "step": 3473 }, { "epoch": 0.7827189004984932, "grad_norm": 0.48977760168102874, "learning_rate": 6.177416272779468e-07, "loss": 0.1526, "step": 3474 }, { "epoch": 0.7829442081843017, "grad_norm": 0.4541128559164801, "learning_rate": 6.165161182801336e-07, "loss": 0.1338, "step": 3475 }, { "epoch": 0.7831695158701101, "grad_norm": 0.4974775276030226, "learning_rate": 6.152916551107149e-07, "loss": 0.1489, "step": 3476 }, { "epoch": 0.7833948235559185, "grad_norm": 0.47432347761436366, "learning_rate": 6.140682384495902e-07, "loss": 0.1418, "step": 3477 }, { "epoch": 0.783620131241727, "grad_norm": 0.4693783520677733, "learning_rate": 6.12845868976076e-07, "loss": 0.1478, "step": 3478 }, { "epoch": 0.7838454389275354, "grad_norm": 0.45579060670234034, "learning_rate": 6.116245473689094e-07, "loss": 0.1404, "step": 3479 }, { "epoch": 0.7840707466133439, "grad_norm": 0.4825732781016127, "learning_rate": 6.104042743062439e-07, "loss": 0.1557, "step": 3480 }, { "epoch": 0.7842960542991523, "grad_norm": 0.471553522747966, "learning_rate": 6.091850504656527e-07, "loss": 0.1448, "step": 3481 }, { "epoch": 0.7845213619849607, "grad_norm": 0.4573226658022487, "learning_rate": 6.079668765241248e-07, "loss": 0.1273, "step": 3482 }, { "epoch": 0.7847466696707691, "grad_norm": 0.5068814042932546, "learning_rate": 6.06749753158066e-07, "loss": 0.1488, "step": 3483 }, { "epoch": 0.7849719773565775, "grad_norm": 0.46266260496163186, "learning_rate": 6.05533681043301e-07, "loss": 0.1481, "step": 3484 }, { "epoch": 0.7851972850423861, "grad_norm": 0.47087969493676185, "learning_rate": 6.04318660855068e-07, "loss": 0.1373, "step": 3485 }, { "epoch": 0.7854225927281945, "grad_norm": 0.46922056971343673, "learning_rate": 6.031046932680229e-07, "loss": 0.1436, "step": 3486 }, { "epoch": 0.7856479004140029, "grad_norm": 0.5261567511445522, "learning_rate": 6.018917789562372e-07, "loss": 0.1581, "step": 3487 }, { "epoch": 0.7858732080998113, "grad_norm": 0.4666125182311981, "learning_rate": 6.006799185931964e-07, "loss": 0.137, "step": 3488 }, { "epoch": 0.7860985157856197, "grad_norm": 0.48593232146737003, "learning_rate": 5.994691128518019e-07, "loss": 0.1418, "step": 3489 }, { "epoch": 0.7863238234714282, "grad_norm": 0.5015236294415071, "learning_rate": 5.982593624043682e-07, "loss": 0.1542, "step": 3490 }, { "epoch": 0.7865491311572366, "grad_norm": 0.47338585550849926, "learning_rate": 5.970506679226249e-07, "loss": 0.1461, "step": 3491 }, { "epoch": 0.786774438843045, "grad_norm": 0.47921247062245814, "learning_rate": 5.958430300777157e-07, "loss": 0.1438, "step": 3492 }, { "epoch": 0.7869997465288535, "grad_norm": 0.46779328934505154, "learning_rate": 5.94636449540196e-07, "loss": 0.1439, "step": 3493 }, { "epoch": 0.7872250542146619, "grad_norm": 0.48007232443930653, "learning_rate": 5.934309269800359e-07, "loss": 0.1523, "step": 3494 }, { "epoch": 0.7874503619004704, "grad_norm": 0.46819545489036524, "learning_rate": 5.922264630666161e-07, "loss": 0.1396, "step": 3495 }, { "epoch": 0.7876756695862788, "grad_norm": 0.4786971066876147, "learning_rate": 5.910230584687316e-07, "loss": 0.1465, "step": 3496 }, { "epoch": 0.7879009772720872, "grad_norm": 0.47295526960505263, "learning_rate": 5.898207138545867e-07, "loss": 0.1441, "step": 3497 }, { "epoch": 0.7881262849578956, "grad_norm": 0.4953715652067572, "learning_rate": 5.886194298917994e-07, "loss": 0.1446, "step": 3498 }, { "epoch": 0.788351592643704, "grad_norm": 0.48175106893532155, "learning_rate": 5.874192072473995e-07, "loss": 0.1481, "step": 3499 }, { "epoch": 0.7885769003295124, "grad_norm": 0.4930876648821759, "learning_rate": 5.862200465878228e-07, "loss": 0.148, "step": 3500 }, { "epoch": 0.7885769003295124, "eval_loss": 0.14415566623210907, "eval_runtime": 56.9444, "eval_samples_per_second": 50.4, "eval_steps_per_second": 6.304, "step": 3500 }, { "epoch": 0.788802208015321, "grad_norm": 0.4744664460269437, "learning_rate": 5.850219485789199e-07, "loss": 0.1536, "step": 3501 }, { "epoch": 0.7890275157011294, "grad_norm": 0.4700558325774627, "learning_rate": 5.838249138859509e-07, "loss": 0.1479, "step": 3502 }, { "epoch": 0.7892528233869378, "grad_norm": 0.47987032673527097, "learning_rate": 5.826289431735832e-07, "loss": 0.1479, "step": 3503 }, { "epoch": 0.7894781310727462, "grad_norm": 0.47586202385382154, "learning_rate": 5.814340371058957e-07, "loss": 0.1392, "step": 3504 }, { "epoch": 0.7897034387585546, "grad_norm": 0.45872762028622155, "learning_rate": 5.802401963463741e-07, "loss": 0.1366, "step": 3505 }, { "epoch": 0.7899287464443631, "grad_norm": 0.4999088673800423, "learning_rate": 5.79047421557915e-07, "loss": 0.1552, "step": 3506 }, { "epoch": 0.7901540541301715, "grad_norm": 0.488380827577179, "learning_rate": 5.778557134028207e-07, "loss": 0.1455, "step": 3507 }, { "epoch": 0.7903793618159799, "grad_norm": 0.4821400153281917, "learning_rate": 5.766650725428027e-07, "loss": 0.1451, "step": 3508 }, { "epoch": 0.7906046695017884, "grad_norm": 0.44518148929688545, "learning_rate": 5.754754996389799e-07, "loss": 0.1244, "step": 3509 }, { "epoch": 0.7908299771875968, "grad_norm": 0.49627472177673576, "learning_rate": 5.742869953518773e-07, "loss": 0.1621, "step": 3510 }, { "epoch": 0.7910552848734053, "grad_norm": 0.4709678650005438, "learning_rate": 5.730995603414274e-07, "loss": 0.1398, "step": 3511 }, { "epoch": 0.7912805925592137, "grad_norm": 0.4995264521273658, "learning_rate": 5.719131952669679e-07, "loss": 0.1573, "step": 3512 }, { "epoch": 0.7915059002450221, "grad_norm": 0.4935413382590102, "learning_rate": 5.707279007872435e-07, "loss": 0.1443, "step": 3513 }, { "epoch": 0.7917312079308305, "grad_norm": 0.46811740322179435, "learning_rate": 5.695436775604049e-07, "loss": 0.1436, "step": 3514 }, { "epoch": 0.7919565156166389, "grad_norm": 0.47718189498348085, "learning_rate": 5.683605262440056e-07, "loss": 0.1478, "step": 3515 }, { "epoch": 0.7921818233024474, "grad_norm": 0.4513705321571012, "learning_rate": 5.671784474950068e-07, "loss": 0.1283, "step": 3516 }, { "epoch": 0.7924071309882559, "grad_norm": 0.48768074675572204, "learning_rate": 5.659974419697723e-07, "loss": 0.1492, "step": 3517 }, { "epoch": 0.7926324386740643, "grad_norm": 0.47371788564821754, "learning_rate": 5.648175103240694e-07, "loss": 0.142, "step": 3518 }, { "epoch": 0.7928577463598727, "grad_norm": 0.48134969198220917, "learning_rate": 5.636386532130717e-07, "loss": 0.1554, "step": 3519 }, { "epoch": 0.7930830540456811, "grad_norm": 0.47825140693570056, "learning_rate": 5.624608712913531e-07, "loss": 0.1392, "step": 3520 }, { "epoch": 0.7933083617314896, "grad_norm": 0.4958972676890111, "learning_rate": 5.612841652128939e-07, "loss": 0.1534, "step": 3521 }, { "epoch": 0.793533669417298, "grad_norm": 0.48208591825823033, "learning_rate": 5.601085356310734e-07, "loss": 0.1321, "step": 3522 }, { "epoch": 0.7937589771031064, "grad_norm": 0.48139189257298015, "learning_rate": 5.589339831986754e-07, "loss": 0.1535, "step": 3523 }, { "epoch": 0.7939842847889148, "grad_norm": 0.5076369107122871, "learning_rate": 5.577605085678858e-07, "loss": 0.1548, "step": 3524 }, { "epoch": 0.7942095924747233, "grad_norm": 0.4814164020663355, "learning_rate": 5.565881123902903e-07, "loss": 0.1352, "step": 3525 }, { "epoch": 0.7944349001605318, "grad_norm": 0.4949034169556831, "learning_rate": 5.554167953168779e-07, "loss": 0.1441, "step": 3526 }, { "epoch": 0.7946602078463402, "grad_norm": 0.489097629639028, "learning_rate": 5.542465579980361e-07, "loss": 0.1508, "step": 3527 }, { "epoch": 0.7948855155321486, "grad_norm": 0.4954683894495445, "learning_rate": 5.530774010835552e-07, "loss": 0.1521, "step": 3528 }, { "epoch": 0.795110823217957, "grad_norm": 0.4558862088107603, "learning_rate": 5.519093252226232e-07, "loss": 0.1352, "step": 3529 }, { "epoch": 0.7953361309037654, "grad_norm": 0.4767572914325259, "learning_rate": 5.507423310638299e-07, "loss": 0.1439, "step": 3530 }, { "epoch": 0.7955614385895738, "grad_norm": 0.4933828683243226, "learning_rate": 5.495764192551642e-07, "loss": 0.1428, "step": 3531 }, { "epoch": 0.7957867462753824, "grad_norm": 0.4790258722048279, "learning_rate": 5.48411590444012e-07, "loss": 0.1498, "step": 3532 }, { "epoch": 0.7960120539611908, "grad_norm": 0.48235971021995777, "learning_rate": 5.47247845277161e-07, "loss": 0.1502, "step": 3533 }, { "epoch": 0.7962373616469992, "grad_norm": 0.48860625321354456, "learning_rate": 5.460851844007945e-07, "loss": 0.1597, "step": 3534 }, { "epoch": 0.7964626693328076, "grad_norm": 0.43448018095407626, "learning_rate": 5.449236084604942e-07, "loss": 0.1199, "step": 3535 }, { "epoch": 0.796687977018616, "grad_norm": 0.4937221929181501, "learning_rate": 5.437631181012415e-07, "loss": 0.1469, "step": 3536 }, { "epoch": 0.7969132847044245, "grad_norm": 0.47705394970016807, "learning_rate": 5.426037139674117e-07, "loss": 0.136, "step": 3537 }, { "epoch": 0.7971385923902329, "grad_norm": 0.4931225755816937, "learning_rate": 5.414453967027797e-07, "loss": 0.1495, "step": 3538 }, { "epoch": 0.7973639000760413, "grad_norm": 0.5226037448579405, "learning_rate": 5.402881669505164e-07, "loss": 0.16, "step": 3539 }, { "epoch": 0.7975892077618498, "grad_norm": 0.4760864953251087, "learning_rate": 5.391320253531868e-07, "loss": 0.1394, "step": 3540 }, { "epoch": 0.7978145154476582, "grad_norm": 0.4896281847697234, "learning_rate": 5.37976972552755e-07, "loss": 0.1512, "step": 3541 }, { "epoch": 0.7980398231334667, "grad_norm": 0.4619253255008005, "learning_rate": 5.368230091905774e-07, "loss": 0.1391, "step": 3542 }, { "epoch": 0.7982651308192751, "grad_norm": 0.4886674760091969, "learning_rate": 5.356701359074076e-07, "loss": 0.1428, "step": 3543 }, { "epoch": 0.7984904385050835, "grad_norm": 0.4714475722790928, "learning_rate": 5.345183533433926e-07, "loss": 0.1422, "step": 3544 }, { "epoch": 0.7987157461908919, "grad_norm": 0.46688084342141173, "learning_rate": 5.333676621380746e-07, "loss": 0.1326, "step": 3545 }, { "epoch": 0.7989410538767003, "grad_norm": 0.46759152413970434, "learning_rate": 5.322180629303902e-07, "loss": 0.1349, "step": 3546 }, { "epoch": 0.7991663615625088, "grad_norm": 0.4718643364907308, "learning_rate": 5.310695563586676e-07, "loss": 0.1363, "step": 3547 }, { "epoch": 0.7993916692483173, "grad_norm": 0.48857271578568195, "learning_rate": 5.299221430606313e-07, "loss": 0.1488, "step": 3548 }, { "epoch": 0.7996169769341257, "grad_norm": 0.4940060034052037, "learning_rate": 5.287758236733956e-07, "loss": 0.1427, "step": 3549 }, { "epoch": 0.7998422846199341, "grad_norm": 0.45653832212422973, "learning_rate": 5.276305988334701e-07, "loss": 0.1312, "step": 3550 }, { "epoch": 0.8000675923057425, "grad_norm": 0.4807674425195621, "learning_rate": 5.264864691767551e-07, "loss": 0.1359, "step": 3551 }, { "epoch": 0.800292899991551, "grad_norm": 0.49869397116767594, "learning_rate": 5.253434353385422e-07, "loss": 0.1508, "step": 3552 }, { "epoch": 0.8005182076773594, "grad_norm": 0.49196778717615613, "learning_rate": 5.242014979535173e-07, "loss": 0.1441, "step": 3553 }, { "epoch": 0.8007435153631678, "grad_norm": 0.4646584950573559, "learning_rate": 5.23060657655754e-07, "loss": 0.1283, "step": 3554 }, { "epoch": 0.8009688230489762, "grad_norm": 0.4672535299105578, "learning_rate": 5.219209150787189e-07, "loss": 0.1353, "step": 3555 }, { "epoch": 0.8011941307347847, "grad_norm": 0.4700008614184278, "learning_rate": 5.207822708552695e-07, "loss": 0.1387, "step": 3556 }, { "epoch": 0.8014194384205932, "grad_norm": 0.4629867095595067, "learning_rate": 5.196447256176509e-07, "loss": 0.1364, "step": 3557 }, { "epoch": 0.8016447461064016, "grad_norm": 0.44529648280702133, "learning_rate": 5.185082799975013e-07, "loss": 0.1351, "step": 3558 }, { "epoch": 0.80187005379221, "grad_norm": 0.494508296543035, "learning_rate": 5.173729346258452e-07, "loss": 0.1463, "step": 3559 }, { "epoch": 0.8020953614780184, "grad_norm": 0.48079362147959903, "learning_rate": 5.162386901330977e-07, "loss": 0.1515, "step": 3560 }, { "epoch": 0.8023206691638268, "grad_norm": 0.46475195640142025, "learning_rate": 5.151055471490638e-07, "loss": 0.1438, "step": 3561 }, { "epoch": 0.8025459768496352, "grad_norm": 0.45803423853802877, "learning_rate": 5.139735063029338e-07, "loss": 0.1306, "step": 3562 }, { "epoch": 0.8027712845354437, "grad_norm": 0.4891276069352578, "learning_rate": 5.128425682232893e-07, "loss": 0.1444, "step": 3563 }, { "epoch": 0.8029965922212522, "grad_norm": 0.4673632430066725, "learning_rate": 5.117127335380967e-07, "loss": 0.1419, "step": 3564 }, { "epoch": 0.8032218999070606, "grad_norm": 0.4530586340062168, "learning_rate": 5.105840028747125e-07, "loss": 0.1365, "step": 3565 }, { "epoch": 0.803447207592869, "grad_norm": 0.43786476538662233, "learning_rate": 5.094563768598773e-07, "loss": 0.1258, "step": 3566 }, { "epoch": 0.8036725152786774, "grad_norm": 0.4740710531754334, "learning_rate": 5.083298561197205e-07, "loss": 0.138, "step": 3567 }, { "epoch": 0.8038978229644859, "grad_norm": 0.45026579308376163, "learning_rate": 5.07204441279758e-07, "loss": 0.1317, "step": 3568 }, { "epoch": 0.8041231306502943, "grad_norm": 0.4780330236506349, "learning_rate": 5.060801329648896e-07, "loss": 0.1354, "step": 3569 }, { "epoch": 0.8043484383361027, "grad_norm": 0.4699443411669041, "learning_rate": 5.049569317994013e-07, "loss": 0.1395, "step": 3570 }, { "epoch": 0.8045737460219111, "grad_norm": 0.47850393654361223, "learning_rate": 5.038348384069663e-07, "loss": 0.1463, "step": 3571 }, { "epoch": 0.8047990537077196, "grad_norm": 0.4504379700975781, "learning_rate": 5.027138534106399e-07, "loss": 0.1276, "step": 3572 }, { "epoch": 0.8050243613935281, "grad_norm": 0.4731891026940293, "learning_rate": 5.015939774328643e-07, "loss": 0.1402, "step": 3573 }, { "epoch": 0.8052496690793365, "grad_norm": 0.4672426623202153, "learning_rate": 5.004752110954642e-07, "loss": 0.1434, "step": 3574 }, { "epoch": 0.8054749767651449, "grad_norm": 0.4660024619598315, "learning_rate": 4.993575550196495e-07, "loss": 0.1367, "step": 3575 }, { "epoch": 0.8057002844509533, "grad_norm": 0.5192953141840139, "learning_rate": 4.982410098260118e-07, "loss": 0.164, "step": 3576 }, { "epoch": 0.8059255921367617, "grad_norm": 0.4927213535021479, "learning_rate": 4.971255761345278e-07, "loss": 0.1457, "step": 3577 }, { "epoch": 0.8061508998225702, "grad_norm": 0.49996850602429554, "learning_rate": 4.96011254564557e-07, "loss": 0.1545, "step": 3578 }, { "epoch": 0.8063762075083787, "grad_norm": 0.48978869042351664, "learning_rate": 4.948980457348393e-07, "loss": 0.1507, "step": 3579 }, { "epoch": 0.8066015151941871, "grad_norm": 0.4945780921266697, "learning_rate": 4.937859502634992e-07, "loss": 0.1531, "step": 3580 }, { "epoch": 0.8068268228799955, "grad_norm": 0.4926202843506412, "learning_rate": 4.926749687680407e-07, "loss": 0.1445, "step": 3581 }, { "epoch": 0.8070521305658039, "grad_norm": 0.49038103728173205, "learning_rate": 4.915651018653511e-07, "loss": 0.15, "step": 3582 }, { "epoch": 0.8072774382516124, "grad_norm": 0.4921243279323083, "learning_rate": 4.904563501716986e-07, "loss": 0.1421, "step": 3583 }, { "epoch": 0.8075027459374208, "grad_norm": 0.4864573378021889, "learning_rate": 4.893487143027307e-07, "loss": 0.1556, "step": 3584 }, { "epoch": 0.8077280536232292, "grad_norm": 0.48165400100364, "learning_rate": 4.88242194873477e-07, "loss": 0.1376, "step": 3585 }, { "epoch": 0.8079533613090376, "grad_norm": 0.46793419647308165, "learning_rate": 4.871367924983458e-07, "loss": 0.1283, "step": 3586 }, { "epoch": 0.8081786689948461, "grad_norm": 0.4563054622427974, "learning_rate": 4.860325077911271e-07, "loss": 0.1307, "step": 3587 }, { "epoch": 0.8084039766806546, "grad_norm": 0.4778147728712531, "learning_rate": 4.84929341364988e-07, "loss": 0.1426, "step": 3588 }, { "epoch": 0.808629284366463, "grad_norm": 0.470678825814214, "learning_rate": 4.838272938324753e-07, "loss": 0.1408, "step": 3589 }, { "epoch": 0.8088545920522714, "grad_norm": 0.47747949172864607, "learning_rate": 4.827263658055161e-07, "loss": 0.134, "step": 3590 }, { "epoch": 0.8090798997380798, "grad_norm": 0.45299779005735497, "learning_rate": 4.816265578954135e-07, "loss": 0.1267, "step": 3591 }, { "epoch": 0.8093052074238882, "grad_norm": 0.47263317790931686, "learning_rate": 4.805278707128505e-07, "loss": 0.1456, "step": 3592 }, { "epoch": 0.8095305151096966, "grad_norm": 0.46590099895733506, "learning_rate": 4.794303048678878e-07, "loss": 0.1369, "step": 3593 }, { "epoch": 0.8097558227955051, "grad_norm": 0.4739549078475938, "learning_rate": 4.783338609699614e-07, "loss": 0.1419, "step": 3594 }, { "epoch": 0.8099811304813136, "grad_norm": 0.482674145715541, "learning_rate": 4.772385396278872e-07, "loss": 0.1426, "step": 3595 }, { "epoch": 0.810206438167122, "grad_norm": 0.4842047503815621, "learning_rate": 4.7614434144985486e-07, "loss": 0.1493, "step": 3596 }, { "epoch": 0.8104317458529304, "grad_norm": 0.4688780747440212, "learning_rate": 4.750512670434332e-07, "loss": 0.1442, "step": 3597 }, { "epoch": 0.8106570535387388, "grad_norm": 0.47451732147636083, "learning_rate": 4.73959317015564e-07, "loss": 0.1415, "step": 3598 }, { "epoch": 0.8108823612245473, "grad_norm": 0.4738635298914328, "learning_rate": 4.728684919725679e-07, "loss": 0.1395, "step": 3599 }, { "epoch": 0.8111076689103557, "grad_norm": 0.4828211724052069, "learning_rate": 4.7177879252013945e-07, "loss": 0.1433, "step": 3600 }, { "epoch": 0.8113329765961641, "grad_norm": 0.46128104439434287, "learning_rate": 4.70690219263347e-07, "loss": 0.1423, "step": 3601 }, { "epoch": 0.8115582842819725, "grad_norm": 0.4775004231158638, "learning_rate": 4.6960277280663574e-07, "loss": 0.151, "step": 3602 }, { "epoch": 0.811783591967781, "grad_norm": 0.449064787635663, "learning_rate": 4.685164537538234e-07, "loss": 0.1333, "step": 3603 }, { "epoch": 0.8120088996535895, "grad_norm": 0.5072565630500627, "learning_rate": 4.674312627081032e-07, "loss": 0.1476, "step": 3604 }, { "epoch": 0.8122342073393979, "grad_norm": 0.4948390540213146, "learning_rate": 4.6634720027204093e-07, "loss": 0.1487, "step": 3605 }, { "epoch": 0.8124595150252063, "grad_norm": 0.4747666299747306, "learning_rate": 4.6526426704757545e-07, "loss": 0.1342, "step": 3606 }, { "epoch": 0.8126848227110147, "grad_norm": 0.45672093470701447, "learning_rate": 4.641824636360195e-07, "loss": 0.1315, "step": 3607 }, { "epoch": 0.8129101303968231, "grad_norm": 0.4805529418689535, "learning_rate": 4.6310179063805916e-07, "loss": 0.1417, "step": 3608 }, { "epoch": 0.8131354380826316, "grad_norm": 0.4496617237929147, "learning_rate": 4.620222486537507e-07, "loss": 0.1314, "step": 3609 }, { "epoch": 0.81336074576844, "grad_norm": 0.4661839377205937, "learning_rate": 4.609438382825246e-07, "loss": 0.1413, "step": 3610 }, { "epoch": 0.8135860534542485, "grad_norm": 0.4769628903500223, "learning_rate": 4.598665601231805e-07, "loss": 0.1523, "step": 3611 }, { "epoch": 0.8138113611400569, "grad_norm": 0.4567423398453036, "learning_rate": 4.587904147738925e-07, "loss": 0.146, "step": 3612 }, { "epoch": 0.8140366688258653, "grad_norm": 0.49509244987764606, "learning_rate": 4.577154028322023e-07, "loss": 0.1484, "step": 3613 }, { "epoch": 0.8142619765116738, "grad_norm": 0.48460550102516503, "learning_rate": 4.566415248950251e-07, "loss": 0.1357, "step": 3614 }, { "epoch": 0.8144872841974822, "grad_norm": 0.46882127288324, "learning_rate": 4.555687815586454e-07, "loss": 0.135, "step": 3615 }, { "epoch": 0.8147125918832906, "grad_norm": 0.4609617391001502, "learning_rate": 4.5449717341871646e-07, "loss": 0.1415, "step": 3616 }, { "epoch": 0.814937899569099, "grad_norm": 0.45803310017740106, "learning_rate": 4.534267010702639e-07, "loss": 0.1434, "step": 3617 }, { "epoch": 0.8151632072549074, "grad_norm": 0.47078556309532865, "learning_rate": 4.5235736510767957e-07, "loss": 0.1486, "step": 3618 }, { "epoch": 0.815388514940716, "grad_norm": 0.4814040730333512, "learning_rate": 4.5128916612472735e-07, "loss": 0.1593, "step": 3619 }, { "epoch": 0.8156138226265244, "grad_norm": 0.4474212148189157, "learning_rate": 4.5022210471453664e-07, "loss": 0.1288, "step": 3620 }, { "epoch": 0.8158391303123328, "grad_norm": 0.45554947792372646, "learning_rate": 4.49156181469608e-07, "loss": 0.1327, "step": 3621 }, { "epoch": 0.8160644379981412, "grad_norm": 0.4692652076043499, "learning_rate": 4.480913969818099e-07, "loss": 0.1431, "step": 3622 }, { "epoch": 0.8162897456839496, "grad_norm": 0.48211225107706485, "learning_rate": 4.470277518423749e-07, "loss": 0.1391, "step": 3623 }, { "epoch": 0.816515053369758, "grad_norm": 0.45803483894327357, "learning_rate": 4.4596524664190674e-07, "loss": 0.1372, "step": 3624 }, { "epoch": 0.8167403610555665, "grad_norm": 0.4596747253969047, "learning_rate": 4.449038819703758e-07, "loss": 0.1334, "step": 3625 }, { "epoch": 0.816965668741375, "grad_norm": 0.4908321119704428, "learning_rate": 4.4384365841711684e-07, "loss": 0.1503, "step": 3626 }, { "epoch": 0.8171909764271834, "grad_norm": 0.47937901395744276, "learning_rate": 4.427845765708341e-07, "loss": 0.1486, "step": 3627 }, { "epoch": 0.8174162841129918, "grad_norm": 0.522368049867057, "learning_rate": 4.417266370195944e-07, "loss": 0.1656, "step": 3628 }, { "epoch": 0.8176415917988002, "grad_norm": 0.4715115738148847, "learning_rate": 4.406698403508333e-07, "loss": 0.1425, "step": 3629 }, { "epoch": 0.8178668994846087, "grad_norm": 0.48330696787253935, "learning_rate": 4.3961418715135097e-07, "loss": 0.1375, "step": 3630 }, { "epoch": 0.8180922071704171, "grad_norm": 0.47495407680447216, "learning_rate": 4.385596780073112e-07, "loss": 0.1405, "step": 3631 }, { "epoch": 0.8183175148562255, "grad_norm": 0.4605801773738396, "learning_rate": 4.3750631350424456e-07, "loss": 0.1351, "step": 3632 }, { "epoch": 0.8185428225420339, "grad_norm": 0.44789070717297397, "learning_rate": 4.36454094227044e-07, "loss": 0.1338, "step": 3633 }, { "epoch": 0.8187681302278424, "grad_norm": 0.45002983584397277, "learning_rate": 4.354030207599691e-07, "loss": 0.1355, "step": 3634 }, { "epoch": 0.8189934379136509, "grad_norm": 0.4823857101816615, "learning_rate": 4.3435309368664024e-07, "loss": 0.1504, "step": 3635 }, { "epoch": 0.8192187455994593, "grad_norm": 0.480880910523841, "learning_rate": 4.333043135900436e-07, "loss": 0.1396, "step": 3636 }, { "epoch": 0.8194440532852677, "grad_norm": 0.4687653665919674, "learning_rate": 4.3225668105252834e-07, "loss": 0.1376, "step": 3637 }, { "epoch": 0.8196693609710761, "grad_norm": 0.4697261703304461, "learning_rate": 4.312101966558044e-07, "loss": 0.1362, "step": 3638 }, { "epoch": 0.8198946686568845, "grad_norm": 0.49584678128138465, "learning_rate": 4.3016486098094667e-07, "loss": 0.147, "step": 3639 }, { "epoch": 0.820119976342693, "grad_norm": 0.4575663797111487, "learning_rate": 4.2912067460839066e-07, "loss": 0.1358, "step": 3640 }, { "epoch": 0.8203452840285014, "grad_norm": 0.5065216822528078, "learning_rate": 4.280776381179336e-07, "loss": 0.1545, "step": 3641 }, { "epoch": 0.8205705917143099, "grad_norm": 0.47092622989286775, "learning_rate": 4.2703575208873585e-07, "loss": 0.1347, "step": 3642 }, { "epoch": 0.8207958994001183, "grad_norm": 0.4660334975585382, "learning_rate": 4.259950170993166e-07, "loss": 0.1434, "step": 3643 }, { "epoch": 0.8210212070859267, "grad_norm": 0.500680243028102, "learning_rate": 4.2495543372755854e-07, "loss": 0.1444, "step": 3644 }, { "epoch": 0.8212465147717352, "grad_norm": 0.46330055548885296, "learning_rate": 4.239170025507025e-07, "loss": 0.1367, "step": 3645 }, { "epoch": 0.8214718224575436, "grad_norm": 0.47460031403910136, "learning_rate": 4.2287972414535084e-07, "loss": 0.1464, "step": 3646 }, { "epoch": 0.821697130143352, "grad_norm": 0.4730302345220583, "learning_rate": 4.218435990874664e-07, "loss": 0.1364, "step": 3647 }, { "epoch": 0.8219224378291604, "grad_norm": 0.47555182850863376, "learning_rate": 4.208086279523699e-07, "loss": 0.1409, "step": 3648 }, { "epoch": 0.8221477455149688, "grad_norm": 0.48364053815683355, "learning_rate": 4.197748113147429e-07, "loss": 0.1538, "step": 3649 }, { "epoch": 0.8223730532007774, "grad_norm": 0.4946534870674275, "learning_rate": 4.1874214974862436e-07, "loss": 0.152, "step": 3650 }, { "epoch": 0.8225983608865858, "grad_norm": 0.4978085337800147, "learning_rate": 4.177106438274131e-07, "loss": 0.1457, "step": 3651 }, { "epoch": 0.8228236685723942, "grad_norm": 0.46412499698557874, "learning_rate": 4.1668029412386677e-07, "loss": 0.1384, "step": 3652 }, { "epoch": 0.8230489762582026, "grad_norm": 0.48937384930554245, "learning_rate": 4.1565110121009886e-07, "loss": 0.1535, "step": 3653 }, { "epoch": 0.823274283944011, "grad_norm": 0.5000518434004941, "learning_rate": 4.146230656575831e-07, "loss": 0.1603, "step": 3654 }, { "epoch": 0.8234995916298194, "grad_norm": 0.5005304792524752, "learning_rate": 4.1359618803714805e-07, "loss": 0.1618, "step": 3655 }, { "epoch": 0.8237248993156279, "grad_norm": 0.5345403594608715, "learning_rate": 4.125704689189819e-07, "loss": 0.1702, "step": 3656 }, { "epoch": 0.8239502070014363, "grad_norm": 0.4724524810649259, "learning_rate": 4.115459088726273e-07, "loss": 0.1545, "step": 3657 }, { "epoch": 0.8241755146872448, "grad_norm": 0.487616351503573, "learning_rate": 4.105225084669839e-07, "loss": 0.1563, "step": 3658 }, { "epoch": 0.8244008223730532, "grad_norm": 0.5008318927731561, "learning_rate": 4.095002682703092e-07, "loss": 0.1487, "step": 3659 }, { "epoch": 0.8246261300588617, "grad_norm": 0.46296159400562426, "learning_rate": 4.084791888502135e-07, "loss": 0.1375, "step": 3660 }, { "epoch": 0.8248514377446701, "grad_norm": 0.46774301209203334, "learning_rate": 4.0745927077366493e-07, "loss": 0.1417, "step": 3661 }, { "epoch": 0.8250767454304785, "grad_norm": 0.4798585938930743, "learning_rate": 4.0644051460698634e-07, "loss": 0.1445, "step": 3662 }, { "epoch": 0.8253020531162869, "grad_norm": 0.4636078969381435, "learning_rate": 4.0542292091585447e-07, "loss": 0.1315, "step": 3663 }, { "epoch": 0.8255273608020953, "grad_norm": 0.47110940624017383, "learning_rate": 4.0440649026530166e-07, "loss": 0.1437, "step": 3664 }, { "epoch": 0.8257526684879037, "grad_norm": 0.4845380348187428, "learning_rate": 4.033912232197132e-07, "loss": 0.1471, "step": 3665 }, { "epoch": 0.8259779761737123, "grad_norm": 0.49698390602523673, "learning_rate": 4.0237712034283004e-07, "loss": 0.1339, "step": 3666 }, { "epoch": 0.8262032838595207, "grad_norm": 0.4709200362568471, "learning_rate": 4.0136418219774457e-07, "loss": 0.1363, "step": 3667 }, { "epoch": 0.8264285915453291, "grad_norm": 0.47535675030537516, "learning_rate": 4.003524093469041e-07, "loss": 0.1335, "step": 3668 }, { "epoch": 0.8266538992311375, "grad_norm": 0.4954697566054315, "learning_rate": 3.993418023521092e-07, "loss": 0.1519, "step": 3669 }, { "epoch": 0.8268792069169459, "grad_norm": 0.45705098958634477, "learning_rate": 3.983323617745111e-07, "loss": 0.1451, "step": 3670 }, { "epoch": 0.8271045146027544, "grad_norm": 0.4772107912339602, "learning_rate": 3.9732408817461544e-07, "loss": 0.1357, "step": 3671 }, { "epoch": 0.8273298222885628, "grad_norm": 0.4540939185349331, "learning_rate": 3.963169821122778e-07, "loss": 0.1238, "step": 3672 }, { "epoch": 0.8275551299743712, "grad_norm": 0.47655176367622293, "learning_rate": 3.953110441467073e-07, "loss": 0.1295, "step": 3673 }, { "epoch": 0.8277804376601797, "grad_norm": 0.4974421135266379, "learning_rate": 3.943062748364651e-07, "loss": 0.1524, "step": 3674 }, { "epoch": 0.8280057453459881, "grad_norm": 0.4630936824471273, "learning_rate": 3.9330267473945973e-07, "loss": 0.1373, "step": 3675 }, { "epoch": 0.8282310530317966, "grad_norm": 0.47957905381516513, "learning_rate": 3.9230024441295394e-07, "loss": 0.1481, "step": 3676 }, { "epoch": 0.828456360717605, "grad_norm": 0.47595799930886623, "learning_rate": 3.9129898441356064e-07, "loss": 0.1417, "step": 3677 }, { "epoch": 0.8286816684034134, "grad_norm": 0.4795537018708493, "learning_rate": 3.9029889529724113e-07, "loss": 0.1355, "step": 3678 }, { "epoch": 0.8289069760892218, "grad_norm": 0.4767115392349057, "learning_rate": 3.892999776193085e-07, "loss": 0.1412, "step": 3679 }, { "epoch": 0.8291322837750302, "grad_norm": 0.45880264666880477, "learning_rate": 3.8830223193442345e-07, "loss": 0.126, "step": 3680 }, { "epoch": 0.8293575914608388, "grad_norm": 0.4849367733903804, "learning_rate": 3.8730565879659845e-07, "loss": 0.1536, "step": 3681 }, { "epoch": 0.8295828991466472, "grad_norm": 0.4679528634230345, "learning_rate": 3.863102587591919e-07, "loss": 0.1395, "step": 3682 }, { "epoch": 0.8298082068324556, "grad_norm": 0.5266202362340566, "learning_rate": 3.853160323749128e-07, "loss": 0.1748, "step": 3683 }, { "epoch": 0.830033514518264, "grad_norm": 0.5021640642613251, "learning_rate": 3.84322980195819e-07, "loss": 0.1464, "step": 3684 }, { "epoch": 0.8302588222040724, "grad_norm": 0.5141221995124274, "learning_rate": 3.833311027733139e-07, "loss": 0.1631, "step": 3685 }, { "epoch": 0.8304841298898809, "grad_norm": 0.4973624549310441, "learning_rate": 3.823404006581513e-07, "loss": 0.1418, "step": 3686 }, { "epoch": 0.8307094375756893, "grad_norm": 0.47186531333299525, "learning_rate": 3.8135087440043017e-07, "loss": 0.1404, "step": 3687 }, { "epoch": 0.8309347452614977, "grad_norm": 0.4951193897109337, "learning_rate": 3.8036252454959844e-07, "loss": 0.1584, "step": 3688 }, { "epoch": 0.8311600529473062, "grad_norm": 0.48335252169394144, "learning_rate": 3.7937535165444875e-07, "loss": 0.1518, "step": 3689 }, { "epoch": 0.8313853606331146, "grad_norm": 0.43298312754915047, "learning_rate": 3.7838935626312246e-07, "loss": 0.1217, "step": 3690 }, { "epoch": 0.831610668318923, "grad_norm": 0.4772264241587924, "learning_rate": 3.7740453892310596e-07, "loss": 0.148, "step": 3691 }, { "epoch": 0.8318359760047315, "grad_norm": 0.4986281566598413, "learning_rate": 3.764209001812316e-07, "loss": 0.1563, "step": 3692 }, { "epoch": 0.8320612836905399, "grad_norm": 0.49596555246639523, "learning_rate": 3.754384405836767e-07, "loss": 0.1472, "step": 3693 }, { "epoch": 0.8322865913763483, "grad_norm": 0.4625550974162768, "learning_rate": 3.7445716067596506e-07, "loss": 0.1367, "step": 3694 }, { "epoch": 0.8325118990621567, "grad_norm": 0.4707691697369795, "learning_rate": 3.734770610029642e-07, "loss": 0.1478, "step": 3695 }, { "epoch": 0.8327372067479651, "grad_norm": 0.4863393786315704, "learning_rate": 3.72498142108888e-07, "loss": 0.1467, "step": 3696 }, { "epoch": 0.8329625144337737, "grad_norm": 0.46568031143045824, "learning_rate": 3.7152040453729223e-07, "loss": 0.1514, "step": 3697 }, { "epoch": 0.8331878221195821, "grad_norm": 0.4451139600576869, "learning_rate": 3.705438488310792e-07, "loss": 0.1331, "step": 3698 }, { "epoch": 0.8334131298053905, "grad_norm": 0.4386885790715076, "learning_rate": 3.695684755324938e-07, "loss": 0.1318, "step": 3699 }, { "epoch": 0.8336384374911989, "grad_norm": 0.4936072561997163, "learning_rate": 3.6859428518312394e-07, "loss": 0.1526, "step": 3700 }, { "epoch": 0.8338637451770073, "grad_norm": 0.4777576391614967, "learning_rate": 3.6762127832390194e-07, "loss": 0.1527, "step": 3701 }, { "epoch": 0.8340890528628158, "grad_norm": 0.47375366285599557, "learning_rate": 3.666494554951014e-07, "loss": 0.1347, "step": 3702 }, { "epoch": 0.8343143605486242, "grad_norm": 0.4708439662424002, "learning_rate": 3.656788172363401e-07, "loss": 0.1276, "step": 3703 }, { "epoch": 0.8345396682344326, "grad_norm": 0.4710695568080402, "learning_rate": 3.6470936408657647e-07, "loss": 0.144, "step": 3704 }, { "epoch": 0.8347649759202411, "grad_norm": 0.4974200303377148, "learning_rate": 3.6374109658411207e-07, "loss": 0.1502, "step": 3705 }, { "epoch": 0.8349902836060495, "grad_norm": 0.4761356662500996, "learning_rate": 3.6277401526659067e-07, "loss": 0.1387, "step": 3706 }, { "epoch": 0.835215591291858, "grad_norm": 0.4830728913581897, "learning_rate": 3.6180812067099477e-07, "loss": 0.1454, "step": 3707 }, { "epoch": 0.8354408989776664, "grad_norm": 0.46440622198669923, "learning_rate": 3.6084341333365135e-07, "loss": 0.1389, "step": 3708 }, { "epoch": 0.8356662066634748, "grad_norm": 0.4782555940658288, "learning_rate": 3.5987989379022536e-07, "loss": 0.1428, "step": 3709 }, { "epoch": 0.8358915143492832, "grad_norm": 0.44845896618127046, "learning_rate": 3.58917562575723e-07, "loss": 0.1274, "step": 3710 }, { "epoch": 0.8361168220350916, "grad_norm": 0.4747354070350136, "learning_rate": 3.57956420224492e-07, "loss": 0.1414, "step": 3711 }, { "epoch": 0.8363421297209, "grad_norm": 0.4788958507356607, "learning_rate": 3.569964672702178e-07, "loss": 0.1409, "step": 3712 }, { "epoch": 0.8365674374067086, "grad_norm": 0.4583045970863074, "learning_rate": 3.5603770424592785e-07, "loss": 0.1311, "step": 3713 }, { "epoch": 0.836792745092517, "grad_norm": 0.4714639173787331, "learning_rate": 3.550801316839858e-07, "loss": 0.1331, "step": 3714 }, { "epoch": 0.8370180527783254, "grad_norm": 0.5013819286501238, "learning_rate": 3.5412375011609714e-07, "loss": 0.1605, "step": 3715 }, { "epoch": 0.8372433604641338, "grad_norm": 0.46978437287956937, "learning_rate": 3.531685600733051e-07, "loss": 0.1359, "step": 3716 }, { "epoch": 0.8374686681499423, "grad_norm": 0.4805452566101585, "learning_rate": 3.5221456208598987e-07, "loss": 0.1385, "step": 3717 }, { "epoch": 0.8376939758357507, "grad_norm": 0.47136760789390303, "learning_rate": 3.5126175668387275e-07, "loss": 0.1421, "step": 3718 }, { "epoch": 0.8379192835215591, "grad_norm": 0.4763275734278578, "learning_rate": 3.503101443960094e-07, "loss": 0.1445, "step": 3719 }, { "epoch": 0.8381445912073675, "grad_norm": 0.4838455228665239, "learning_rate": 3.4935972575079524e-07, "loss": 0.1422, "step": 3720 }, { "epoch": 0.838369898893176, "grad_norm": 0.4592355710517882, "learning_rate": 3.484105012759631e-07, "loss": 0.1359, "step": 3721 }, { "epoch": 0.8385952065789845, "grad_norm": 0.5076200514758693, "learning_rate": 3.474624714985805e-07, "loss": 0.1509, "step": 3722 }, { "epoch": 0.8388205142647929, "grad_norm": 0.5013287029320084, "learning_rate": 3.465156369450545e-07, "loss": 0.1507, "step": 3723 }, { "epoch": 0.8390458219506013, "grad_norm": 0.46463610064480193, "learning_rate": 3.455699981411259e-07, "loss": 0.1386, "step": 3724 }, { "epoch": 0.8392711296364097, "grad_norm": 0.469792875927176, "learning_rate": 3.446255556118736e-07, "loss": 0.1393, "step": 3725 }, { "epoch": 0.8394964373222181, "grad_norm": 0.4668736724148676, "learning_rate": 3.436823098817102e-07, "loss": 0.1401, "step": 3726 }, { "epoch": 0.8397217450080265, "grad_norm": 0.45187758940267403, "learning_rate": 3.427402614743863e-07, "loss": 0.1278, "step": 3727 }, { "epoch": 0.8399470526938351, "grad_norm": 0.4841475700937596, "learning_rate": 3.417994109129852e-07, "loss": 0.1422, "step": 3728 }, { "epoch": 0.8401723603796435, "grad_norm": 0.4625871616784283, "learning_rate": 3.408597587199261e-07, "loss": 0.1338, "step": 3729 }, { "epoch": 0.8403976680654519, "grad_norm": 0.47550866799296265, "learning_rate": 3.3992130541696336e-07, "loss": 0.1369, "step": 3730 }, { "epoch": 0.8406229757512603, "grad_norm": 0.4490907761475444, "learning_rate": 3.389840515251855e-07, "loss": 0.1339, "step": 3731 }, { "epoch": 0.8408482834370687, "grad_norm": 0.5026847491625468, "learning_rate": 3.3804799756501335e-07, "loss": 0.1373, "step": 3732 }, { "epoch": 0.8410735911228772, "grad_norm": 0.4796871566183159, "learning_rate": 3.371131440562042e-07, "loss": 0.1359, "step": 3733 }, { "epoch": 0.8412988988086856, "grad_norm": 0.489908655988409, "learning_rate": 3.3617949151784623e-07, "loss": 0.1446, "step": 3734 }, { "epoch": 0.841524206494494, "grad_norm": 0.4611673387819876, "learning_rate": 3.3524704046836305e-07, "loss": 0.146, "step": 3735 }, { "epoch": 0.8417495141803025, "grad_norm": 0.47569199775126225, "learning_rate": 3.343157914255085e-07, "loss": 0.1313, "step": 3736 }, { "epoch": 0.841974821866111, "grad_norm": 0.4447117260347802, "learning_rate": 3.3338574490637154e-07, "loss": 0.1307, "step": 3737 }, { "epoch": 0.8422001295519194, "grad_norm": 0.4541365818430463, "learning_rate": 3.3245690142737236e-07, "loss": 0.1264, "step": 3738 }, { "epoch": 0.8424254372377278, "grad_norm": 0.4896570773348926, "learning_rate": 3.3152926150426256e-07, "loss": 0.1383, "step": 3739 }, { "epoch": 0.8426507449235362, "grad_norm": 0.47351248738261037, "learning_rate": 3.306028256521265e-07, "loss": 0.1378, "step": 3740 }, { "epoch": 0.8428760526093446, "grad_norm": 0.498959457382647, "learning_rate": 3.296775943853789e-07, "loss": 0.1525, "step": 3741 }, { "epoch": 0.843101360295153, "grad_norm": 0.4913202107412484, "learning_rate": 3.287535682177667e-07, "loss": 0.15, "step": 3742 }, { "epoch": 0.8433266679809615, "grad_norm": 0.4894838756998072, "learning_rate": 3.278307476623674e-07, "loss": 0.1388, "step": 3743 }, { "epoch": 0.84355197566677, "grad_norm": 0.4986603478893013, "learning_rate": 3.2690913323158795e-07, "loss": 0.148, "step": 3744 }, { "epoch": 0.8437772833525784, "grad_norm": 0.5147315679695823, "learning_rate": 3.259887254371677e-07, "loss": 0.1518, "step": 3745 }, { "epoch": 0.8440025910383868, "grad_norm": 0.5093748124342478, "learning_rate": 3.2506952479017417e-07, "loss": 0.1495, "step": 3746 }, { "epoch": 0.8442278987241952, "grad_norm": 0.47134348911112495, "learning_rate": 3.241515318010044e-07, "loss": 0.1479, "step": 3747 }, { "epoch": 0.8444532064100037, "grad_norm": 0.48836861657128516, "learning_rate": 3.2323474697938727e-07, "loss": 0.1525, "step": 3748 }, { "epoch": 0.8446785140958121, "grad_norm": 0.5219802457338291, "learning_rate": 3.223191708343776e-07, "loss": 0.1558, "step": 3749 }, { "epoch": 0.8449038217816205, "grad_norm": 0.4751955627178434, "learning_rate": 3.214048038743622e-07, "loss": 0.1405, "step": 3750 }, { "epoch": 0.8451291294674289, "grad_norm": 0.4602528146172868, "learning_rate": 3.204916466070537e-07, "loss": 0.1369, "step": 3751 }, { "epoch": 0.8453544371532374, "grad_norm": 0.4867880285193427, "learning_rate": 3.1957969953949506e-07, "loss": 0.1454, "step": 3752 }, { "epoch": 0.8455797448390459, "grad_norm": 0.47421301891609136, "learning_rate": 3.186689631780565e-07, "loss": 0.139, "step": 3753 }, { "epoch": 0.8458050525248543, "grad_norm": 0.5230980631330083, "learning_rate": 3.1775943802843546e-07, "loss": 0.1655, "step": 3754 }, { "epoch": 0.8460303602106627, "grad_norm": 0.4792049371459631, "learning_rate": 3.168511245956582e-07, "loss": 0.1456, "step": 3755 }, { "epoch": 0.8462556678964711, "grad_norm": 0.4911553003467961, "learning_rate": 3.1594402338407633e-07, "loss": 0.1471, "step": 3756 }, { "epoch": 0.8464809755822795, "grad_norm": 0.46160048181105984, "learning_rate": 3.1503813489736995e-07, "loss": 0.1351, "step": 3757 }, { "epoch": 0.846706283268088, "grad_norm": 0.4542443875832939, "learning_rate": 3.141334596385448e-07, "loss": 0.1399, "step": 3758 }, { "epoch": 0.8469315909538964, "grad_norm": 0.5083072602650772, "learning_rate": 3.132299981099335e-07, "loss": 0.1545, "step": 3759 }, { "epoch": 0.8471568986397049, "grad_norm": 0.4967217063074372, "learning_rate": 3.12327750813195e-07, "loss": 0.1526, "step": 3760 }, { "epoch": 0.8473822063255133, "grad_norm": 0.4600415740247211, "learning_rate": 3.1142671824931275e-07, "loss": 0.1382, "step": 3761 }, { "epoch": 0.8476075140113217, "grad_norm": 0.4800819510127324, "learning_rate": 3.105269009185974e-07, "loss": 0.148, "step": 3762 }, { "epoch": 0.8478328216971301, "grad_norm": 0.4779608519409477, "learning_rate": 3.096282993206837e-07, "loss": 0.1432, "step": 3763 }, { "epoch": 0.8480581293829386, "grad_norm": 0.4734713665797522, "learning_rate": 3.087309139545311e-07, "loss": 0.1423, "step": 3764 }, { "epoch": 0.848283437068747, "grad_norm": 0.4706362568362203, "learning_rate": 3.0783474531842497e-07, "loss": 0.1381, "step": 3765 }, { "epoch": 0.8485087447545554, "grad_norm": 0.4533526963952121, "learning_rate": 3.0693979390997333e-07, "loss": 0.1279, "step": 3766 }, { "epoch": 0.8487340524403638, "grad_norm": 0.48340443376537645, "learning_rate": 3.0604606022611033e-07, "loss": 0.1491, "step": 3767 }, { "epoch": 0.8489593601261723, "grad_norm": 0.4694014430810065, "learning_rate": 3.0515354476309293e-07, "loss": 0.1415, "step": 3768 }, { "epoch": 0.8491846678119808, "grad_norm": 0.45863277428008126, "learning_rate": 3.042622480165011e-07, "loss": 0.1303, "step": 3769 }, { "epoch": 0.8494099754977892, "grad_norm": 0.5038764071443956, "learning_rate": 3.033721704812395e-07, "loss": 0.1557, "step": 3770 }, { "epoch": 0.8496352831835976, "grad_norm": 0.4674809016759538, "learning_rate": 3.024833126515339e-07, "loss": 0.1398, "step": 3771 }, { "epoch": 0.849860590869406, "grad_norm": 0.5036409352245586, "learning_rate": 3.0159567502093535e-07, "loss": 0.141, "step": 3772 }, { "epoch": 0.8500858985552144, "grad_norm": 0.46259561465234855, "learning_rate": 3.0070925808231456e-07, "loss": 0.1329, "step": 3773 }, { "epoch": 0.8503112062410229, "grad_norm": 0.4410361643206542, "learning_rate": 2.9982406232786614e-07, "loss": 0.1299, "step": 3774 }, { "epoch": 0.8505365139268314, "grad_norm": 0.4491162934123659, "learning_rate": 2.9894008824910726e-07, "loss": 0.1248, "step": 3775 }, { "epoch": 0.8507618216126398, "grad_norm": 0.4925158953606324, "learning_rate": 2.9805733633687467e-07, "loss": 0.1452, "step": 3776 }, { "epoch": 0.8509871292984482, "grad_norm": 0.4924716105067092, "learning_rate": 2.9717580708132856e-07, "loss": 0.1406, "step": 3777 }, { "epoch": 0.8512124369842566, "grad_norm": 0.46442448447501566, "learning_rate": 2.9629550097194787e-07, "loss": 0.1387, "step": 3778 }, { "epoch": 0.851437744670065, "grad_norm": 0.46692915113918126, "learning_rate": 2.9541641849753557e-07, "loss": 0.14, "step": 3779 }, { "epoch": 0.8516630523558735, "grad_norm": 0.47585439999813806, "learning_rate": 2.9453856014621224e-07, "loss": 0.1346, "step": 3780 }, { "epoch": 0.8518883600416819, "grad_norm": 0.45401892300040164, "learning_rate": 2.936619264054194e-07, "loss": 0.1332, "step": 3781 }, { "epoch": 0.8521136677274903, "grad_norm": 0.46019664304026237, "learning_rate": 2.9278651776192073e-07, "loss": 0.1377, "step": 3782 }, { "epoch": 0.8523389754132988, "grad_norm": 0.48199717160160416, "learning_rate": 2.919123347017963e-07, "loss": 0.1443, "step": 3783 }, { "epoch": 0.8525642830991073, "grad_norm": 0.4804863037487851, "learning_rate": 2.910393777104481e-07, "loss": 0.1354, "step": 3784 }, { "epoch": 0.8527895907849157, "grad_norm": 0.4615664259895404, "learning_rate": 2.901676472725973e-07, "loss": 0.1365, "step": 3785 }, { "epoch": 0.8530148984707241, "grad_norm": 0.47659295388773687, "learning_rate": 2.892971438722822e-07, "loss": 0.1412, "step": 3786 }, { "epoch": 0.8532402061565325, "grad_norm": 0.4756320606626126, "learning_rate": 2.8842786799286204e-07, "loss": 0.1454, "step": 3787 }, { "epoch": 0.8534655138423409, "grad_norm": 0.4668571095658293, "learning_rate": 2.8755982011701183e-07, "loss": 0.1409, "step": 3788 }, { "epoch": 0.8536908215281493, "grad_norm": 0.4596291324992224, "learning_rate": 2.866930007267274e-07, "loss": 0.1342, "step": 3789 }, { "epoch": 0.8539161292139578, "grad_norm": 0.4722187049851435, "learning_rate": 2.8582741030332095e-07, "loss": 0.1406, "step": 3790 }, { "epoch": 0.8541414368997663, "grad_norm": 0.45961773897137925, "learning_rate": 2.8496304932742247e-07, "loss": 0.1355, "step": 3791 }, { "epoch": 0.8543667445855747, "grad_norm": 0.48484033285073747, "learning_rate": 2.840999182789797e-07, "loss": 0.1407, "step": 3792 }, { "epoch": 0.8545920522713831, "grad_norm": 0.46030708547491617, "learning_rate": 2.8323801763725623e-07, "loss": 0.1358, "step": 3793 }, { "epoch": 0.8548173599571915, "grad_norm": 0.5098699348994705, "learning_rate": 2.823773478808348e-07, "loss": 0.1577, "step": 3794 }, { "epoch": 0.855042667643, "grad_norm": 0.4532517106481429, "learning_rate": 2.8151790948761165e-07, "loss": 0.1349, "step": 3795 }, { "epoch": 0.8552679753288084, "grad_norm": 0.47392199223502923, "learning_rate": 2.806597029348018e-07, "loss": 0.1495, "step": 3796 }, { "epoch": 0.8554932830146168, "grad_norm": 0.4720938077032227, "learning_rate": 2.7980272869893633e-07, "loss": 0.1395, "step": 3797 }, { "epoch": 0.8557185907004252, "grad_norm": 0.48573209575016485, "learning_rate": 2.7894698725585866e-07, "loss": 0.1419, "step": 3798 }, { "epoch": 0.8559438983862337, "grad_norm": 0.49549561049525453, "learning_rate": 2.7809247908073184e-07, "loss": 0.1599, "step": 3799 }, { "epoch": 0.8561692060720422, "grad_norm": 0.45867812854518997, "learning_rate": 2.772392046480324e-07, "loss": 0.1354, "step": 3800 }, { "epoch": 0.8563945137578506, "grad_norm": 0.48750250388740535, "learning_rate": 2.763871644315508e-07, "loss": 0.1453, "step": 3801 }, { "epoch": 0.856619821443659, "grad_norm": 0.46278874013656607, "learning_rate": 2.755363589043944e-07, "loss": 0.1421, "step": 3802 }, { "epoch": 0.8568451291294674, "grad_norm": 0.4861918783813127, "learning_rate": 2.746867885389828e-07, "loss": 0.1466, "step": 3803 }, { "epoch": 0.8570704368152758, "grad_norm": 0.4625862382559484, "learning_rate": 2.738384538070518e-07, "loss": 0.1323, "step": 3804 }, { "epoch": 0.8572957445010843, "grad_norm": 0.4932943365028231, "learning_rate": 2.7299135517964897e-07, "loss": 0.1514, "step": 3805 }, { "epoch": 0.8575210521868927, "grad_norm": 0.45459505703034975, "learning_rate": 2.7214549312713723e-07, "loss": 0.1329, "step": 3806 }, { "epoch": 0.8577463598727012, "grad_norm": 0.4657582875813703, "learning_rate": 2.713008681191923e-07, "loss": 0.1454, "step": 3807 }, { "epoch": 0.8579716675585096, "grad_norm": 0.44863377479765015, "learning_rate": 2.7045748062480254e-07, "loss": 0.1315, "step": 3808 }, { "epoch": 0.858196975244318, "grad_norm": 0.4600757566899341, "learning_rate": 2.696153311122704e-07, "loss": 0.1355, "step": 3809 }, { "epoch": 0.8584222829301265, "grad_norm": 0.4783882851837259, "learning_rate": 2.6877442004920873e-07, "loss": 0.1429, "step": 3810 }, { "epoch": 0.8586475906159349, "grad_norm": 0.47769058593487773, "learning_rate": 2.6793474790254516e-07, "loss": 0.1457, "step": 3811 }, { "epoch": 0.8588728983017433, "grad_norm": 0.47847573245900543, "learning_rate": 2.6709631513851834e-07, "loss": 0.1422, "step": 3812 }, { "epoch": 0.8590982059875517, "grad_norm": 0.4729488813409411, "learning_rate": 2.6625912222267844e-07, "loss": 0.1464, "step": 3813 }, { "epoch": 0.8593235136733601, "grad_norm": 0.4742682819045076, "learning_rate": 2.654231696198878e-07, "loss": 0.1394, "step": 3814 }, { "epoch": 0.8595488213591687, "grad_norm": 0.4496738891729263, "learning_rate": 2.645884577943192e-07, "loss": 0.1308, "step": 3815 }, { "epoch": 0.8597741290449771, "grad_norm": 0.4857654847446031, "learning_rate": 2.6375498720945717e-07, "loss": 0.1457, "step": 3816 }, { "epoch": 0.8599994367307855, "grad_norm": 0.4835247667742374, "learning_rate": 2.629227583280972e-07, "loss": 0.1408, "step": 3817 }, { "epoch": 0.8602247444165939, "grad_norm": 0.512090740233511, "learning_rate": 2.620917716123444e-07, "loss": 0.1638, "step": 3818 }, { "epoch": 0.8604500521024023, "grad_norm": 0.5050589725161969, "learning_rate": 2.6126202752361554e-07, "loss": 0.1475, "step": 3819 }, { "epoch": 0.8606753597882107, "grad_norm": 0.4797067731221806, "learning_rate": 2.604335265226354e-07, "loss": 0.138, "step": 3820 }, { "epoch": 0.8609006674740192, "grad_norm": 0.47641645566421503, "learning_rate": 2.5960626906944066e-07, "loss": 0.1429, "step": 3821 }, { "epoch": 0.8611259751598277, "grad_norm": 0.4949453699572789, "learning_rate": 2.587802556233765e-07, "loss": 0.1489, "step": 3822 }, { "epoch": 0.8613512828456361, "grad_norm": 0.46752972020335043, "learning_rate": 2.5795548664309695e-07, "loss": 0.1362, "step": 3823 }, { "epoch": 0.8615765905314445, "grad_norm": 0.4617599130698046, "learning_rate": 2.571319625865662e-07, "loss": 0.1379, "step": 3824 }, { "epoch": 0.861801898217253, "grad_norm": 0.46615766557140464, "learning_rate": 2.5630968391105515e-07, "loss": 0.1458, "step": 3825 }, { "epoch": 0.8620272059030614, "grad_norm": 0.4713775308849494, "learning_rate": 2.5548865107314606e-07, "loss": 0.1344, "step": 3826 }, { "epoch": 0.8622525135888698, "grad_norm": 0.4718354741536646, "learning_rate": 2.546688645287268e-07, "loss": 0.1346, "step": 3827 }, { "epoch": 0.8624778212746782, "grad_norm": 0.4762759961692442, "learning_rate": 2.5385032473299433e-07, "loss": 0.1403, "step": 3828 }, { "epoch": 0.8627031289604866, "grad_norm": 0.45935519635317734, "learning_rate": 2.5303303214045423e-07, "loss": 0.1381, "step": 3829 }, { "epoch": 0.8629284366462951, "grad_norm": 0.4890217885035087, "learning_rate": 2.522169872049174e-07, "loss": 0.1565, "step": 3830 }, { "epoch": 0.8631537443321036, "grad_norm": 0.4647472157245905, "learning_rate": 2.5140219037950416e-07, "loss": 0.1424, "step": 3831 }, { "epoch": 0.863379052017912, "grad_norm": 0.5084070140741922, "learning_rate": 2.5058864211664064e-07, "loss": 0.1532, "step": 3832 }, { "epoch": 0.8636043597037204, "grad_norm": 0.5116273779430994, "learning_rate": 2.4977634286805887e-07, "loss": 0.1622, "step": 3833 }, { "epoch": 0.8638296673895288, "grad_norm": 0.47251055408026604, "learning_rate": 2.4896529308479966e-07, "loss": 0.1394, "step": 3834 }, { "epoch": 0.8640549750753372, "grad_norm": 0.45085871056941923, "learning_rate": 2.4815549321720755e-07, "loss": 0.1309, "step": 3835 }, { "epoch": 0.8642802827611457, "grad_norm": 0.45540371760054993, "learning_rate": 2.4734694371493507e-07, "loss": 0.1301, "step": 3836 }, { "epoch": 0.8645055904469541, "grad_norm": 0.4601648252123752, "learning_rate": 2.4653964502693974e-07, "loss": 0.137, "step": 3837 }, { "epoch": 0.8647308981327626, "grad_norm": 0.48395343387323725, "learning_rate": 2.4573359760148354e-07, "loss": 0.1418, "step": 3838 }, { "epoch": 0.864956205818571, "grad_norm": 0.466410678978402, "learning_rate": 2.449288018861354e-07, "loss": 0.1443, "step": 3839 }, { "epoch": 0.8651815135043794, "grad_norm": 0.4594105233412585, "learning_rate": 2.441252583277678e-07, "loss": 0.1329, "step": 3840 }, { "epoch": 0.8654068211901879, "grad_norm": 0.4645514686858519, "learning_rate": 2.433229673725593e-07, "loss": 0.1309, "step": 3841 }, { "epoch": 0.8656321288759963, "grad_norm": 0.4862741309606466, "learning_rate": 2.425219294659908e-07, "loss": 0.1497, "step": 3842 }, { "epoch": 0.8658574365618047, "grad_norm": 0.5030380126363495, "learning_rate": 2.4172214505285006e-07, "loss": 0.1501, "step": 3843 }, { "epoch": 0.8660827442476131, "grad_norm": 0.47748877151047414, "learning_rate": 2.409236145772276e-07, "loss": 0.137, "step": 3844 }, { "epoch": 0.8663080519334215, "grad_norm": 0.46467065040499544, "learning_rate": 2.401263384825164e-07, "loss": 0.1382, "step": 3845 }, { "epoch": 0.8665333596192301, "grad_norm": 0.464481683671317, "learning_rate": 2.393303172114159e-07, "loss": 0.1337, "step": 3846 }, { "epoch": 0.8667586673050385, "grad_norm": 0.46035212211555143, "learning_rate": 2.3853555120592506e-07, "loss": 0.13, "step": 3847 }, { "epoch": 0.8669839749908469, "grad_norm": 0.4637142538065829, "learning_rate": 2.377420409073497e-07, "loss": 0.1331, "step": 3848 }, { "epoch": 0.8672092826766553, "grad_norm": 0.4632361803177915, "learning_rate": 2.3694978675629476e-07, "loss": 0.1452, "step": 3849 }, { "epoch": 0.8674345903624637, "grad_norm": 0.4986572200737204, "learning_rate": 2.3615878919267116e-07, "loss": 0.1577, "step": 3850 }, { "epoch": 0.8676598980482721, "grad_norm": 0.4914501429064226, "learning_rate": 2.3536904865568949e-07, "loss": 0.1461, "step": 3851 }, { "epoch": 0.8678852057340806, "grad_norm": 0.46554246389297854, "learning_rate": 2.345805655838626e-07, "loss": 0.1444, "step": 3852 }, { "epoch": 0.868110513419889, "grad_norm": 0.49645474855635946, "learning_rate": 2.337933404150064e-07, "loss": 0.1579, "step": 3853 }, { "epoch": 0.8683358211056975, "grad_norm": 0.47708434402385985, "learning_rate": 2.3300737358623843e-07, "loss": 0.1497, "step": 3854 }, { "epoch": 0.8685611287915059, "grad_norm": 0.46968566557594305, "learning_rate": 2.3222266553397542e-07, "loss": 0.1492, "step": 3855 }, { "epoch": 0.8687864364773143, "grad_norm": 0.4878675904608552, "learning_rate": 2.314392166939375e-07, "loss": 0.1471, "step": 3856 }, { "epoch": 0.8690117441631228, "grad_norm": 0.45020022653810526, "learning_rate": 2.3065702750114383e-07, "loss": 0.1292, "step": 3857 }, { "epoch": 0.8692370518489312, "grad_norm": 0.4843459055982729, "learning_rate": 2.2987609838991536e-07, "loss": 0.1387, "step": 3858 }, { "epoch": 0.8694623595347396, "grad_norm": 0.47005449497641894, "learning_rate": 2.2909642979387331e-07, "loss": 0.1445, "step": 3859 }, { "epoch": 0.869687667220548, "grad_norm": 0.49344268598068725, "learning_rate": 2.2831802214593774e-07, "loss": 0.1513, "step": 3860 }, { "epoch": 0.8699129749063564, "grad_norm": 0.48046850809610137, "learning_rate": 2.2754087587833014e-07, "loss": 0.136, "step": 3861 }, { "epoch": 0.870138282592165, "grad_norm": 0.5006165000856634, "learning_rate": 2.2676499142257002e-07, "loss": 0.1596, "step": 3862 }, { "epoch": 0.8703635902779734, "grad_norm": 0.47972327496783845, "learning_rate": 2.2599036920947836e-07, "loss": 0.1416, "step": 3863 }, { "epoch": 0.8705888979637818, "grad_norm": 0.47816719003504604, "learning_rate": 2.2521700966917276e-07, "loss": 0.1361, "step": 3864 }, { "epoch": 0.8708142056495902, "grad_norm": 0.4664317470965975, "learning_rate": 2.2444491323107138e-07, "loss": 0.1368, "step": 3865 }, { "epoch": 0.8710395133353986, "grad_norm": 0.46907919230886, "learning_rate": 2.23674080323891e-07, "loss": 0.143, "step": 3866 }, { "epoch": 0.8712648210212071, "grad_norm": 0.48697418501853557, "learning_rate": 2.229045113756456e-07, "loss": 0.1542, "step": 3867 }, { "epoch": 0.8714901287070155, "grad_norm": 0.5031905886611365, "learning_rate": 2.221362068136493e-07, "loss": 0.1484, "step": 3868 }, { "epoch": 0.8717154363928239, "grad_norm": 0.5014679159314208, "learning_rate": 2.2136916706451212e-07, "loss": 0.1432, "step": 3869 }, { "epoch": 0.8719407440786324, "grad_norm": 0.48675398397776903, "learning_rate": 2.2060339255414232e-07, "loss": 0.1448, "step": 3870 }, { "epoch": 0.8721660517644408, "grad_norm": 0.4719989893940155, "learning_rate": 2.198388837077467e-07, "loss": 0.1399, "step": 3871 }, { "epoch": 0.8723913594502493, "grad_norm": 0.4698672166182498, "learning_rate": 2.190756409498282e-07, "loss": 0.1432, "step": 3872 }, { "epoch": 0.8726166671360577, "grad_norm": 0.5003866524002453, "learning_rate": 2.1831366470418725e-07, "loss": 0.1597, "step": 3873 }, { "epoch": 0.8728419748218661, "grad_norm": 0.4762260917827092, "learning_rate": 2.175529553939204e-07, "loss": 0.1407, "step": 3874 }, { "epoch": 0.8730672825076745, "grad_norm": 0.46479989802739047, "learning_rate": 2.1679351344142146e-07, "loss": 0.1342, "step": 3875 }, { "epoch": 0.8732925901934829, "grad_norm": 0.4739375337047061, "learning_rate": 2.1603533926838088e-07, "loss": 0.1481, "step": 3876 }, { "epoch": 0.8735178978792915, "grad_norm": 0.46262128348097714, "learning_rate": 2.1527843329578328e-07, "loss": 0.1402, "step": 3877 }, { "epoch": 0.8737432055650999, "grad_norm": 0.5027757553131618, "learning_rate": 2.1452279594391167e-07, "loss": 0.1517, "step": 3878 }, { "epoch": 0.8739685132509083, "grad_norm": 0.4552996612527901, "learning_rate": 2.1376842763234178e-07, "loss": 0.1328, "step": 3879 }, { "epoch": 0.8741938209367167, "grad_norm": 0.4554471045600218, "learning_rate": 2.1301532877994747e-07, "loss": 0.1294, "step": 3880 }, { "epoch": 0.8744191286225251, "grad_norm": 0.49367764827210675, "learning_rate": 2.1226349980489614e-07, "loss": 0.1566, "step": 3881 }, { "epoch": 0.8746444363083336, "grad_norm": 0.4661026591919418, "learning_rate": 2.1151294112464997e-07, "loss": 0.1416, "step": 3882 }, { "epoch": 0.874869743994142, "grad_norm": 0.4920345042810052, "learning_rate": 2.1076365315596704e-07, "loss": 0.153, "step": 3883 }, { "epoch": 0.8750950516799504, "grad_norm": 0.4789670218119234, "learning_rate": 2.1001563631489807e-07, "loss": 0.1434, "step": 3884 }, { "epoch": 0.8753203593657589, "grad_norm": 0.4716950050495697, "learning_rate": 2.0926889101679004e-07, "loss": 0.1387, "step": 3885 }, { "epoch": 0.8755456670515673, "grad_norm": 0.47883106851183405, "learning_rate": 2.0852341767628182e-07, "loss": 0.1496, "step": 3886 }, { "epoch": 0.8757709747373758, "grad_norm": 0.4959167987665322, "learning_rate": 2.07779216707307e-07, "loss": 0.151, "step": 3887 }, { "epoch": 0.8759962824231842, "grad_norm": 0.47334563856241746, "learning_rate": 2.0703628852309336e-07, "loss": 0.1407, "step": 3888 }, { "epoch": 0.8762215901089926, "grad_norm": 0.4627588097838559, "learning_rate": 2.0629463353616013e-07, "loss": 0.1437, "step": 3889 }, { "epoch": 0.876446897794801, "grad_norm": 0.48615089687091206, "learning_rate": 2.0555425215832176e-07, "loss": 0.1352, "step": 3890 }, { "epoch": 0.8766722054806094, "grad_norm": 0.4911026246700992, "learning_rate": 2.048151448006841e-07, "loss": 0.1551, "step": 3891 }, { "epoch": 0.8768975131664178, "grad_norm": 0.46591980702852437, "learning_rate": 2.0407731187364556e-07, "loss": 0.1416, "step": 3892 }, { "epoch": 0.8771228208522264, "grad_norm": 0.49095474891414137, "learning_rate": 2.0334075378689781e-07, "loss": 0.1534, "step": 3893 }, { "epoch": 0.8773481285380348, "grad_norm": 0.4667276709190779, "learning_rate": 2.026054709494235e-07, "loss": 0.1446, "step": 3894 }, { "epoch": 0.8775734362238432, "grad_norm": 0.47067318796567426, "learning_rate": 2.0187146376949852e-07, "loss": 0.1362, "step": 3895 }, { "epoch": 0.8777987439096516, "grad_norm": 0.5010169634897643, "learning_rate": 2.0113873265468875e-07, "loss": 0.1497, "step": 3896 }, { "epoch": 0.87802405159546, "grad_norm": 0.47896885342438095, "learning_rate": 2.0040727801185323e-07, "loss": 0.1404, "step": 3897 }, { "epoch": 0.8782493592812685, "grad_norm": 0.4435881047514352, "learning_rate": 1.996771002471415e-07, "loss": 0.13, "step": 3898 }, { "epoch": 0.8784746669670769, "grad_norm": 0.46596846807949904, "learning_rate": 1.9894819976599338e-07, "loss": 0.1377, "step": 3899 }, { "epoch": 0.8786999746528853, "grad_norm": 0.4644431447384091, "learning_rate": 1.9822057697314102e-07, "loss": 0.1326, "step": 3900 }, { "epoch": 0.8789252823386938, "grad_norm": 0.4672389855441275, "learning_rate": 1.9749423227260533e-07, "loss": 0.1299, "step": 3901 }, { "epoch": 0.8791505900245022, "grad_norm": 0.4686815604792627, "learning_rate": 1.9676916606769874e-07, "loss": 0.1469, "step": 3902 }, { "epoch": 0.8793758977103107, "grad_norm": 0.4659336099421951, "learning_rate": 1.9604537876102448e-07, "loss": 0.1398, "step": 3903 }, { "epoch": 0.8796012053961191, "grad_norm": 0.443210123551164, "learning_rate": 1.9532287075447325e-07, "loss": 0.1341, "step": 3904 }, { "epoch": 0.8798265130819275, "grad_norm": 0.47878160196488945, "learning_rate": 1.9460164244922698e-07, "loss": 0.1429, "step": 3905 }, { "epoch": 0.8800518207677359, "grad_norm": 0.48539640117019267, "learning_rate": 1.9388169424575802e-07, "loss": 0.14, "step": 3906 }, { "epoch": 0.8802771284535443, "grad_norm": 0.46434475314245555, "learning_rate": 1.9316302654382528e-07, "loss": 0.1337, "step": 3907 }, { "epoch": 0.8805024361393528, "grad_norm": 0.4851787509947083, "learning_rate": 1.9244563974247953e-07, "loss": 0.1489, "step": 3908 }, { "epoch": 0.8807277438251613, "grad_norm": 0.4911309040191241, "learning_rate": 1.917295342400577e-07, "loss": 0.1497, "step": 3909 }, { "epoch": 0.8809530515109697, "grad_norm": 0.45607420361153655, "learning_rate": 1.910147104341875e-07, "loss": 0.1301, "step": 3910 }, { "epoch": 0.8811783591967781, "grad_norm": 0.47282797295865775, "learning_rate": 1.9030116872178317e-07, "loss": 0.1537, "step": 3911 }, { "epoch": 0.8814036668825865, "grad_norm": 0.5049449835303502, "learning_rate": 1.8958890949904802e-07, "loss": 0.1581, "step": 3912 }, { "epoch": 0.881628974568395, "grad_norm": 0.47520130714031167, "learning_rate": 1.8887793316147373e-07, "loss": 0.1397, "step": 3913 }, { "epoch": 0.8818542822542034, "grad_norm": 0.48093307567540367, "learning_rate": 1.881682401038379e-07, "loss": 0.1369, "step": 3914 }, { "epoch": 0.8820795899400118, "grad_norm": 0.4374287939212524, "learning_rate": 1.8745983072020774e-07, "loss": 0.1315, "step": 3915 }, { "epoch": 0.8823048976258202, "grad_norm": 0.501261525882023, "learning_rate": 1.8675270540393532e-07, "loss": 0.1461, "step": 3916 }, { "epoch": 0.8825302053116287, "grad_norm": 0.4878953885886656, "learning_rate": 1.8604686454766208e-07, "loss": 0.152, "step": 3917 }, { "epoch": 0.8827555129974372, "grad_norm": 0.47618468982397694, "learning_rate": 1.8534230854331454e-07, "loss": 0.1473, "step": 3918 }, { "epoch": 0.8829808206832456, "grad_norm": 0.48255019165112745, "learning_rate": 1.8463903778210612e-07, "loss": 0.1525, "step": 3919 }, { "epoch": 0.883206128369054, "grad_norm": 0.4709200797557006, "learning_rate": 1.8393705265453838e-07, "loss": 0.1445, "step": 3920 }, { "epoch": 0.8834314360548624, "grad_norm": 0.4726612900241033, "learning_rate": 1.832363535503956e-07, "loss": 0.1465, "step": 3921 }, { "epoch": 0.8836567437406708, "grad_norm": 0.4648816875604027, "learning_rate": 1.8253694085875047e-07, "loss": 0.1289, "step": 3922 }, { "epoch": 0.8838820514264792, "grad_norm": 0.49096699990870324, "learning_rate": 1.8183881496796146e-07, "loss": 0.1502, "step": 3923 }, { "epoch": 0.8841073591122878, "grad_norm": 0.5020026232845864, "learning_rate": 1.8114197626567105e-07, "loss": 0.1589, "step": 3924 }, { "epoch": 0.8843326667980962, "grad_norm": 0.5030844197467426, "learning_rate": 1.8044642513880827e-07, "loss": 0.1557, "step": 3925 }, { "epoch": 0.8845579744839046, "grad_norm": 0.48572411012187333, "learning_rate": 1.7975216197358648e-07, "loss": 0.1422, "step": 3926 }, { "epoch": 0.884783282169713, "grad_norm": 0.47185579135933253, "learning_rate": 1.790591871555039e-07, "loss": 0.1424, "step": 3927 }, { "epoch": 0.8850085898555214, "grad_norm": 0.45572123796883646, "learning_rate": 1.7836750106934475e-07, "loss": 0.1333, "step": 3928 }, { "epoch": 0.8852338975413299, "grad_norm": 0.4777374273399248, "learning_rate": 1.776771040991751e-07, "loss": 0.1381, "step": 3929 }, { "epoch": 0.8854592052271383, "grad_norm": 0.49018085298222697, "learning_rate": 1.7698799662834776e-07, "loss": 0.1467, "step": 3930 }, { "epoch": 0.8856845129129467, "grad_norm": 0.47340916213212886, "learning_rate": 1.7630017903949775e-07, "loss": 0.1451, "step": 3931 }, { "epoch": 0.8859098205987552, "grad_norm": 0.49233987363836046, "learning_rate": 1.7561365171454488e-07, "loss": 0.1507, "step": 3932 }, { "epoch": 0.8861351282845636, "grad_norm": 0.44864984855065165, "learning_rate": 1.7492841503469165e-07, "loss": 0.136, "step": 3933 }, { "epoch": 0.8863604359703721, "grad_norm": 0.46498551221740236, "learning_rate": 1.7424446938042517e-07, "loss": 0.1435, "step": 3934 }, { "epoch": 0.8865857436561805, "grad_norm": 0.494552407996217, "learning_rate": 1.7356181513151464e-07, "loss": 0.1347, "step": 3935 }, { "epoch": 0.8868110513419889, "grad_norm": 0.46218453673553506, "learning_rate": 1.7288045266701247e-07, "loss": 0.1271, "step": 3936 }, { "epoch": 0.8870363590277973, "grad_norm": 0.4620631245597858, "learning_rate": 1.7220038236525406e-07, "loss": 0.1411, "step": 3937 }, { "epoch": 0.8872616667136057, "grad_norm": 0.5238585649293778, "learning_rate": 1.7152160460385703e-07, "loss": 0.1372, "step": 3938 }, { "epoch": 0.8874869743994142, "grad_norm": 0.4980800570271065, "learning_rate": 1.7084411975972076e-07, "loss": 0.1514, "step": 3939 }, { "epoch": 0.8877122820852227, "grad_norm": 0.4779022749671553, "learning_rate": 1.701679282090285e-07, "loss": 0.1486, "step": 3940 }, { "epoch": 0.8879375897710311, "grad_norm": 0.4723075786530799, "learning_rate": 1.6949303032724297e-07, "loss": 0.1373, "step": 3941 }, { "epoch": 0.8881628974568395, "grad_norm": 0.4787812903509886, "learning_rate": 1.6881942648911077e-07, "loss": 0.1341, "step": 3942 }, { "epoch": 0.8883882051426479, "grad_norm": 0.49509450884332845, "learning_rate": 1.6814711706865827e-07, "loss": 0.1512, "step": 3943 }, { "epoch": 0.8886135128284564, "grad_norm": 0.48155729793324203, "learning_rate": 1.6747610243919437e-07, "loss": 0.1398, "step": 3944 }, { "epoch": 0.8888388205142648, "grad_norm": 0.4859521054322674, "learning_rate": 1.6680638297330854e-07, "loss": 0.1456, "step": 3945 }, { "epoch": 0.8890641282000732, "grad_norm": 0.4922788181816615, "learning_rate": 1.661379590428705e-07, "loss": 0.1546, "step": 3946 }, { "epoch": 0.8892894358858816, "grad_norm": 0.4720865220150437, "learning_rate": 1.6547083101903173e-07, "loss": 0.1407, "step": 3947 }, { "epoch": 0.8895147435716901, "grad_norm": 0.5004812140336558, "learning_rate": 1.6480499927222283e-07, "loss": 0.1535, "step": 3948 }, { "epoch": 0.8897400512574986, "grad_norm": 0.44759678855803414, "learning_rate": 1.641404641721561e-07, "loss": 0.1288, "step": 3949 }, { "epoch": 0.889965358943307, "grad_norm": 0.48781988498782375, "learning_rate": 1.6347722608782284e-07, "loss": 0.1499, "step": 3950 }, { "epoch": 0.8901906666291154, "grad_norm": 0.4383912218136226, "learning_rate": 1.6281528538749425e-07, "loss": 0.1324, "step": 3951 }, { "epoch": 0.8904159743149238, "grad_norm": 0.4598988414847289, "learning_rate": 1.6215464243872186e-07, "loss": 0.1333, "step": 3952 }, { "epoch": 0.8906412820007322, "grad_norm": 0.5021102204148211, "learning_rate": 1.6149529760833504e-07, "loss": 0.148, "step": 3953 }, { "epoch": 0.8908665896865406, "grad_norm": 0.43661485835748093, "learning_rate": 1.608372512624448e-07, "loss": 0.1243, "step": 3954 }, { "epoch": 0.8910918973723491, "grad_norm": 0.45412666194626644, "learning_rate": 1.6018050376643863e-07, "loss": 0.1348, "step": 3955 }, { "epoch": 0.8913172050581576, "grad_norm": 0.4703275266482927, "learning_rate": 1.595250554849842e-07, "loss": 0.1433, "step": 3956 }, { "epoch": 0.891542512743966, "grad_norm": 0.47470587203113024, "learning_rate": 1.5887090678202793e-07, "loss": 0.1404, "step": 3957 }, { "epoch": 0.8917678204297744, "grad_norm": 0.4754627455351376, "learning_rate": 1.5821805802079343e-07, "loss": 0.141, "step": 3958 }, { "epoch": 0.8919931281155828, "grad_norm": 0.46616519097330594, "learning_rate": 1.5756650956378377e-07, "loss": 0.1485, "step": 3959 }, { "epoch": 0.8922184358013913, "grad_norm": 0.461205924770751, "learning_rate": 1.5691626177277986e-07, "loss": 0.1416, "step": 3960 }, { "epoch": 0.8924437434871997, "grad_norm": 0.47273107411957194, "learning_rate": 1.5626731500883951e-07, "loss": 0.1411, "step": 3961 }, { "epoch": 0.8926690511730081, "grad_norm": 0.48751682609530655, "learning_rate": 1.5561966963229925e-07, "loss": 0.1438, "step": 3962 }, { "epoch": 0.8928943588588165, "grad_norm": 0.46850837700229225, "learning_rate": 1.5497332600277137e-07, "loss": 0.1418, "step": 3963 }, { "epoch": 0.893119666544625, "grad_norm": 0.45741592189608293, "learning_rate": 1.5432828447914743e-07, "loss": 0.143, "step": 3964 }, { "epoch": 0.8933449742304335, "grad_norm": 0.4616383354901837, "learning_rate": 1.5368454541959453e-07, "loss": 0.1324, "step": 3965 }, { "epoch": 0.8935702819162419, "grad_norm": 0.5020609237394296, "learning_rate": 1.5304210918155677e-07, "loss": 0.1537, "step": 3966 }, { "epoch": 0.8937955896020503, "grad_norm": 0.4568905419965394, "learning_rate": 1.524009761217557e-07, "loss": 0.1378, "step": 3967 }, { "epoch": 0.8940208972878587, "grad_norm": 0.46651422878987997, "learning_rate": 1.5176114659618796e-07, "loss": 0.1406, "step": 3968 }, { "epoch": 0.8942462049736671, "grad_norm": 0.4667624257365965, "learning_rate": 1.5112262096012743e-07, "loss": 0.1356, "step": 3969 }, { "epoch": 0.8944715126594756, "grad_norm": 0.46212811202290244, "learning_rate": 1.5048539956812324e-07, "loss": 0.141, "step": 3970 }, { "epoch": 0.8946968203452841, "grad_norm": 0.4854084542243427, "learning_rate": 1.4984948277400074e-07, "loss": 0.1496, "step": 3971 }, { "epoch": 0.8949221280310925, "grad_norm": 0.46644509385921373, "learning_rate": 1.4921487093086134e-07, "loss": 0.1478, "step": 3972 }, { "epoch": 0.8951474357169009, "grad_norm": 0.4593367044330634, "learning_rate": 1.4858156439108097e-07, "loss": 0.1269, "step": 3973 }, { "epoch": 0.8953727434027093, "grad_norm": 0.48517550690122074, "learning_rate": 1.4794956350631106e-07, "loss": 0.15, "step": 3974 }, { "epoch": 0.8955980510885178, "grad_norm": 0.4808142374337991, "learning_rate": 1.473188686274782e-07, "loss": 0.1421, "step": 3975 }, { "epoch": 0.8958233587743262, "grad_norm": 0.49699266447125817, "learning_rate": 1.4668948010478358e-07, "loss": 0.1486, "step": 3976 }, { "epoch": 0.8960486664601346, "grad_norm": 0.4718085033512432, "learning_rate": 1.4606139828770378e-07, "loss": 0.1481, "step": 3977 }, { "epoch": 0.896273974145943, "grad_norm": 0.480493817674567, "learning_rate": 1.4543462352498844e-07, "loss": 0.1378, "step": 3978 }, { "epoch": 0.8964992818317515, "grad_norm": 0.44881325605798783, "learning_rate": 1.448091561646628e-07, "loss": 0.1418, "step": 3979 }, { "epoch": 0.89672458951756, "grad_norm": 0.45855300593285897, "learning_rate": 1.4418499655402512e-07, "loss": 0.1333, "step": 3980 }, { "epoch": 0.8969498972033684, "grad_norm": 0.4667832417601448, "learning_rate": 1.435621450396485e-07, "loss": 0.142, "step": 3981 }, { "epoch": 0.8971752048891768, "grad_norm": 0.4861168968624223, "learning_rate": 1.4294060196737874e-07, "loss": 0.1379, "step": 3982 }, { "epoch": 0.8974005125749852, "grad_norm": 0.5123715647299666, "learning_rate": 1.4232036768233565e-07, "loss": 0.1524, "step": 3983 }, { "epoch": 0.8976258202607936, "grad_norm": 0.46111866378689154, "learning_rate": 1.417014425289126e-07, "loss": 0.1382, "step": 3984 }, { "epoch": 0.897851127946602, "grad_norm": 0.46962766756276614, "learning_rate": 1.4108382685077498e-07, "loss": 0.138, "step": 3985 }, { "epoch": 0.8980764356324105, "grad_norm": 0.4868405824344992, "learning_rate": 1.4046752099086236e-07, "loss": 0.1451, "step": 3986 }, { "epoch": 0.898301743318219, "grad_norm": 0.4836538182472601, "learning_rate": 1.398525252913857e-07, "loss": 0.1465, "step": 3987 }, { "epoch": 0.8985270510040274, "grad_norm": 0.4552388575669367, "learning_rate": 1.3923884009382994e-07, "loss": 0.1383, "step": 3988 }, { "epoch": 0.8987523586898358, "grad_norm": 0.46674175236849347, "learning_rate": 1.3862646573895134e-07, "loss": 0.1465, "step": 3989 }, { "epoch": 0.8989776663756442, "grad_norm": 0.46416844608939184, "learning_rate": 1.380154025667782e-07, "loss": 0.139, "step": 3990 }, { "epoch": 0.8992029740614527, "grad_norm": 0.48341279291397105, "learning_rate": 1.374056509166119e-07, "loss": 0.146, "step": 3991 }, { "epoch": 0.8994282817472611, "grad_norm": 0.44154753930596685, "learning_rate": 1.367972111270241e-07, "loss": 0.1264, "step": 3992 }, { "epoch": 0.8996535894330695, "grad_norm": 0.45677739202857504, "learning_rate": 1.3619008353585873e-07, "loss": 0.1413, "step": 3993 }, { "epoch": 0.8998788971188779, "grad_norm": 0.4915720070809868, "learning_rate": 1.3558426848023165e-07, "loss": 0.1444, "step": 3994 }, { "epoch": 0.9001042048046864, "grad_norm": 0.4613124404503582, "learning_rate": 1.3497976629652882e-07, "loss": 0.1366, "step": 3995 }, { "epoch": 0.9003295124904949, "grad_norm": 0.4866944759203155, "learning_rate": 1.3437657732040783e-07, "loss": 0.1401, "step": 3996 }, { "epoch": 0.9005548201763033, "grad_norm": 0.48683156668560257, "learning_rate": 1.337747018867977e-07, "loss": 0.144, "step": 3997 }, { "epoch": 0.9007801278621117, "grad_norm": 0.4795408253255835, "learning_rate": 1.3317414032989668e-07, "loss": 0.139, "step": 3998 }, { "epoch": 0.9010054355479201, "grad_norm": 0.458705770255175, "learning_rate": 1.3257489298317466e-07, "loss": 0.1388, "step": 3999 }, { "epoch": 0.9012307432337285, "grad_norm": 0.43108938142225783, "learning_rate": 1.3197696017937106e-07, "loss": 0.1278, "step": 4000 }, { "epoch": 0.9012307432337285, "eval_loss": 0.14278066158294678, "eval_runtime": 56.8248, "eval_samples_per_second": 50.506, "eval_steps_per_second": 6.318, "step": 4000 }, { "epoch": 0.901456050919537, "grad_norm": 0.4664469040013792, "learning_rate": 1.3138034225049583e-07, "loss": 0.14, "step": 4001 }, { "epoch": 0.9016813586053454, "grad_norm": 0.4754535001909423, "learning_rate": 1.3078503952782845e-07, "loss": 0.1505, "step": 4002 }, { "epoch": 0.9019066662911539, "grad_norm": 0.4941711804629973, "learning_rate": 1.3019105234191865e-07, "loss": 0.1508, "step": 4003 }, { "epoch": 0.9021319739769623, "grad_norm": 0.47736169945139195, "learning_rate": 1.2959838102258537e-07, "loss": 0.1426, "step": 4004 }, { "epoch": 0.9023572816627707, "grad_norm": 0.4702410812281165, "learning_rate": 1.2900702589891652e-07, "loss": 0.1378, "step": 4005 }, { "epoch": 0.9025825893485792, "grad_norm": 0.4802058798581131, "learning_rate": 1.2841698729927022e-07, "loss": 0.151, "step": 4006 }, { "epoch": 0.9028078970343876, "grad_norm": 0.4718303474580779, "learning_rate": 1.278282655512722e-07, "loss": 0.1477, "step": 4007 }, { "epoch": 0.903033204720196, "grad_norm": 0.44777196172649736, "learning_rate": 1.272408609818182e-07, "loss": 0.1367, "step": 4008 }, { "epoch": 0.9032585124060044, "grad_norm": 0.4410043019443886, "learning_rate": 1.2665477391707203e-07, "loss": 0.1253, "step": 4009 }, { "epoch": 0.9034838200918128, "grad_norm": 0.46387277123794624, "learning_rate": 1.2607000468246533e-07, "loss": 0.1329, "step": 4010 }, { "epoch": 0.9037091277776214, "grad_norm": 0.4694158218919607, "learning_rate": 1.2548655360269974e-07, "loss": 0.1418, "step": 4011 }, { "epoch": 0.9039344354634298, "grad_norm": 0.4774664008686541, "learning_rate": 1.2490442100174278e-07, "loss": 0.1419, "step": 4012 }, { "epoch": 0.9041597431492382, "grad_norm": 0.45399983828562024, "learning_rate": 1.243236072028317e-07, "loss": 0.141, "step": 4013 }, { "epoch": 0.9043850508350466, "grad_norm": 0.4685371261241214, "learning_rate": 1.237441125284708e-07, "loss": 0.1385, "step": 4014 }, { "epoch": 0.904610358520855, "grad_norm": 0.4728214519874054, "learning_rate": 1.2316593730043154e-07, "loss": 0.1494, "step": 4015 }, { "epoch": 0.9048356662066634, "grad_norm": 0.46208919509274266, "learning_rate": 1.2258908183975322e-07, "loss": 0.1321, "step": 4016 }, { "epoch": 0.9050609738924719, "grad_norm": 0.4586787135362811, "learning_rate": 1.2201354646674212e-07, "loss": 0.1317, "step": 4017 }, { "epoch": 0.9052862815782804, "grad_norm": 0.4856955819035042, "learning_rate": 1.2143933150097154e-07, "loss": 0.1462, "step": 4018 }, { "epoch": 0.9055115892640888, "grad_norm": 0.4708554386279845, "learning_rate": 1.2086643726128194e-07, "loss": 0.1355, "step": 4019 }, { "epoch": 0.9057368969498972, "grad_norm": 0.4975683906389299, "learning_rate": 1.2029486406577972e-07, "loss": 0.1393, "step": 4020 }, { "epoch": 0.9059622046357056, "grad_norm": 0.471183068628529, "learning_rate": 1.1972461223183878e-07, "loss": 0.1412, "step": 4021 }, { "epoch": 0.9061875123215141, "grad_norm": 0.4920568295014266, "learning_rate": 1.191556820760978e-07, "loss": 0.1517, "step": 4022 }, { "epoch": 0.9064128200073225, "grad_norm": 0.48367463833210245, "learning_rate": 1.1858807391446319e-07, "loss": 0.1405, "step": 4023 }, { "epoch": 0.9066381276931309, "grad_norm": 0.5003956217619391, "learning_rate": 1.1802178806210624e-07, "loss": 0.1469, "step": 4024 }, { "epoch": 0.9068634353789393, "grad_norm": 0.4810574594091294, "learning_rate": 1.1745682483346454e-07, "loss": 0.1477, "step": 4025 }, { "epoch": 0.9070887430647478, "grad_norm": 0.4531146593689317, "learning_rate": 1.1689318454224191e-07, "loss": 0.1296, "step": 4026 }, { "epoch": 0.9073140507505563, "grad_norm": 0.4575572613766583, "learning_rate": 1.1633086750140521e-07, "loss": 0.1261, "step": 4027 }, { "epoch": 0.9075393584363647, "grad_norm": 0.48780587471832954, "learning_rate": 1.1576987402318884e-07, "loss": 0.1459, "step": 4028 }, { "epoch": 0.9077646661221731, "grad_norm": 0.46528623662642415, "learning_rate": 1.1521020441909226e-07, "loss": 0.1351, "step": 4029 }, { "epoch": 0.9079899738079815, "grad_norm": 0.4966456351879077, "learning_rate": 1.1465185899987797e-07, "loss": 0.1479, "step": 4030 }, { "epoch": 0.9082152814937899, "grad_norm": 0.47403911627624085, "learning_rate": 1.140948380755752e-07, "loss": 0.1459, "step": 4031 }, { "epoch": 0.9084405891795984, "grad_norm": 0.48887357002467957, "learning_rate": 1.1353914195547655e-07, "loss": 0.1445, "step": 4032 }, { "epoch": 0.9086658968654068, "grad_norm": 0.43742134361709295, "learning_rate": 1.1298477094813965e-07, "loss": 0.1282, "step": 4033 }, { "epoch": 0.9088912045512153, "grad_norm": 0.5078126004993612, "learning_rate": 1.1243172536138547e-07, "loss": 0.1539, "step": 4034 }, { "epoch": 0.9091165122370237, "grad_norm": 0.4945032415866532, "learning_rate": 1.1188000550230005e-07, "loss": 0.1426, "step": 4035 }, { "epoch": 0.9093418199228321, "grad_norm": 0.496800918331347, "learning_rate": 1.1132961167723305e-07, "loss": 0.1532, "step": 4036 }, { "epoch": 0.9095671276086406, "grad_norm": 0.48007976714054434, "learning_rate": 1.1078054419179724e-07, "loss": 0.15, "step": 4037 }, { "epoch": 0.909792435294449, "grad_norm": 0.47821352867896594, "learning_rate": 1.1023280335086956e-07, "loss": 0.1441, "step": 4038 }, { "epoch": 0.9100177429802574, "grad_norm": 0.503025431983095, "learning_rate": 1.0968638945858978e-07, "loss": 0.1452, "step": 4039 }, { "epoch": 0.9102430506660658, "grad_norm": 0.4960063821194838, "learning_rate": 1.091413028183616e-07, "loss": 0.1573, "step": 4040 }, { "epoch": 0.9104683583518742, "grad_norm": 0.4669980833464851, "learning_rate": 1.0859754373285125e-07, "loss": 0.1506, "step": 4041 }, { "epoch": 0.9106936660376828, "grad_norm": 0.46358731915184515, "learning_rate": 1.0805511250398748e-07, "loss": 0.137, "step": 4042 }, { "epoch": 0.9109189737234912, "grad_norm": 0.46194698650882016, "learning_rate": 1.0751400943296269e-07, "loss": 0.1361, "step": 4043 }, { "epoch": 0.9111442814092996, "grad_norm": 0.4797787938586833, "learning_rate": 1.06974234820231e-07, "loss": 0.1326, "step": 4044 }, { "epoch": 0.911369589095108, "grad_norm": 0.45647525029825825, "learning_rate": 1.0643578896550877e-07, "loss": 0.1399, "step": 4045 }, { "epoch": 0.9115948967809164, "grad_norm": 0.4769578133095442, "learning_rate": 1.0589867216777544e-07, "loss": 0.1483, "step": 4046 }, { "epoch": 0.9118202044667248, "grad_norm": 0.47151532859509643, "learning_rate": 1.0536288472527162e-07, "loss": 0.14, "step": 4047 }, { "epoch": 0.9120455121525333, "grad_norm": 0.47164880427720157, "learning_rate": 1.0482842693550044e-07, "loss": 0.1369, "step": 4048 }, { "epoch": 0.9122708198383417, "grad_norm": 0.450133586818689, "learning_rate": 1.042952990952259e-07, "loss": 0.1373, "step": 4049 }, { "epoch": 0.9124961275241502, "grad_norm": 0.5002594849950737, "learning_rate": 1.0376350150047427e-07, "loss": 0.1487, "step": 4050 }, { "epoch": 0.9127214352099586, "grad_norm": 0.45280315659636944, "learning_rate": 1.032330344465332e-07, "loss": 0.1336, "step": 4051 }, { "epoch": 0.912946742895767, "grad_norm": 0.49636257095046055, "learning_rate": 1.0270389822795073e-07, "loss": 0.1394, "step": 4052 }, { "epoch": 0.9131720505815755, "grad_norm": 0.4662242703415579, "learning_rate": 1.0217609313853738e-07, "loss": 0.1396, "step": 4053 }, { "epoch": 0.9133973582673839, "grad_norm": 0.4626581189870639, "learning_rate": 1.0164961947136232e-07, "loss": 0.1399, "step": 4054 }, { "epoch": 0.9136226659531923, "grad_norm": 0.4780719271605745, "learning_rate": 1.0112447751875809e-07, "loss": 0.1474, "step": 4055 }, { "epoch": 0.9138479736390007, "grad_norm": 0.4927695082239918, "learning_rate": 1.0060066757231535e-07, "loss": 0.1347, "step": 4056 }, { "epoch": 0.9140732813248091, "grad_norm": 0.4538682665106712, "learning_rate": 1.0007818992288671e-07, "loss": 0.1287, "step": 4057 }, { "epoch": 0.9142985890106177, "grad_norm": 0.45836132297611115, "learning_rate": 9.955704486058482e-08, "loss": 0.1377, "step": 4058 }, { "epoch": 0.9145238966964261, "grad_norm": 0.46646988612796003, "learning_rate": 9.903723267478154e-08, "loss": 0.1278, "step": 4059 }, { "epoch": 0.9147492043822345, "grad_norm": 0.4735433165512558, "learning_rate": 9.85187536541099e-08, "loss": 0.1333, "step": 4060 }, { "epoch": 0.9149745120680429, "grad_norm": 0.46501747980793207, "learning_rate": 9.800160808646154e-08, "loss": 0.1339, "step": 4061 }, { "epoch": 0.9151998197538513, "grad_norm": 0.4735027259975002, "learning_rate": 9.748579625898758e-08, "loss": 0.1399, "step": 4062 }, { "epoch": 0.9154251274396598, "grad_norm": 0.5016523883899949, "learning_rate": 9.697131845810032e-08, "loss": 0.148, "step": 4063 }, { "epoch": 0.9156504351254682, "grad_norm": 0.4585294421782351, "learning_rate": 9.645817496946902e-08, "loss": 0.1382, "step": 4064 }, { "epoch": 0.9158757428112767, "grad_norm": 0.49296755886219573, "learning_rate": 9.594636607802355e-08, "loss": 0.1529, "step": 4065 }, { "epoch": 0.9161010504970851, "grad_norm": 0.4840871686662579, "learning_rate": 9.54358920679524e-08, "loss": 0.1504, "step": 4066 }, { "epoch": 0.9163263581828935, "grad_norm": 0.4673411493667281, "learning_rate": 9.492675322270273e-08, "loss": 0.1364, "step": 4067 }, { "epoch": 0.916551665868702, "grad_norm": 0.45852533144391544, "learning_rate": 9.441894982498035e-08, "loss": 0.1248, "step": 4068 }, { "epoch": 0.9167769735545104, "grad_norm": 0.5194322131946961, "learning_rate": 9.391248215674942e-08, "loss": 0.1576, "step": 4069 }, { "epoch": 0.9170022812403188, "grad_norm": 0.4841221144035149, "learning_rate": 9.340735049923277e-08, "loss": 0.1455, "step": 4070 }, { "epoch": 0.9172275889261272, "grad_norm": 0.43263175772768625, "learning_rate": 9.290355513291105e-08, "loss": 0.1208, "step": 4071 }, { "epoch": 0.9174528966119356, "grad_norm": 0.4527159715269973, "learning_rate": 9.24010963375227e-08, "loss": 0.1368, "step": 4072 }, { "epoch": 0.9176782042977442, "grad_norm": 0.5077271246523941, "learning_rate": 9.189997439206538e-08, "loss": 0.164, "step": 4073 }, { "epoch": 0.9179035119835526, "grad_norm": 0.46080076191265873, "learning_rate": 9.140018957479236e-08, "loss": 0.1294, "step": 4074 }, { "epoch": 0.918128819669361, "grad_norm": 0.46911421219992144, "learning_rate": 9.090174216321607e-08, "loss": 0.1381, "step": 4075 }, { "epoch": 0.9183541273551694, "grad_norm": 0.504179840271321, "learning_rate": 9.040463243410541e-08, "loss": 0.1602, "step": 4076 }, { "epoch": 0.9185794350409778, "grad_norm": 0.4794307845268256, "learning_rate": 8.990886066348764e-08, "loss": 0.1312, "step": 4077 }, { "epoch": 0.9188047427267862, "grad_norm": 0.49026049223089996, "learning_rate": 8.941442712664561e-08, "loss": 0.1452, "step": 4078 }, { "epoch": 0.9190300504125947, "grad_norm": 0.49867226746965787, "learning_rate": 8.892133209811971e-08, "loss": 0.1459, "step": 4079 }, { "epoch": 0.9192553580984031, "grad_norm": 0.46105305653005, "learning_rate": 8.842957585170814e-08, "loss": 0.1409, "step": 4080 }, { "epoch": 0.9194806657842116, "grad_norm": 0.45755223484497676, "learning_rate": 8.79391586604636e-08, "loss": 0.1317, "step": 4081 }, { "epoch": 0.91970597347002, "grad_norm": 0.4910936827240193, "learning_rate": 8.745008079669742e-08, "loss": 0.153, "step": 4082 }, { "epoch": 0.9199312811558285, "grad_norm": 0.4776348509081474, "learning_rate": 8.696234253197599e-08, "loss": 0.1509, "step": 4083 }, { "epoch": 0.9201565888416369, "grad_norm": 0.4892957646542491, "learning_rate": 8.647594413712212e-08, "loss": 0.1449, "step": 4084 }, { "epoch": 0.9203818965274453, "grad_norm": 0.46500853465974756, "learning_rate": 8.599088588221504e-08, "loss": 0.1334, "step": 4085 }, { "epoch": 0.9206072042132537, "grad_norm": 0.4729412778091547, "learning_rate": 8.550716803658904e-08, "loss": 0.1425, "step": 4086 }, { "epoch": 0.9208325118990621, "grad_norm": 0.4941003039455095, "learning_rate": 8.502479086883481e-08, "loss": 0.1575, "step": 4087 }, { "epoch": 0.9210578195848705, "grad_norm": 0.4798745839790279, "learning_rate": 8.454375464679865e-08, "loss": 0.1542, "step": 4088 }, { "epoch": 0.9212831272706791, "grad_norm": 0.4965004585488724, "learning_rate": 8.406405963758162e-08, "loss": 0.1532, "step": 4089 }, { "epoch": 0.9215084349564875, "grad_norm": 0.46374661519707955, "learning_rate": 8.358570610754097e-08, "loss": 0.1438, "step": 4090 }, { "epoch": 0.9217337426422959, "grad_norm": 0.4948131528817748, "learning_rate": 8.310869432228808e-08, "loss": 0.1488, "step": 4091 }, { "epoch": 0.9219590503281043, "grad_norm": 0.453044774452747, "learning_rate": 8.263302454669025e-08, "loss": 0.1318, "step": 4092 }, { "epoch": 0.9221843580139127, "grad_norm": 0.48266271674169603, "learning_rate": 8.215869704486873e-08, "loss": 0.1493, "step": 4093 }, { "epoch": 0.9224096656997212, "grad_norm": 0.4585831905667495, "learning_rate": 8.168571208020032e-08, "loss": 0.1372, "step": 4094 }, { "epoch": 0.9226349733855296, "grad_norm": 0.4866299635039302, "learning_rate": 8.121406991531577e-08, "loss": 0.1455, "step": 4095 }, { "epoch": 0.922860281071338, "grad_norm": 0.4478029123064623, "learning_rate": 8.074377081210033e-08, "loss": 0.1312, "step": 4096 }, { "epoch": 0.9230855887571465, "grad_norm": 0.4665798318007941, "learning_rate": 8.027481503169371e-08, "loss": 0.1432, "step": 4097 }, { "epoch": 0.9233108964429549, "grad_norm": 0.4529312789736691, "learning_rate": 7.980720283448957e-08, "loss": 0.1324, "step": 4098 }, { "epoch": 0.9235362041287634, "grad_norm": 0.49127056991795576, "learning_rate": 7.934093448013492e-08, "loss": 0.1518, "step": 4099 }, { "epoch": 0.9237615118145718, "grad_norm": 0.47449397767122997, "learning_rate": 7.887601022753238e-08, "loss": 0.1344, "step": 4100 }, { "epoch": 0.9239868195003802, "grad_norm": 0.45542770853975467, "learning_rate": 7.841243033483575e-08, "loss": 0.1261, "step": 4101 }, { "epoch": 0.9242121271861886, "grad_norm": 0.4490597747023126, "learning_rate": 7.795019505945495e-08, "loss": 0.138, "step": 4102 }, { "epoch": 0.924437434871997, "grad_norm": 0.4777794405361795, "learning_rate": 7.748930465805105e-08, "loss": 0.1442, "step": 4103 }, { "epoch": 0.9246627425578055, "grad_norm": 0.4782645372300299, "learning_rate": 7.702975938653934e-08, "loss": 0.1453, "step": 4104 }, { "epoch": 0.924888050243614, "grad_norm": 0.4797534525418542, "learning_rate": 7.657155950008904e-08, "loss": 0.1422, "step": 4105 }, { "epoch": 0.9251133579294224, "grad_norm": 0.4859477729123614, "learning_rate": 7.611470525312054e-08, "loss": 0.1422, "step": 4106 }, { "epoch": 0.9253386656152308, "grad_norm": 0.47914845153238234, "learning_rate": 7.565919689930839e-08, "loss": 0.1455, "step": 4107 }, { "epoch": 0.9255639733010392, "grad_norm": 0.462832667536284, "learning_rate": 7.520503469157947e-08, "loss": 0.1348, "step": 4108 }, { "epoch": 0.9257892809868477, "grad_norm": 0.47049952129817785, "learning_rate": 7.47522188821126e-08, "loss": 0.1407, "step": 4109 }, { "epoch": 0.9260145886726561, "grad_norm": 0.44762021394675944, "learning_rate": 7.430074972234053e-08, "loss": 0.1318, "step": 4110 }, { "epoch": 0.9262398963584645, "grad_norm": 0.45952545767286157, "learning_rate": 7.385062746294608e-08, "loss": 0.134, "step": 4111 }, { "epoch": 0.9264652040442729, "grad_norm": 0.43848643342546195, "learning_rate": 7.340185235386627e-08, "loss": 0.1266, "step": 4112 }, { "epoch": 0.9266905117300814, "grad_norm": 0.48233284878708355, "learning_rate": 7.29544246442887e-08, "loss": 0.1438, "step": 4113 }, { "epoch": 0.9269158194158899, "grad_norm": 0.48040723283418874, "learning_rate": 7.250834458265355e-08, "loss": 0.1354, "step": 4114 }, { "epoch": 0.9271411271016983, "grad_norm": 0.47447604347591066, "learning_rate": 7.206361241665266e-08, "loss": 0.1392, "step": 4115 }, { "epoch": 0.9273664347875067, "grad_norm": 0.46371914947128257, "learning_rate": 7.162022839322824e-08, "loss": 0.132, "step": 4116 }, { "epoch": 0.9275917424733151, "grad_norm": 0.47355152598312406, "learning_rate": 7.117819275857613e-08, "loss": 0.136, "step": 4117 }, { "epoch": 0.9278170501591235, "grad_norm": 0.4928462672275944, "learning_rate": 7.073750575814136e-08, "loss": 0.1492, "step": 4118 }, { "epoch": 0.9280423578449319, "grad_norm": 0.47060721385062093, "learning_rate": 7.029816763662129e-08, "loss": 0.1392, "step": 4119 }, { "epoch": 0.9282676655307405, "grad_norm": 0.47917164908363497, "learning_rate": 6.986017863796435e-08, "loss": 0.1307, "step": 4120 }, { "epoch": 0.9284929732165489, "grad_norm": 0.48465172787325095, "learning_rate": 6.94235390053688e-08, "loss": 0.139, "step": 4121 }, { "epoch": 0.9287182809023573, "grad_norm": 0.481611750105736, "learning_rate": 6.898824898128515e-08, "loss": 0.1363, "step": 4122 }, { "epoch": 0.9289435885881657, "grad_norm": 0.4807778024489483, "learning_rate": 6.85543088074131e-08, "loss": 0.1356, "step": 4123 }, { "epoch": 0.9291688962739741, "grad_norm": 0.49271889080241804, "learning_rate": 6.81217187247038e-08, "loss": 0.1464, "step": 4124 }, { "epoch": 0.9293942039597826, "grad_norm": 0.49943470735311984, "learning_rate": 6.769047897335818e-08, "loss": 0.1563, "step": 4125 }, { "epoch": 0.929619511645591, "grad_norm": 0.4562498902244851, "learning_rate": 6.726058979282774e-08, "loss": 0.1277, "step": 4126 }, { "epoch": 0.9298448193313994, "grad_norm": 0.47040935352758834, "learning_rate": 6.683205142181404e-08, "loss": 0.1345, "step": 4127 }, { "epoch": 0.9300701270172079, "grad_norm": 0.48800656067496256, "learning_rate": 6.640486409826785e-08, "loss": 0.1426, "step": 4128 }, { "epoch": 0.9302954347030163, "grad_norm": 0.4473344699309852, "learning_rate": 6.597902805939138e-08, "loss": 0.1365, "step": 4129 }, { "epoch": 0.9305207423888248, "grad_norm": 0.4979933517833155, "learning_rate": 6.555454354163437e-08, "loss": 0.1484, "step": 4130 }, { "epoch": 0.9307460500746332, "grad_norm": 0.4517633354010443, "learning_rate": 6.513141078069828e-08, "loss": 0.1293, "step": 4131 }, { "epoch": 0.9309713577604416, "grad_norm": 0.4946926565291331, "learning_rate": 6.470963001153268e-08, "loss": 0.1537, "step": 4132 }, { "epoch": 0.93119666544625, "grad_norm": 0.48974509519757004, "learning_rate": 6.428920146833606e-08, "loss": 0.1462, "step": 4133 }, { "epoch": 0.9314219731320584, "grad_norm": 0.4761209450780214, "learning_rate": 6.387012538455723e-08, "loss": 0.1517, "step": 4134 }, { "epoch": 0.9316472808178669, "grad_norm": 0.4706955427126162, "learning_rate": 6.345240199289365e-08, "loss": 0.1438, "step": 4135 }, { "epoch": 0.9318725885036754, "grad_norm": 0.46452407200738693, "learning_rate": 6.303603152529119e-08, "loss": 0.1408, "step": 4136 }, { "epoch": 0.9320978961894838, "grad_norm": 0.46541018057868117, "learning_rate": 6.262101421294547e-08, "loss": 0.1343, "step": 4137 }, { "epoch": 0.9323232038752922, "grad_norm": 0.47098480181557906, "learning_rate": 6.220735028629937e-08, "loss": 0.1327, "step": 4138 }, { "epoch": 0.9325485115611006, "grad_norm": 0.4755568983527909, "learning_rate": 6.179503997504554e-08, "loss": 0.1395, "step": 4139 }, { "epoch": 0.932773819246909, "grad_norm": 0.4815905104832014, "learning_rate": 6.13840835081242e-08, "loss": 0.1365, "step": 4140 }, { "epoch": 0.9329991269327175, "grad_norm": 0.4838954462320816, "learning_rate": 6.097448111372446e-08, "loss": 0.15, "step": 4141 }, { "epoch": 0.9332244346185259, "grad_norm": 0.4857125082783464, "learning_rate": 6.056623301928327e-08, "loss": 0.149, "step": 4142 }, { "epoch": 0.9334497423043343, "grad_norm": 0.45053349895739203, "learning_rate": 6.015933945148517e-08, "loss": 0.1308, "step": 4143 }, { "epoch": 0.9336750499901428, "grad_norm": 0.48660458250447647, "learning_rate": 5.975380063626356e-08, "loss": 0.1425, "step": 4144 }, { "epoch": 0.9339003576759513, "grad_norm": 0.47940782085921785, "learning_rate": 5.934961679879836e-08, "loss": 0.1411, "step": 4145 }, { "epoch": 0.9341256653617597, "grad_norm": 0.44987270917047917, "learning_rate": 5.894678816351862e-08, "loss": 0.133, "step": 4146 }, { "epoch": 0.9343509730475681, "grad_norm": 0.48414334794038866, "learning_rate": 5.854531495409932e-08, "loss": 0.145, "step": 4147 }, { "epoch": 0.9345762807333765, "grad_norm": 0.4639549974078448, "learning_rate": 5.8145197393463806e-08, "loss": 0.14, "step": 4148 }, { "epoch": 0.9348015884191849, "grad_norm": 0.45948529746860883, "learning_rate": 5.774643570378296e-08, "loss": 0.1287, "step": 4149 }, { "epoch": 0.9350268961049933, "grad_norm": 0.47429997344838176, "learning_rate": 5.73490301064733e-08, "loss": 0.1367, "step": 4150 }, { "epoch": 0.9352522037908018, "grad_norm": 0.48066514481237554, "learning_rate": 5.695298082219997e-08, "loss": 0.1521, "step": 4151 }, { "epoch": 0.9354775114766103, "grad_norm": 0.46185271302563835, "learning_rate": 5.6558288070874544e-08, "loss": 0.1349, "step": 4152 }, { "epoch": 0.9357028191624187, "grad_norm": 0.4992867840736411, "learning_rate": 5.616495207165451e-08, "loss": 0.1562, "step": 4153 }, { "epoch": 0.9359281268482271, "grad_norm": 0.48381812287783943, "learning_rate": 5.577297304294543e-08, "loss": 0.142, "step": 4154 }, { "epoch": 0.9361534345340355, "grad_norm": 0.46829283932499904, "learning_rate": 5.538235120239821e-08, "loss": 0.1339, "step": 4155 }, { "epoch": 0.936378742219844, "grad_norm": 0.4734966918946892, "learning_rate": 5.4993086766910733e-08, "loss": 0.1383, "step": 4156 }, { "epoch": 0.9366040499056524, "grad_norm": 0.4877136325551203, "learning_rate": 5.460517995262704e-08, "loss": 0.1456, "step": 4157 }, { "epoch": 0.9368293575914608, "grad_norm": 0.47711868327825846, "learning_rate": 5.421863097493707e-08, "loss": 0.1433, "step": 4158 }, { "epoch": 0.9370546652772692, "grad_norm": 0.47882405572711156, "learning_rate": 5.383344004847774e-08, "loss": 0.147, "step": 4159 }, { "epoch": 0.9372799729630777, "grad_norm": 0.4542593431360111, "learning_rate": 5.344960738713018e-08, "loss": 0.1303, "step": 4160 }, { "epoch": 0.9375052806488862, "grad_norm": 0.49811599304780374, "learning_rate": 5.3067133204023344e-08, "loss": 0.1563, "step": 4161 }, { "epoch": 0.9377305883346946, "grad_norm": 0.45144422788710814, "learning_rate": 5.268601771153042e-08, "loss": 0.1374, "step": 4162 }, { "epoch": 0.937955896020503, "grad_norm": 0.4755448216457163, "learning_rate": 5.230626112127046e-08, "loss": 0.1341, "step": 4163 }, { "epoch": 0.9381812037063114, "grad_norm": 0.4666514696365624, "learning_rate": 5.192786364410868e-08, "loss": 0.139, "step": 4164 }, { "epoch": 0.9384065113921198, "grad_norm": 0.44495670514922453, "learning_rate": 5.15508254901545e-08, "loss": 0.132, "step": 4165 }, { "epoch": 0.9386318190779283, "grad_norm": 0.494033372148409, "learning_rate": 5.117514686876379e-08, "loss": 0.1376, "step": 4166 }, { "epoch": 0.9388571267637368, "grad_norm": 0.47118120206769226, "learning_rate": 5.080082798853664e-08, "loss": 0.1416, "step": 4167 }, { "epoch": 0.9390824344495452, "grad_norm": 0.47736361166036123, "learning_rate": 5.0427869057317894e-08, "loss": 0.1435, "step": 4168 }, { "epoch": 0.9393077421353536, "grad_norm": 0.47639039780312875, "learning_rate": 5.0056270282198286e-08, "loss": 0.1429, "step": 4169 }, { "epoch": 0.939533049821162, "grad_norm": 0.4860936697164675, "learning_rate": 4.9686031869512486e-08, "loss": 0.1416, "step": 4170 }, { "epoch": 0.9397583575069705, "grad_norm": 0.482815171118602, "learning_rate": 4.93171540248405e-08, "loss": 0.1353, "step": 4171 }, { "epoch": 0.9399836651927789, "grad_norm": 0.5103137344503779, "learning_rate": 4.89496369530057e-08, "loss": 0.155, "step": 4172 }, { "epoch": 0.9402089728785873, "grad_norm": 0.4838258789983965, "learning_rate": 4.858348085807735e-08, "loss": 0.1422, "step": 4173 }, { "epoch": 0.9404342805643957, "grad_norm": 0.4851697238429523, "learning_rate": 4.8218685943368094e-08, "loss": 0.1495, "step": 4174 }, { "epoch": 0.9406595882502042, "grad_norm": 0.49732599987854065, "learning_rate": 4.7855252411434516e-08, "loss": 0.1492, "step": 4175 }, { "epoch": 0.9408848959360127, "grad_norm": 0.4717565593998016, "learning_rate": 4.7493180464078246e-08, "loss": 0.1388, "step": 4176 }, { "epoch": 0.9411102036218211, "grad_norm": 0.4664459236488136, "learning_rate": 4.713247030234402e-08, "loss": 0.1425, "step": 4177 }, { "epoch": 0.9413355113076295, "grad_norm": 0.4811319686765736, "learning_rate": 4.677312212652108e-08, "loss": 0.1385, "step": 4178 }, { "epoch": 0.9415608189934379, "grad_norm": 0.48387282838605733, "learning_rate": 4.641513613614174e-08, "loss": 0.1427, "step": 4179 }, { "epoch": 0.9417861266792463, "grad_norm": 0.48546839650306356, "learning_rate": 4.605851252998256e-08, "loss": 0.1429, "step": 4180 }, { "epoch": 0.9420114343650547, "grad_norm": 0.49348764821190694, "learning_rate": 4.570325150606292e-08, "loss": 0.1404, "step": 4181 }, { "epoch": 0.9422367420508632, "grad_norm": 0.47553158908716175, "learning_rate": 4.5349353261646414e-08, "loss": 0.1402, "step": 4182 }, { "epoch": 0.9424620497366717, "grad_norm": 0.43968316669747853, "learning_rate": 4.4996817993239464e-08, "loss": 0.122, "step": 4183 }, { "epoch": 0.9426873574224801, "grad_norm": 0.48967752450494284, "learning_rate": 4.464564589659187e-08, "loss": 0.1517, "step": 4184 }, { "epoch": 0.9429126651082885, "grad_norm": 0.484849385831537, "learning_rate": 4.4295837166696e-08, "loss": 0.1415, "step": 4185 }, { "epoch": 0.943137972794097, "grad_norm": 0.48741048696615075, "learning_rate": 4.3947391997787857e-08, "loss": 0.1493, "step": 4186 }, { "epoch": 0.9433632804799054, "grad_norm": 0.48746792079410006, "learning_rate": 4.360031058334602e-08, "loss": 0.1549, "step": 4187 }, { "epoch": 0.9435885881657138, "grad_norm": 0.47040559458568837, "learning_rate": 4.325459311609187e-08, "loss": 0.1443, "step": 4188 }, { "epoch": 0.9438138958515222, "grad_norm": 0.4896748350854861, "learning_rate": 4.291023978798964e-08, "loss": 0.1492, "step": 4189 }, { "epoch": 0.9440392035373306, "grad_norm": 0.5119548403533473, "learning_rate": 4.256725079024554e-08, "loss": 0.1504, "step": 4190 }, { "epoch": 0.9442645112231391, "grad_norm": 0.48365987538100963, "learning_rate": 4.22256263133089e-08, "loss": 0.1449, "step": 4191 }, { "epoch": 0.9444898189089476, "grad_norm": 0.4737091419191897, "learning_rate": 4.1885366546870754e-08, "loss": 0.145, "step": 4192 }, { "epoch": 0.944715126594756, "grad_norm": 0.49621450405580125, "learning_rate": 4.1546471679864975e-08, "loss": 0.1494, "step": 4193 }, { "epoch": 0.9449404342805644, "grad_norm": 0.4610915608131055, "learning_rate": 4.120894190046687e-08, "loss": 0.1307, "step": 4194 }, { "epoch": 0.9451657419663728, "grad_norm": 0.4546180043565135, "learning_rate": 4.087277739609458e-08, "loss": 0.1289, "step": 4195 }, { "epoch": 0.9453910496521812, "grad_norm": 0.4732411285427456, "learning_rate": 4.053797835340739e-08, "loss": 0.1425, "step": 4196 }, { "epoch": 0.9456163573379897, "grad_norm": 0.4353493843568312, "learning_rate": 4.020454495830689e-08, "loss": 0.1247, "step": 4197 }, { "epoch": 0.9458416650237981, "grad_norm": 0.4696346129690689, "learning_rate": 3.987247739593636e-08, "loss": 0.1456, "step": 4198 }, { "epoch": 0.9460669727096066, "grad_norm": 0.4470804636692044, "learning_rate": 3.9541775850679975e-08, "loss": 0.1329, "step": 4199 }, { "epoch": 0.946292280395415, "grad_norm": 0.45593946874537, "learning_rate": 3.9212440506164465e-08, "loss": 0.1372, "step": 4200 }, { "epoch": 0.9465175880812234, "grad_norm": 0.4724120173559638, "learning_rate": 3.888447154525771e-08, "loss": 0.1496, "step": 4201 }, { "epoch": 0.9467428957670319, "grad_norm": 0.4618892258266053, "learning_rate": 3.855786915006793e-08, "loss": 0.134, "step": 4202 }, { "epoch": 0.9469682034528403, "grad_norm": 0.44804853648087367, "learning_rate": 3.8232633501945896e-08, "loss": 0.1244, "step": 4203 }, { "epoch": 0.9471935111386487, "grad_norm": 0.4958248010923297, "learning_rate": 3.790876478148242e-08, "loss": 0.148, "step": 4204 }, { "epoch": 0.9474188188244571, "grad_norm": 0.4892013513348879, "learning_rate": 3.758626316850977e-08, "loss": 0.1513, "step": 4205 }, { "epoch": 0.9476441265102655, "grad_norm": 0.5008552166510983, "learning_rate": 3.726512884210165e-08, "loss": 0.1474, "step": 4206 }, { "epoch": 0.9478694341960741, "grad_norm": 0.46831375179856266, "learning_rate": 3.694536198057097e-08, "loss": 0.1344, "step": 4207 }, { "epoch": 0.9480947418818825, "grad_norm": 0.47748164992826336, "learning_rate": 3.6626962761473205e-08, "loss": 0.1448, "step": 4208 }, { "epoch": 0.9483200495676909, "grad_norm": 0.4695762282250701, "learning_rate": 3.630993136160332e-08, "loss": 0.1285, "step": 4209 }, { "epoch": 0.9485453572534993, "grad_norm": 0.49116182161803307, "learning_rate": 3.599426795699662e-08, "loss": 0.1493, "step": 4210 }, { "epoch": 0.9487706649393077, "grad_norm": 0.47400771480393467, "learning_rate": 3.567997272293011e-08, "loss": 0.1499, "step": 4211 }, { "epoch": 0.9489959726251161, "grad_norm": 0.4669982124202751, "learning_rate": 3.53670458339192e-08, "loss": 0.1411, "step": 4212 }, { "epoch": 0.9492212803109246, "grad_norm": 0.4546208294463335, "learning_rate": 3.505548746372128e-08, "loss": 0.1337, "step": 4213 }, { "epoch": 0.9494465879967331, "grad_norm": 0.4781864888823631, "learning_rate": 3.474529778533298e-08, "loss": 0.1428, "step": 4214 }, { "epoch": 0.9496718956825415, "grad_norm": 0.4628342120450911, "learning_rate": 3.443647697099067e-08, "loss": 0.1377, "step": 4215 }, { "epoch": 0.9498972033683499, "grad_norm": 0.45698200368309266, "learning_rate": 3.412902519217137e-08, "loss": 0.1364, "step": 4216 }, { "epoch": 0.9501225110541583, "grad_norm": 0.47121234535073564, "learning_rate": 3.382294261959157e-08, "loss": 0.1397, "step": 4217 }, { "epoch": 0.9503478187399668, "grad_norm": 0.50536781568685, "learning_rate": 3.351822942320754e-08, "loss": 0.1491, "step": 4218 }, { "epoch": 0.9505731264257752, "grad_norm": 0.5033394410768004, "learning_rate": 3.3214885772215046e-08, "loss": 0.1483, "step": 4219 }, { "epoch": 0.9507984341115836, "grad_norm": 0.460921227692927, "learning_rate": 3.2912911835049634e-08, "loss": 0.1339, "step": 4220 }, { "epoch": 0.951023741797392, "grad_norm": 0.48383243993925323, "learning_rate": 3.261230777938607e-08, "loss": 0.1532, "step": 4221 }, { "epoch": 0.9512490494832005, "grad_norm": 0.4984808680944678, "learning_rate": 3.231307377213833e-08, "loss": 0.1562, "step": 4222 }, { "epoch": 0.951474357169009, "grad_norm": 0.5030971544427753, "learning_rate": 3.201520997946045e-08, "loss": 0.1528, "step": 4223 }, { "epoch": 0.9516996648548174, "grad_norm": 0.4521382543613778, "learning_rate": 3.171871656674458e-08, "loss": 0.1294, "step": 4224 }, { "epoch": 0.9519249725406258, "grad_norm": 0.4621539139022736, "learning_rate": 3.142359369862291e-08, "loss": 0.1353, "step": 4225 }, { "epoch": 0.9521502802264342, "grad_norm": 0.4968241853667539, "learning_rate": 3.112984153896603e-08, "loss": 0.1627, "step": 4226 }, { "epoch": 0.9523755879122426, "grad_norm": 0.4835846870993006, "learning_rate": 3.0837460250883186e-08, "loss": 0.1453, "step": 4227 }, { "epoch": 0.9526008955980511, "grad_norm": 0.49806221123746486, "learning_rate": 3.0546449996723404e-08, "loss": 0.1445, "step": 4228 }, { "epoch": 0.9528262032838595, "grad_norm": 0.4622365541956114, "learning_rate": 3.0256810938073534e-08, "loss": 0.1336, "step": 4229 }, { "epoch": 0.953051510969668, "grad_norm": 0.5083718135711455, "learning_rate": 2.996854323575937e-08, "loss": 0.1504, "step": 4230 }, { "epoch": 0.9532768186554764, "grad_norm": 0.4696426708038953, "learning_rate": 2.968164704984483e-08, "loss": 0.1398, "step": 4231 }, { "epoch": 0.9535021263412848, "grad_norm": 0.4957647803669633, "learning_rate": 2.939612253963331e-08, "loss": 0.1485, "step": 4232 }, { "epoch": 0.9537274340270933, "grad_norm": 0.4976801047971032, "learning_rate": 2.911196986366577e-08, "loss": 0.1577, "step": 4233 }, { "epoch": 0.9539527417129017, "grad_norm": 0.5047313617268352, "learning_rate": 2.8829189179721552e-08, "loss": 0.1448, "step": 4234 }, { "epoch": 0.9541780493987101, "grad_norm": 0.48704566793067555, "learning_rate": 2.8547780644818113e-08, "loss": 0.1391, "step": 4235 }, { "epoch": 0.9544033570845185, "grad_norm": 0.45381551822981764, "learning_rate": 2.8267744415211296e-08, "loss": 0.1295, "step": 4236 }, { "epoch": 0.9546286647703269, "grad_norm": 0.5171129482131813, "learning_rate": 2.7989080646394217e-08, "loss": 0.1678, "step": 4237 }, { "epoch": 0.9548539724561355, "grad_norm": 0.46855959342887893, "learning_rate": 2.7711789493099495e-08, "loss": 0.1329, "step": 4238 }, { "epoch": 0.9550792801419439, "grad_norm": 0.4550251335595192, "learning_rate": 2.743587110929563e-08, "loss": 0.1311, "step": 4239 }, { "epoch": 0.9553045878277523, "grad_norm": 0.4649516123124458, "learning_rate": 2.716132564819035e-08, "loss": 0.1472, "step": 4240 }, { "epoch": 0.9555298955135607, "grad_norm": 0.47923186532440143, "learning_rate": 2.688815326222838e-08, "loss": 0.1415, "step": 4241 }, { "epoch": 0.9557552031993691, "grad_norm": 0.4943590216368037, "learning_rate": 2.661635410309199e-08, "loss": 0.1446, "step": 4242 }, { "epoch": 0.9559805108851775, "grad_norm": 0.4936773150917739, "learning_rate": 2.6345928321701575e-08, "loss": 0.1444, "step": 4243 }, { "epoch": 0.956205818570986, "grad_norm": 0.44895547905801686, "learning_rate": 2.6076876068213965e-08, "loss": 0.1285, "step": 4244 }, { "epoch": 0.9564311262567944, "grad_norm": 0.4804177263012208, "learning_rate": 2.5809197492024372e-08, "loss": 0.1497, "step": 4245 }, { "epoch": 0.9566564339426029, "grad_norm": 0.4804905277345808, "learning_rate": 2.554289274176419e-08, "loss": 0.14, "step": 4246 }, { "epoch": 0.9568817416284113, "grad_norm": 0.48656611905634656, "learning_rate": 2.5277961965302633e-08, "loss": 0.1465, "step": 4247 }, { "epoch": 0.9571070493142197, "grad_norm": 0.45690808190376064, "learning_rate": 2.5014405309746193e-08, "loss": 0.1316, "step": 4248 }, { "epoch": 0.9573323570000282, "grad_norm": 0.48327173788939704, "learning_rate": 2.4752222921437807e-08, "loss": 0.138, "step": 4249 }, { "epoch": 0.9575576646858366, "grad_norm": 0.5068194329354152, "learning_rate": 2.449141494595797e-08, "loss": 0.1532, "step": 4250 }, { "epoch": 0.957782972371645, "grad_norm": 0.46654742066027965, "learning_rate": 2.423198152812306e-08, "loss": 0.1356, "step": 4251 }, { "epoch": 0.9580082800574534, "grad_norm": 0.483696186611939, "learning_rate": 2.3973922811987295e-08, "loss": 0.1472, "step": 4252 }, { "epoch": 0.9582335877432618, "grad_norm": 0.47201416927326817, "learning_rate": 2.3717238940840493e-08, "loss": 0.1361, "step": 4253 }, { "epoch": 0.9584588954290704, "grad_norm": 0.45781860941730246, "learning_rate": 2.3461930057210037e-08, "loss": 0.1371, "step": 4254 }, { "epoch": 0.9586842031148788, "grad_norm": 0.4491060009907602, "learning_rate": 2.320799630285947e-08, "loss": 0.1297, "step": 4255 }, { "epoch": 0.9589095108006872, "grad_norm": 0.48731042626093923, "learning_rate": 2.2955437818788508e-08, "loss": 0.1549, "step": 4256 }, { "epoch": 0.9591348184864956, "grad_norm": 0.4646098385451307, "learning_rate": 2.2704254745233577e-08, "loss": 0.136, "step": 4257 }, { "epoch": 0.959360126172304, "grad_norm": 0.4759653882078851, "learning_rate": 2.2454447221667563e-08, "loss": 0.1445, "step": 4258 }, { "epoch": 0.9595854338581125, "grad_norm": 0.48335670581672063, "learning_rate": 2.2206015386798673e-08, "loss": 0.1395, "step": 4259 }, { "epoch": 0.9598107415439209, "grad_norm": 0.47111700747094354, "learning_rate": 2.1958959378572398e-08, "loss": 0.1335, "step": 4260 }, { "epoch": 0.9600360492297294, "grad_norm": 0.468816202711293, "learning_rate": 2.1713279334169278e-08, "loss": 0.1448, "step": 4261 }, { "epoch": 0.9602613569155378, "grad_norm": 0.44960534206662445, "learning_rate": 2.1468975390006587e-08, "loss": 0.1305, "step": 4262 }, { "epoch": 0.9604866646013462, "grad_norm": 0.45547617142439345, "learning_rate": 2.1226047681737193e-08, "loss": 0.1349, "step": 4263 }, { "epoch": 0.9607119722871547, "grad_norm": 0.48149633710593354, "learning_rate": 2.0984496344249596e-08, "loss": 0.142, "step": 4264 }, { "epoch": 0.9609372799729631, "grad_norm": 0.4636145891113348, "learning_rate": 2.074432151166844e-08, "loss": 0.1314, "step": 4265 }, { "epoch": 0.9611625876587715, "grad_norm": 0.5067529814876942, "learning_rate": 2.0505523317353727e-08, "loss": 0.1564, "step": 4266 }, { "epoch": 0.9613878953445799, "grad_norm": 0.4662581473086844, "learning_rate": 2.0268101893901327e-08, "loss": 0.1368, "step": 4267 }, { "epoch": 0.9616132030303883, "grad_norm": 0.4766281613039792, "learning_rate": 2.0032057373142453e-08, "loss": 0.148, "step": 4268 }, { "epoch": 0.9618385107161969, "grad_norm": 0.4715925046426043, "learning_rate": 1.9797389886143658e-08, "loss": 0.1492, "step": 4269 }, { "epoch": 0.9620638184020053, "grad_norm": 0.44744281444466083, "learning_rate": 1.956409956320737e-08, "loss": 0.1383, "step": 4270 }, { "epoch": 0.9622891260878137, "grad_norm": 0.49040603740334754, "learning_rate": 1.933218653387081e-08, "loss": 0.1517, "step": 4271 }, { "epoch": 0.9625144337736221, "grad_norm": 0.4781738093055147, "learning_rate": 1.91016509269068e-08, "loss": 0.1477, "step": 4272 }, { "epoch": 0.9627397414594305, "grad_norm": 0.5043088508617445, "learning_rate": 1.8872492870322945e-08, "loss": 0.153, "step": 4273 }, { "epoch": 0.962965049145239, "grad_norm": 0.45938917413578406, "learning_rate": 1.864471249136218e-08, "loss": 0.1386, "step": 4274 }, { "epoch": 0.9631903568310474, "grad_norm": 0.48259651582192653, "learning_rate": 1.8418309916502787e-08, "loss": 0.1432, "step": 4275 }, { "epoch": 0.9634156645168558, "grad_norm": 0.4598535106472366, "learning_rate": 1.819328527145725e-08, "loss": 0.1241, "step": 4276 }, { "epoch": 0.9636409722026643, "grad_norm": 0.46091011665324716, "learning_rate": 1.7969638681173684e-08, "loss": 0.1367, "step": 4277 }, { "epoch": 0.9638662798884727, "grad_norm": 0.48351619131549517, "learning_rate": 1.774737026983414e-08, "loss": 0.1379, "step": 4278 }, { "epoch": 0.9640915875742812, "grad_norm": 0.481262920354411, "learning_rate": 1.752648016085684e-08, "loss": 0.1506, "step": 4279 }, { "epoch": 0.9643168952600896, "grad_norm": 0.4805939380195428, "learning_rate": 1.7306968476893393e-08, "loss": 0.1448, "step": 4280 }, { "epoch": 0.964542202945898, "grad_norm": 0.5055025504837728, "learning_rate": 1.708883533983019e-08, "loss": 0.1463, "step": 4281 }, { "epoch": 0.9647675106317064, "grad_norm": 0.49571269367735965, "learning_rate": 1.6872080870788955e-08, "loss": 0.1473, "step": 4282 }, { "epoch": 0.9649928183175148, "grad_norm": 0.4533431039913623, "learning_rate": 1.6656705190125078e-08, "loss": 0.1274, "step": 4283 }, { "epoch": 0.9652181260033232, "grad_norm": 0.46643306934522594, "learning_rate": 1.6442708417428732e-08, "loss": 0.1391, "step": 4284 }, { "epoch": 0.9654434336891318, "grad_norm": 0.4655292603080353, "learning_rate": 1.6230090671524312e-08, "loss": 0.1403, "step": 4285 }, { "epoch": 0.9656687413749402, "grad_norm": 0.505200988979893, "learning_rate": 1.6018852070470437e-08, "loss": 0.1443, "step": 4286 }, { "epoch": 0.9658940490607486, "grad_norm": 0.49078232446343767, "learning_rate": 1.5808992731560225e-08, "loss": 0.1451, "step": 4287 }, { "epoch": 0.966119356746557, "grad_norm": 0.4828868989290041, "learning_rate": 1.5600512771320462e-08, "loss": 0.1357, "step": 4288 }, { "epoch": 0.9663446644323654, "grad_norm": 0.49321915716373294, "learning_rate": 1.5393412305512446e-08, "loss": 0.1406, "step": 4289 }, { "epoch": 0.9665699721181739, "grad_norm": 0.4797986763655872, "learning_rate": 1.518769144913168e-08, "loss": 0.1368, "step": 4290 }, { "epoch": 0.9667952798039823, "grad_norm": 0.4583811913091939, "learning_rate": 1.4983350316406797e-08, "loss": 0.1345, "step": 4291 }, { "epoch": 0.9670205874897907, "grad_norm": 0.4615067525946092, "learning_rate": 1.4780389020800923e-08, "loss": 0.1322, "step": 4292 }, { "epoch": 0.9672458951755992, "grad_norm": 0.4497627493649979, "learning_rate": 1.4578807675011131e-08, "loss": 0.1334, "step": 4293 }, { "epoch": 0.9674712028614076, "grad_norm": 0.4835148026172114, "learning_rate": 1.4378606390967609e-08, "loss": 0.1441, "step": 4294 }, { "epoch": 0.9676965105472161, "grad_norm": 0.481902922708207, "learning_rate": 1.4179785279835045e-08, "loss": 0.1431, "step": 4295 }, { "epoch": 0.9679218182330245, "grad_norm": 0.4582081824677256, "learning_rate": 1.3982344452011242e-08, "loss": 0.1348, "step": 4296 }, { "epoch": 0.9681471259188329, "grad_norm": 0.47063804644729257, "learning_rate": 1.3786284017127949e-08, "loss": 0.1429, "step": 4297 }, { "epoch": 0.9683724336046413, "grad_norm": 0.474544321572788, "learning_rate": 1.3591604084049747e-08, "loss": 0.1406, "step": 4298 }, { "epoch": 0.9685977412904497, "grad_norm": 0.47956333804497253, "learning_rate": 1.3398304760875725e-08, "loss": 0.144, "step": 4299 }, { "epoch": 0.9688230489762581, "grad_norm": 0.47952835497451235, "learning_rate": 1.3206386154937245e-08, "loss": 0.1489, "step": 4300 }, { "epoch": 0.9690483566620667, "grad_norm": 0.5011216096374138, "learning_rate": 1.30158483727999e-08, "loss": 0.1519, "step": 4301 }, { "epoch": 0.9692736643478751, "grad_norm": 0.4619877849945653, "learning_rate": 1.2826691520262114e-08, "loss": 0.1335, "step": 4302 }, { "epoch": 0.9694989720336835, "grad_norm": 0.4510567616253443, "learning_rate": 1.2638915702355702e-08, "loss": 0.1271, "step": 4303 }, { "epoch": 0.9697242797194919, "grad_norm": 0.46742284679784957, "learning_rate": 1.2452521023345598e-08, "loss": 0.1353, "step": 4304 }, { "epoch": 0.9699495874053004, "grad_norm": 0.4744154062862906, "learning_rate": 1.2267507586729566e-08, "loss": 0.1448, "step": 4305 }, { "epoch": 0.9701748950911088, "grad_norm": 0.4365727581849454, "learning_rate": 1.2083875495238761e-08, "loss": 0.1246, "step": 4306 }, { "epoch": 0.9704002027769172, "grad_norm": 0.46537369554131985, "learning_rate": 1.1901624850837734e-08, "loss": 0.1428, "step": 4307 }, { "epoch": 0.9706255104627256, "grad_norm": 0.49077869002621655, "learning_rate": 1.1720755754722757e-08, "loss": 0.1494, "step": 4308 }, { "epoch": 0.9708508181485341, "grad_norm": 0.4976622519358272, "learning_rate": 1.1541268307324049e-08, "loss": 0.1462, "step": 4309 }, { "epoch": 0.9710761258343426, "grad_norm": 0.4936796887656191, "learning_rate": 1.1363162608304112e-08, "loss": 0.1477, "step": 4310 }, { "epoch": 0.971301433520151, "grad_norm": 0.497374029101745, "learning_rate": 1.1186438756558838e-08, "loss": 0.1482, "step": 4311 }, { "epoch": 0.9715267412059594, "grad_norm": 0.4698021005115719, "learning_rate": 1.1011096850215842e-08, "loss": 0.1455, "step": 4312 }, { "epoch": 0.9717520488917678, "grad_norm": 0.4792243933647282, "learning_rate": 1.083713698663641e-08, "loss": 0.1389, "step": 4313 }, { "epoch": 0.9719773565775762, "grad_norm": 0.4626205082490106, "learning_rate": 1.0664559262413831e-08, "loss": 0.1281, "step": 4314 }, { "epoch": 0.9722026642633846, "grad_norm": 0.4874428688965878, "learning_rate": 1.0493363773373677e-08, "loss": 0.1551, "step": 4315 }, { "epoch": 0.9724279719491932, "grad_norm": 0.5004355080951479, "learning_rate": 1.0323550614574907e-08, "loss": 0.1517, "step": 4316 }, { "epoch": 0.9726532796350016, "grad_norm": 0.47262833063941007, "learning_rate": 1.0155119880308483e-08, "loss": 0.1352, "step": 4317 }, { "epoch": 0.97287858732081, "grad_norm": 0.46940376415483154, "learning_rate": 9.988071664097376e-09, "loss": 0.1387, "step": 4318 }, { "epoch": 0.9731038950066184, "grad_norm": 0.47903980564159626, "learning_rate": 9.822406058697665e-09, "loss": 0.146, "step": 4319 }, { "epoch": 0.9733292026924268, "grad_norm": 0.4949962235080336, "learning_rate": 9.658123156096599e-09, "loss": 0.1465, "step": 4320 }, { "epoch": 0.9735545103782353, "grad_norm": 0.4744343780732498, "learning_rate": 9.4952230475151e-09, "loss": 0.1436, "step": 4321 }, { "epoch": 0.9737798180640437, "grad_norm": 0.45313837725329525, "learning_rate": 9.333705823404981e-09, "loss": 0.1364, "step": 4322 }, { "epoch": 0.9740051257498521, "grad_norm": 0.48156914502453946, "learning_rate": 9.17357157345089e-09, "loss": 0.1423, "step": 4323 }, { "epoch": 0.9742304334356606, "grad_norm": 0.493321796410043, "learning_rate": 9.014820386569756e-09, "loss": 0.1518, "step": 4324 }, { "epoch": 0.974455741121469, "grad_norm": 0.47247014797296477, "learning_rate": 8.85745235090968e-09, "loss": 0.1374, "step": 4325 }, { "epoch": 0.9746810488072775, "grad_norm": 0.4778581083662472, "learning_rate": 8.701467553851317e-09, "loss": 0.1459, "step": 4326 }, { "epoch": 0.9749063564930859, "grad_norm": 0.4666557114127274, "learning_rate": 8.54686608200761e-09, "loss": 0.1432, "step": 4327 }, { "epoch": 0.9751316641788943, "grad_norm": 0.46927749290389636, "learning_rate": 8.393648021222666e-09, "loss": 0.1287, "step": 4328 }, { "epoch": 0.9753569718647027, "grad_norm": 0.4788321168845856, "learning_rate": 8.241813456573156e-09, "loss": 0.1285, "step": 4329 }, { "epoch": 0.9755822795505111, "grad_norm": 0.483308961427759, "learning_rate": 8.09136247236636e-09, "loss": 0.143, "step": 4330 }, { "epoch": 0.9758075872363196, "grad_norm": 0.503398696917994, "learning_rate": 7.942295152142954e-09, "loss": 0.1576, "step": 4331 }, { "epoch": 0.9760328949221281, "grad_norm": 0.5132744653021303, "learning_rate": 7.79461157867395e-09, "loss": 0.1652, "step": 4332 }, { "epoch": 0.9762582026079365, "grad_norm": 0.46056290280919454, "learning_rate": 7.64831183396264e-09, "loss": 0.1321, "step": 4333 }, { "epoch": 0.9764835102937449, "grad_norm": 0.45311995911352104, "learning_rate": 7.503395999244045e-09, "loss": 0.1267, "step": 4334 }, { "epoch": 0.9767088179795533, "grad_norm": 0.45587215436860634, "learning_rate": 7.359864154984353e-09, "loss": 0.1313, "step": 4335 }, { "epoch": 0.9769341256653618, "grad_norm": 0.48113082998224244, "learning_rate": 7.217716380881479e-09, "loss": 0.1443, "step": 4336 }, { "epoch": 0.9771594333511702, "grad_norm": 0.47691226845499113, "learning_rate": 7.076952755864508e-09, "loss": 0.1447, "step": 4337 }, { "epoch": 0.9773847410369786, "grad_norm": 0.48006758958441803, "learning_rate": 6.937573358094529e-09, "loss": 0.1355, "step": 4338 }, { "epoch": 0.977610048722787, "grad_norm": 0.47284929946531534, "learning_rate": 6.799578264963802e-09, "loss": 0.1344, "step": 4339 }, { "epoch": 0.9778353564085955, "grad_norm": 0.47254716605333813, "learning_rate": 6.662967553095756e-09, "loss": 0.1451, "step": 4340 }, { "epoch": 0.978060664094404, "grad_norm": 0.44905567061828916, "learning_rate": 6.527741298345269e-09, "loss": 0.1313, "step": 4341 }, { "epoch": 0.9782859717802124, "grad_norm": 0.4691229636171862, "learning_rate": 6.3938995757981125e-09, "loss": 0.1367, "step": 4342 }, { "epoch": 0.9785112794660208, "grad_norm": 0.45663032287457944, "learning_rate": 6.2614424597720605e-09, "loss": 0.1304, "step": 4343 }, { "epoch": 0.9787365871518292, "grad_norm": 0.44357322752828715, "learning_rate": 6.1303700238152245e-09, "loss": 0.1224, "step": 4344 }, { "epoch": 0.9789618948376376, "grad_norm": 0.48531056325011684, "learning_rate": 6.00068234070772e-09, "loss": 0.1401, "step": 4345 }, { "epoch": 0.979187202523446, "grad_norm": 0.48866064423236033, "learning_rate": 5.8723794824597226e-09, "loss": 0.15, "step": 4346 }, { "epoch": 0.9794125102092545, "grad_norm": 0.47173835292473587, "learning_rate": 5.745461520313411e-09, "loss": 0.1458, "step": 4347 }, { "epoch": 0.979637817895063, "grad_norm": 0.4606684373862621, "learning_rate": 5.6199285247415805e-09, "loss": 0.1356, "step": 4348 }, { "epoch": 0.9798631255808714, "grad_norm": 0.45008526887749, "learning_rate": 5.495780565447917e-09, "loss": 0.1312, "step": 4349 }, { "epoch": 0.9800884332666798, "grad_norm": 0.4894920967180572, "learning_rate": 5.373017711367001e-09, "loss": 0.1485, "step": 4350 }, { "epoch": 0.9803137409524882, "grad_norm": 0.5039444257173281, "learning_rate": 5.2516400306648615e-09, "loss": 0.1453, "step": 4351 }, { "epoch": 0.9805390486382967, "grad_norm": 0.44672365989159696, "learning_rate": 5.131647590737587e-09, "loss": 0.1275, "step": 4352 }, { "epoch": 0.9807643563241051, "grad_norm": 0.49591763264309763, "learning_rate": 5.0130404582127144e-09, "loss": 0.1374, "step": 4353 }, { "epoch": 0.9809896640099135, "grad_norm": 0.4804636892409362, "learning_rate": 4.895818698948396e-09, "loss": 0.1442, "step": 4354 }, { "epoch": 0.9812149716957219, "grad_norm": 0.48197632973429744, "learning_rate": 4.779982378033676e-09, "loss": 0.1415, "step": 4355 }, { "epoch": 0.9814402793815304, "grad_norm": 0.47806439019911473, "learning_rate": 4.6655315597876615e-09, "loss": 0.1421, "step": 4356 }, { "epoch": 0.9816655870673389, "grad_norm": 0.46145387110204644, "learning_rate": 4.552466307760905e-09, "loss": 0.1314, "step": 4357 }, { "epoch": 0.9818908947531473, "grad_norm": 0.4526873753515115, "learning_rate": 4.440786684734577e-09, "loss": 0.1362, "step": 4358 }, { "epoch": 0.9821162024389557, "grad_norm": 0.48922876618552663, "learning_rate": 4.330492752719628e-09, "loss": 0.1502, "step": 4359 }, { "epoch": 0.9823415101247641, "grad_norm": 0.4789288370287609, "learning_rate": 4.221584572958737e-09, "loss": 0.1405, "step": 4360 }, { "epoch": 0.9825668178105725, "grad_norm": 0.4517484644833375, "learning_rate": 4.114062205924085e-09, "loss": 0.1268, "step": 4361 }, { "epoch": 0.982792125496381, "grad_norm": 0.4830389432928069, "learning_rate": 4.0079257113190275e-09, "loss": 0.1473, "step": 4362 }, { "epoch": 0.9830174331821895, "grad_norm": 0.4423608570433734, "learning_rate": 3.903175148077531e-09, "loss": 0.1255, "step": 4363 }, { "epoch": 0.9832427408679979, "grad_norm": 0.4599706442868318, "learning_rate": 3.799810574363072e-09, "loss": 0.1326, "step": 4364 }, { "epoch": 0.9834680485538063, "grad_norm": 0.48203972353551805, "learning_rate": 3.697832047570571e-09, "loss": 0.1549, "step": 4365 }, { "epoch": 0.9836933562396147, "grad_norm": 0.4835541316928907, "learning_rate": 3.597239624325011e-09, "loss": 0.1433, "step": 4366 }, { "epoch": 0.9839186639254232, "grad_norm": 0.4788202007537194, "learning_rate": 3.4980333604811567e-09, "loss": 0.1496, "step": 4367 }, { "epoch": 0.9841439716112316, "grad_norm": 0.46619578510949317, "learning_rate": 3.4002133111246673e-09, "loss": 0.1432, "step": 4368 }, { "epoch": 0.98436927929704, "grad_norm": 0.5110741441195408, "learning_rate": 3.303779530571538e-09, "loss": 0.1605, "step": 4369 }, { "epoch": 0.9845945869828484, "grad_norm": 0.4621865944157658, "learning_rate": 3.208732072368104e-09, "loss": 0.1346, "step": 4370 }, { "epoch": 0.9848198946686569, "grad_norm": 0.4796600192057587, "learning_rate": 3.1150709892899256e-09, "loss": 0.1453, "step": 4371 }, { "epoch": 0.9850452023544654, "grad_norm": 0.4678812341995167, "learning_rate": 3.022796333344291e-09, "loss": 0.1431, "step": 4372 }, { "epoch": 0.9852705100402738, "grad_norm": 0.4786835908663167, "learning_rate": 2.9319081557674377e-09, "loss": 0.1459, "step": 4373 }, { "epoch": 0.9854958177260822, "grad_norm": 0.5007725644868222, "learning_rate": 2.8424065070262186e-09, "loss": 0.1513, "step": 4374 }, { "epoch": 0.9857211254118906, "grad_norm": 0.49174743123100595, "learning_rate": 2.754291436817824e-09, "loss": 0.1506, "step": 4375 }, { "epoch": 0.985946433097699, "grad_norm": 0.46095084583775564, "learning_rate": 2.6675629940689508e-09, "loss": 0.1397, "step": 4376 }, { "epoch": 0.9861717407835074, "grad_norm": 0.4674294425537989, "learning_rate": 2.582221226936632e-09, "loss": 0.1338, "step": 4377 }, { "epoch": 0.9863970484693159, "grad_norm": 0.4623302703852915, "learning_rate": 2.4982661828085175e-09, "loss": 0.1356, "step": 4378 }, { "epoch": 0.9866223561551244, "grad_norm": 0.4617577655604814, "learning_rate": 2.415697908300929e-09, "loss": 0.1356, "step": 4379 }, { "epoch": 0.9868476638409328, "grad_norm": 0.4634214429564338, "learning_rate": 2.3345164492616367e-09, "loss": 0.1413, "step": 4380 }, { "epoch": 0.9870729715267412, "grad_norm": 0.4702899505718087, "learning_rate": 2.2547218507673606e-09, "loss": 0.1315, "step": 4381 }, { "epoch": 0.9872982792125496, "grad_norm": 0.48808317706093335, "learning_rate": 2.1763141571248813e-09, "loss": 0.1529, "step": 4382 }, { "epoch": 0.9875235868983581, "grad_norm": 0.48563405143106425, "learning_rate": 2.0992934118715948e-09, "loss": 0.1413, "step": 4383 }, { "epoch": 0.9877488945841665, "grad_norm": 0.4991973947467297, "learning_rate": 2.0236596577738466e-09, "loss": 0.1462, "step": 4384 }, { "epoch": 0.9879742022699749, "grad_norm": 0.4501683046388823, "learning_rate": 1.9494129368280432e-09, "loss": 0.1204, "step": 4385 }, { "epoch": 0.9881995099557833, "grad_norm": 0.47746210450182514, "learning_rate": 1.876553290261207e-09, "loss": 0.136, "step": 4386 }, { "epoch": 0.9884248176415918, "grad_norm": 0.4546943022120308, "learning_rate": 1.8050807585293095e-09, "loss": 0.1297, "step": 4387 }, { "epoch": 0.9886501253274003, "grad_norm": 0.5517091587017808, "learning_rate": 1.7349953813183828e-09, "loss": 0.1484, "step": 4388 }, { "epoch": 0.9888754330132087, "grad_norm": 0.45122543164788703, "learning_rate": 1.6662971975439645e-09, "loss": 0.1248, "step": 4389 }, { "epoch": 0.9891007406990171, "grad_norm": 0.48450689131031716, "learning_rate": 1.5989862453522075e-09, "loss": 0.1436, "step": 4390 }, { "epoch": 0.9893260483848255, "grad_norm": 0.4756699371386352, "learning_rate": 1.5330625621176598e-09, "loss": 0.1383, "step": 4391 }, { "epoch": 0.9895513560706339, "grad_norm": 0.4694479754875359, "learning_rate": 1.468526184445762e-09, "loss": 0.1311, "step": 4392 }, { "epoch": 0.9897766637564424, "grad_norm": 0.4978229930092217, "learning_rate": 1.4053771481711832e-09, "loss": 0.1471, "step": 4393 }, { "epoch": 0.9900019714422508, "grad_norm": 0.4905457833843764, "learning_rate": 1.343615488357819e-09, "loss": 0.1488, "step": 4394 }, { "epoch": 0.9902272791280593, "grad_norm": 0.4681366072187746, "learning_rate": 1.2832412393001814e-09, "loss": 0.1355, "step": 4395 }, { "epoch": 0.9904525868138677, "grad_norm": 0.45234859070769357, "learning_rate": 1.2242544345211772e-09, "loss": 0.1287, "step": 4396 }, { "epoch": 0.9906778944996761, "grad_norm": 0.46650840082669476, "learning_rate": 1.1666551067746058e-09, "loss": 0.1481, "step": 4397 }, { "epoch": 0.9909032021854846, "grad_norm": 0.49602833209947034, "learning_rate": 1.1104432880429394e-09, "loss": 0.1405, "step": 4398 }, { "epoch": 0.991128509871293, "grad_norm": 0.46169760847092306, "learning_rate": 1.0556190095384333e-09, "loss": 0.1366, "step": 4399 }, { "epoch": 0.9913538175571014, "grad_norm": 0.47331739771221604, "learning_rate": 1.0021823017028475e-09, "loss": 0.1438, "step": 4400 }, { "epoch": 0.9915791252429098, "grad_norm": 0.49149463979989205, "learning_rate": 9.501331942080029e-10, "loss": 0.1533, "step": 4401 }, { "epoch": 0.9918044329287182, "grad_norm": 0.5040049366138424, "learning_rate": 8.994717159546695e-10, "loss": 0.1517, "step": 4402 }, { "epoch": 0.9920297406145268, "grad_norm": 0.4588404544201829, "learning_rate": 8.501978950734014e-10, "loss": 0.1317, "step": 4403 }, { "epoch": 0.9922550483003352, "grad_norm": 0.4605093760903773, "learning_rate": 8.023117589237017e-10, "loss": 0.1315, "step": 4404 }, { "epoch": 0.9924803559861436, "grad_norm": 0.4867784316909424, "learning_rate": 7.558133340954121e-10, "loss": 0.1519, "step": 4405 }, { "epoch": 0.992705663671952, "grad_norm": 0.49264063501342575, "learning_rate": 7.10702646406769e-10, "loss": 0.1369, "step": 4406 }, { "epoch": 0.9929309713577604, "grad_norm": 0.48651414304036755, "learning_rate": 6.669797209069018e-10, "loss": 0.1472, "step": 4407 }, { "epoch": 0.9931562790435688, "grad_norm": 0.45973115163132117, "learning_rate": 6.246445818727798e-10, "loss": 0.1294, "step": 4408 }, { "epoch": 0.9933815867293773, "grad_norm": 0.46247424902217044, "learning_rate": 5.836972528119878e-10, "loss": 0.1445, "step": 4409 }, { "epoch": 0.9936068944151858, "grad_norm": 0.46562589659047027, "learning_rate": 5.44137756460783e-10, "loss": 0.1295, "step": 4410 }, { "epoch": 0.9938322021009942, "grad_norm": 0.45535410008693744, "learning_rate": 5.059661147852057e-10, "loss": 0.1388, "step": 4411 }, { "epoch": 0.9940575097868026, "grad_norm": 0.47495739736859655, "learning_rate": 4.691823489805236e-10, "loss": 0.1413, "step": 4412 }, { "epoch": 0.994282817472611, "grad_norm": 0.47225570583290694, "learning_rate": 4.3378647947150965e-10, "loss": 0.14, "step": 4413 }, { "epoch": 0.9945081251584195, "grad_norm": 0.4726742313213338, "learning_rate": 3.9977852591188694e-10, "loss": 0.1416, "step": 4414 }, { "epoch": 0.9947334328442279, "grad_norm": 0.4782536098463734, "learning_rate": 3.671585071854389e-10, "loss": 0.1392, "step": 4415 }, { "epoch": 0.9949587405300363, "grad_norm": 0.4899719857969558, "learning_rate": 3.3592644140434393e-10, "loss": 0.1427, "step": 4416 }, { "epoch": 0.9951840482158447, "grad_norm": 0.48595091219025266, "learning_rate": 3.0608234591084083e-10, "loss": 0.1429, "step": 4417 }, { "epoch": 0.9954093559016532, "grad_norm": 0.48108261570009553, "learning_rate": 2.776262372761185e-10, "loss": 0.1504, "step": 4418 }, { "epoch": 0.9956346635874617, "grad_norm": 0.4698281725163803, "learning_rate": 2.505581313011485e-10, "loss": 0.1382, "step": 4419 }, { "epoch": 0.9958599712732701, "grad_norm": 0.4667519344743465, "learning_rate": 2.2487804301557503e-10, "loss": 0.1448, "step": 4420 }, { "epoch": 0.9960852789590785, "grad_norm": 0.46820185366520156, "learning_rate": 2.0058598667854755e-10, "loss": 0.1415, "step": 4421 }, { "epoch": 0.9963105866448869, "grad_norm": 0.4779993256669945, "learning_rate": 1.776819757787207e-10, "loss": 0.1353, "step": 4422 }, { "epoch": 0.9965358943306953, "grad_norm": 0.4723393911423046, "learning_rate": 1.561660230336992e-10, "loss": 0.1444, "step": 4423 }, { "epoch": 0.9967612020165038, "grad_norm": 0.4768298410372807, "learning_rate": 1.3603814039031547e-10, "loss": 0.1386, "step": 4424 }, { "epoch": 0.9969865097023122, "grad_norm": 0.44014844744591247, "learning_rate": 1.1729833902518473e-10, "loss": 0.1202, "step": 4425 }, { "epoch": 0.9972118173881207, "grad_norm": 0.4881840586497598, "learning_rate": 9.994662934387223e-11, "loss": 0.1442, "step": 4426 }, { "epoch": 0.9974371250739291, "grad_norm": 0.4844812645696949, "learning_rate": 8.398302098061583e-11, "loss": 0.141, "step": 4427 }, { "epoch": 0.9976624327597375, "grad_norm": 0.4619719293822893, "learning_rate": 6.94075227999913e-11, "loss": 0.139, "step": 4428 }, { "epoch": 0.997887740445546, "grad_norm": 0.4570075187656965, "learning_rate": 5.62201428946918e-11, "loss": 0.1362, "step": 4429 }, { "epoch": 0.9981130481313544, "grad_norm": 0.46402279082792575, "learning_rate": 4.44208885877484e-11, "loss": 0.1313, "step": 4430 }, { "epoch": 0.9983383558171628, "grad_norm": 0.46837914976734196, "learning_rate": 3.400976643030962e-11, "loss": 0.1387, "step": 4431 }, { "epoch": 0.9985636635029712, "grad_norm": 0.49390204409988, "learning_rate": 2.498678220386186e-11, "loss": 0.148, "step": 4432 }, { "epoch": 0.9987889711887796, "grad_norm": 0.4884904018494601, "learning_rate": 1.735194091800896e-11, "loss": 0.1494, "step": 4433 }, { "epoch": 0.9990142788745882, "grad_norm": 0.4828464055068492, "learning_rate": 1.1105246812137538e-11, "loss": 0.138, "step": 4434 }, { "epoch": 0.9992395865603966, "grad_norm": 0.4629521120856078, "learning_rate": 6.246703355139438e-12, "loss": 0.1439, "step": 4435 }, { "epoch": 0.999464894246205, "grad_norm": 0.45979903985531045, "learning_rate": 2.7763132445790543e-12, "loss": 0.1297, "step": 4436 }, { "epoch": 0.9996902019320134, "grad_norm": 0.48535911253263453, "learning_rate": 6.940784075259999e-13, "loss": 0.1444, "step": 4437 }, { "epoch": 0.9999155096178218, "grad_norm": 0.4531440552581044, "learning_rate": 0.0, "loss": 0.125, "step": 4438 }, { "epoch": 0.9999155096178218, "step": 4438, "total_flos": 995060424065024.0, "train_loss": 0.16356557928304405, "train_runtime": 22100.2501, "train_samples_per_second": 12.853, "train_steps_per_second": 0.201 } ], "logging_steps": 1, "max_steps": 4438, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 995060424065024.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }