{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9999067555596999, "eval_steps": 500, "global_step": 10724, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.524007747080166, "learning_rate": 9.999994636280297e-06, "loss": 1.823, "step": 5 }, { "epoch": 0.0, "grad_norm": 3.751582456194241, "learning_rate": 9.999978545132696e-06, "loss": 1.3441, "step": 10 }, { "epoch": 0.0, "grad_norm": 3.3490766596115806, "learning_rate": 9.999951726591716e-06, "loss": 1.3161, "step": 15 }, { "epoch": 0.0, "grad_norm": 3.2220183943018883, "learning_rate": 9.999914180714902e-06, "loss": 1.3822, "step": 20 }, { "epoch": 0.0, "grad_norm": 3.7449433236910914, "learning_rate": 9.999865907582806e-06, "loss": 1.4161, "step": 25 }, { "epoch": 0.01, "grad_norm": 4.13884314404206, "learning_rate": 9.999806907298994e-06, "loss": 1.325, "step": 30 }, { "epoch": 0.01, "grad_norm": 3.1735066858647407, "learning_rate": 9.999737179990057e-06, "loss": 1.3145, "step": 35 }, { "epoch": 0.01, "grad_norm": 4.044320591378858, "learning_rate": 9.999656725805588e-06, "loss": 1.3288, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.945673848593178, "learning_rate": 9.999565544918204e-06, "loss": 1.3304, "step": 45 }, { "epoch": 0.01, "grad_norm": 3.0178952115872986, "learning_rate": 9.99946363752353e-06, "loss": 1.302, "step": 50 }, { "epoch": 0.01, "grad_norm": 2.908785287683549, "learning_rate": 9.999351003840209e-06, "loss": 1.3062, "step": 55 }, { "epoch": 0.01, "grad_norm": 3.1033835338024884, "learning_rate": 9.999227644109894e-06, "loss": 1.3064, "step": 60 }, { "epoch": 0.01, "grad_norm": 3.061377074716208, "learning_rate": 9.999093558597254e-06, "loss": 1.325, "step": 65 }, { "epoch": 0.01, "grad_norm": 2.336953651662858, "learning_rate": 9.998948747589966e-06, "loss": 1.2808, "step": 70 }, { "epoch": 0.01, "grad_norm": 2.9142448487551142, "learning_rate": 9.998793211398721e-06, "loss": 1.3047, "step": 75 }, { "epoch": 0.01, "grad_norm": 2.6943468676524334, "learning_rate": 9.998626950357218e-06, "loss": 1.3182, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.979676737020957, "learning_rate": 9.99844996482217e-06, "loss": 1.312, "step": 85 }, { "epoch": 0.02, "grad_norm": 3.4798768099858544, "learning_rate": 9.998262255173298e-06, "loss": 1.279, "step": 90 }, { "epoch": 0.02, "grad_norm": 3.034106416493002, "learning_rate": 9.998063821813328e-06, "loss": 1.3215, "step": 95 }, { "epoch": 0.02, "grad_norm": 3.1680260266966953, "learning_rate": 9.997854665168001e-06, "loss": 1.3095, "step": 100 }, { "epoch": 0.02, "grad_norm": 2.9582365796044368, "learning_rate": 9.997634785686054e-06, "loss": 1.3602, "step": 105 }, { "epoch": 0.02, "grad_norm": 3.4201546672414294, "learning_rate": 9.997404183839241e-06, "loss": 1.3675, "step": 110 }, { "epoch": 0.02, "grad_norm": 3.2960605448255067, "learning_rate": 9.997162860122313e-06, "loss": 1.3517, "step": 115 }, { "epoch": 0.02, "grad_norm": 2.774691284711409, "learning_rate": 9.996910815053027e-06, "loss": 1.3123, "step": 120 }, { "epoch": 0.02, "grad_norm": 2.7634540383113446, "learning_rate": 9.996648049172143e-06, "loss": 1.2638, "step": 125 }, { "epoch": 0.02, "grad_norm": 2.6841122836813094, "learning_rate": 9.996374563043422e-06, "loss": 1.3476, "step": 130 }, { "epoch": 0.03, "grad_norm": 3.2063540307974288, "learning_rate": 9.996090357253625e-06, "loss": 1.3489, "step": 135 }, { "epoch": 0.03, "grad_norm": 2.5801773637103023, "learning_rate": 9.995795432412513e-06, "loss": 1.3069, "step": 140 }, { "epoch": 0.03, "grad_norm": 2.860331572440809, "learning_rate": 9.995489789152844e-06, "loss": 1.3342, "step": 145 }, { "epoch": 0.03, "grad_norm": 2.9984719758607397, "learning_rate": 9.99517342813037e-06, "loss": 1.3705, "step": 150 }, { "epoch": 0.03, "grad_norm": 3.230814648227745, "learning_rate": 9.994846350023842e-06, "loss": 1.3816, "step": 155 }, { "epoch": 0.03, "grad_norm": 2.6882982691959696, "learning_rate": 9.994508555534999e-06, "loss": 1.2859, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.7701863434403355, "learning_rate": 9.994160045388576e-06, "loss": 1.264, "step": 165 }, { "epoch": 0.03, "grad_norm": 2.837077069983557, "learning_rate": 9.9938008203323e-06, "loss": 1.2557, "step": 170 }, { "epoch": 0.03, "grad_norm": 2.7592433235464915, "learning_rate": 9.993430881136883e-06, "loss": 1.3556, "step": 175 }, { "epoch": 0.03, "grad_norm": 2.618947907449918, "learning_rate": 9.993050228596021e-06, "loss": 1.3118, "step": 180 }, { "epoch": 0.03, "grad_norm": 3.2327219520967563, "learning_rate": 9.992658863526405e-06, "loss": 1.2953, "step": 185 }, { "epoch": 0.04, "grad_norm": 2.917865999401656, "learning_rate": 9.992256786767702e-06, "loss": 1.3146, "step": 190 }, { "epoch": 0.04, "grad_norm": 2.611540955710515, "learning_rate": 9.99184399918256e-06, "loss": 1.3002, "step": 195 }, { "epoch": 0.04, "grad_norm": 2.975342658891056, "learning_rate": 9.991420501656615e-06, "loss": 1.2903, "step": 200 }, { "epoch": 0.04, "grad_norm": 3.1866127183127277, "learning_rate": 9.990986295098472e-06, "loss": 1.3178, "step": 205 }, { "epoch": 0.04, "grad_norm": 3.1225984655510652, "learning_rate": 9.990541380439716e-06, "loss": 1.3263, "step": 210 }, { "epoch": 0.04, "grad_norm": 2.4693895609084078, "learning_rate": 9.990085758634907e-06, "loss": 1.2732, "step": 215 }, { "epoch": 0.04, "grad_norm": 3.529179960181463, "learning_rate": 9.989619430661576e-06, "loss": 1.2542, "step": 220 }, { "epoch": 0.04, "grad_norm": 2.769607377162276, "learning_rate": 9.989142397520225e-06, "loss": 1.2751, "step": 225 }, { "epoch": 0.04, "grad_norm": 2.6943100281562233, "learning_rate": 9.98865466023432e-06, "loss": 1.3003, "step": 230 }, { "epoch": 0.04, "grad_norm": 3.418579661198453, "learning_rate": 9.9881562198503e-06, "loss": 1.2373, "step": 235 }, { "epoch": 0.04, "grad_norm": 5.452061410443576, "learning_rate": 9.987647077437559e-06, "loss": 1.2977, "step": 240 }, { "epoch": 0.05, "grad_norm": 2.718344890262994, "learning_rate": 9.987127234088456e-06, "loss": 1.2952, "step": 245 }, { "epoch": 0.05, "grad_norm": 2.872673940218012, "learning_rate": 9.986596690918308e-06, "loss": 1.297, "step": 250 }, { "epoch": 0.05, "grad_norm": 4.033704916837394, "learning_rate": 9.986055449065394e-06, "loss": 1.2778, "step": 255 }, { "epoch": 0.05, "grad_norm": 2.6759149455624804, "learning_rate": 9.985503509690937e-06, "loss": 1.2974, "step": 260 }, { "epoch": 0.05, "grad_norm": 2.9791514906697585, "learning_rate": 9.984940873979116e-06, "loss": 1.3069, "step": 265 }, { "epoch": 0.05, "grad_norm": 2.7904291459421575, "learning_rate": 9.984367543137062e-06, "loss": 1.2749, "step": 270 }, { "epoch": 0.05, "grad_norm": 2.8139332500165652, "learning_rate": 9.98378351839485e-06, "loss": 1.2622, "step": 275 }, { "epoch": 0.05, "grad_norm": 3.057619432752788, "learning_rate": 9.983188801005492e-06, "loss": 1.2835, "step": 280 }, { "epoch": 0.05, "grad_norm": 2.794308837656347, "learning_rate": 9.982583392244954e-06, "loss": 1.278, "step": 285 }, { "epoch": 0.05, "grad_norm": 3.0284278220138034, "learning_rate": 9.981967293412128e-06, "loss": 1.2832, "step": 290 }, { "epoch": 0.06, "grad_norm": 2.5950675242644285, "learning_rate": 9.98134050582885e-06, "loss": 1.2421, "step": 295 }, { "epoch": 0.06, "grad_norm": 3.0925652328309123, "learning_rate": 9.980703030839884e-06, "loss": 1.2869, "step": 300 }, { "epoch": 0.06, "grad_norm": 2.6030703692470243, "learning_rate": 9.980054869812923e-06, "loss": 1.2463, "step": 305 }, { "epoch": 0.06, "grad_norm": 2.7199426489230674, "learning_rate": 9.979396024138593e-06, "loss": 1.2865, "step": 310 }, { "epoch": 0.06, "grad_norm": 2.8797052662634712, "learning_rate": 9.978726495230434e-06, "loss": 1.3188, "step": 315 }, { "epoch": 0.06, "grad_norm": 2.8935014338459344, "learning_rate": 9.978046284524917e-06, "loss": 1.3377, "step": 320 }, { "epoch": 0.06, "grad_norm": 2.779176704862691, "learning_rate": 9.977355393481423e-06, "loss": 1.3213, "step": 325 }, { "epoch": 0.06, "grad_norm": 2.6820244249317375, "learning_rate": 9.976653823582253e-06, "loss": 1.259, "step": 330 }, { "epoch": 0.06, "grad_norm": 3.0357143479356665, "learning_rate": 9.975941576332611e-06, "loss": 1.2904, "step": 335 }, { "epoch": 0.06, "grad_norm": 2.8019467918942915, "learning_rate": 9.975218653260621e-06, "loss": 1.3113, "step": 340 }, { "epoch": 0.06, "grad_norm": 2.3048841868601495, "learning_rate": 9.974485055917303e-06, "loss": 1.3132, "step": 345 }, { "epoch": 0.07, "grad_norm": 2.5081602045955065, "learning_rate": 9.973740785876582e-06, "loss": 1.3331, "step": 350 }, { "epoch": 0.07, "grad_norm": 2.679225954751873, "learning_rate": 9.97298584473528e-06, "loss": 1.3206, "step": 355 }, { "epoch": 0.07, "grad_norm": 2.7120364585174075, "learning_rate": 9.972220234113114e-06, "loss": 1.2441, "step": 360 }, { "epoch": 0.07, "grad_norm": 3.0678883501219705, "learning_rate": 9.971443955652692e-06, "loss": 1.2323, "step": 365 }, { "epoch": 0.07, "grad_norm": 2.5544818482029843, "learning_rate": 9.97065701101951e-06, "loss": 1.2432, "step": 370 }, { "epoch": 0.07, "grad_norm": 2.6731233037881257, "learning_rate": 9.969859401901948e-06, "loss": 1.2668, "step": 375 }, { "epoch": 0.07, "grad_norm": 2.804128919360195, "learning_rate": 9.969051130011268e-06, "loss": 1.3494, "step": 380 }, { "epoch": 0.07, "grad_norm": 3.079258704000447, "learning_rate": 9.968232197081609e-06, "loss": 1.2553, "step": 385 }, { "epoch": 0.07, "grad_norm": 2.7829735470494263, "learning_rate": 9.967402604869976e-06, "loss": 1.3145, "step": 390 }, { "epoch": 0.07, "grad_norm": 2.8363957159184623, "learning_rate": 9.966562355156254e-06, "loss": 1.2802, "step": 395 }, { "epoch": 0.07, "grad_norm": 3.090152084812277, "learning_rate": 9.965711449743187e-06, "loss": 1.3289, "step": 400 }, { "epoch": 0.08, "grad_norm": 2.8212887732560223, "learning_rate": 9.964849890456382e-06, "loss": 1.2879, "step": 405 }, { "epoch": 0.08, "grad_norm": 3.2716434155310394, "learning_rate": 9.963977679144304e-06, "loss": 1.311, "step": 410 }, { "epoch": 0.08, "grad_norm": 2.8169159980839464, "learning_rate": 9.963094817678273e-06, "loss": 1.2731, "step": 415 }, { "epoch": 0.08, "grad_norm": 3.1670684683630705, "learning_rate": 9.962201307952455e-06, "loss": 1.2683, "step": 420 }, { "epoch": 0.08, "grad_norm": 2.62866536431656, "learning_rate": 9.961297151883869e-06, "loss": 1.2799, "step": 425 }, { "epoch": 0.08, "grad_norm": 3.362938035312287, "learning_rate": 9.960382351412365e-06, "loss": 1.3027, "step": 430 }, { "epoch": 0.08, "grad_norm": 2.6267772082519825, "learning_rate": 9.95945690850064e-06, "loss": 1.2544, "step": 435 }, { "epoch": 0.08, "grad_norm": 2.773463036278164, "learning_rate": 9.958520825134221e-06, "loss": 1.2887, "step": 440 }, { "epoch": 0.08, "grad_norm": 2.911616224637602, "learning_rate": 9.957574103321462e-06, "loss": 1.2402, "step": 445 }, { "epoch": 0.08, "grad_norm": 2.395016720915871, "learning_rate": 9.956616745093541e-06, "loss": 1.2492, "step": 450 }, { "epoch": 0.08, "grad_norm": 2.4830984531007423, "learning_rate": 9.955648752504463e-06, "loss": 1.243, "step": 455 }, { "epoch": 0.09, "grad_norm": 2.8437350034534052, "learning_rate": 9.954670127631042e-06, "loss": 1.3052, "step": 460 }, { "epoch": 0.09, "grad_norm": 2.794782444316208, "learning_rate": 9.953680872572906e-06, "loss": 1.2655, "step": 465 }, { "epoch": 0.09, "grad_norm": 2.622763840389913, "learning_rate": 9.95268098945249e-06, "loss": 1.3403, "step": 470 }, { "epoch": 0.09, "grad_norm": 2.963439619215708, "learning_rate": 9.95167048041503e-06, "loss": 1.3105, "step": 475 }, { "epoch": 0.09, "grad_norm": 2.6699317361231447, "learning_rate": 9.950649347628564e-06, "loss": 1.255, "step": 480 }, { "epoch": 0.09, "grad_norm": 2.909028049137236, "learning_rate": 9.949617593283916e-06, "loss": 1.3104, "step": 485 }, { "epoch": 0.09, "grad_norm": 2.5792044118119613, "learning_rate": 9.948575219594704e-06, "loss": 1.2582, "step": 490 }, { "epoch": 0.09, "grad_norm": 2.981241252777049, "learning_rate": 9.94752222879733e-06, "loss": 1.3307, "step": 495 }, { "epoch": 0.09, "grad_norm": 2.8980802115787023, "learning_rate": 9.94645862315097e-06, "loss": 1.2995, "step": 500 }, { "epoch": 0.09, "grad_norm": 2.7552380730072485, "learning_rate": 9.945384404937581e-06, "loss": 1.2996, "step": 505 }, { "epoch": 0.1, "grad_norm": 3.182612212241565, "learning_rate": 9.94429957646188e-06, "loss": 1.3282, "step": 510 }, { "epoch": 0.1, "grad_norm": 2.893316970193462, "learning_rate": 9.943204140051357e-06, "loss": 1.289, "step": 515 }, { "epoch": 0.1, "grad_norm": 3.0853061089675307, "learning_rate": 9.942098098056255e-06, "loss": 1.3116, "step": 520 }, { "epoch": 0.1, "grad_norm": 3.2597008125913147, "learning_rate": 9.940981452849577e-06, "loss": 1.2695, "step": 525 }, { "epoch": 0.1, "grad_norm": 2.803486275623243, "learning_rate": 9.939854206827069e-06, "loss": 1.2988, "step": 530 }, { "epoch": 0.1, "grad_norm": 2.7856894140636626, "learning_rate": 9.938716362407226e-06, "loss": 1.2213, "step": 535 }, { "epoch": 0.1, "grad_norm": 3.1781516944496344, "learning_rate": 9.937567922031277e-06, "loss": 1.2921, "step": 540 }, { "epoch": 0.1, "grad_norm": 2.70081195002388, "learning_rate": 9.936408888163187e-06, "loss": 1.2939, "step": 545 }, { "epoch": 0.1, "grad_norm": 2.571513269019906, "learning_rate": 9.93523926328965e-06, "loss": 1.2787, "step": 550 }, { "epoch": 0.1, "grad_norm": 2.5347818693158395, "learning_rate": 9.934059049920085e-06, "loss": 1.2396, "step": 555 }, { "epoch": 0.1, "grad_norm": 2.927026090981205, "learning_rate": 9.932868250586619e-06, "loss": 1.3322, "step": 560 }, { "epoch": 0.11, "grad_norm": 3.0072023467515714, "learning_rate": 9.931666867844103e-06, "loss": 1.2505, "step": 565 }, { "epoch": 0.11, "grad_norm": 2.5798486811625816, "learning_rate": 9.930454904270087e-06, "loss": 1.3168, "step": 570 }, { "epoch": 0.11, "grad_norm": 2.606273714286121, "learning_rate": 9.929232362464825e-06, "loss": 1.2929, "step": 575 }, { "epoch": 0.11, "grad_norm": 2.862057851885314, "learning_rate": 9.927999245051263e-06, "loss": 1.2221, "step": 580 }, { "epoch": 0.11, "grad_norm": 3.0533100345061666, "learning_rate": 9.926755554675043e-06, "loss": 1.2824, "step": 585 }, { "epoch": 0.11, "grad_norm": 2.8268549184677196, "learning_rate": 9.925501294004485e-06, "loss": 1.2689, "step": 590 }, { "epoch": 0.11, "grad_norm": 2.9474244570654515, "learning_rate": 9.924236465730592e-06, "loss": 1.266, "step": 595 }, { "epoch": 0.11, "grad_norm": 2.5171703874147315, "learning_rate": 9.922961072567037e-06, "loss": 1.2945, "step": 600 }, { "epoch": 0.11, "grad_norm": 2.8323155471229073, "learning_rate": 9.92167511725016e-06, "loss": 1.2497, "step": 605 }, { "epoch": 0.11, "grad_norm": 2.6443126596211632, "learning_rate": 9.920378602538963e-06, "loss": 1.2388, "step": 610 }, { "epoch": 0.11, "grad_norm": 2.77248510868743, "learning_rate": 9.919071531215104e-06, "loss": 1.2655, "step": 615 }, { "epoch": 0.12, "grad_norm": 3.526397598316819, "learning_rate": 9.917753906082885e-06, "loss": 1.302, "step": 620 }, { "epoch": 0.12, "grad_norm": 3.015236540785339, "learning_rate": 9.916425729969259e-06, "loss": 1.2072, "step": 625 }, { "epoch": 0.12, "grad_norm": 2.8038695983344937, "learning_rate": 9.915087005723809e-06, "loss": 1.2924, "step": 630 }, { "epoch": 0.12, "grad_norm": 2.5898365904275034, "learning_rate": 9.913737736218751e-06, "loss": 1.2357, "step": 635 }, { "epoch": 0.12, "grad_norm": 2.4969045444120566, "learning_rate": 9.912377924348931e-06, "loss": 1.2601, "step": 640 }, { "epoch": 0.12, "grad_norm": 2.8599851914324357, "learning_rate": 9.911007573031803e-06, "loss": 1.2779, "step": 645 }, { "epoch": 0.12, "grad_norm": 2.53590231137642, "learning_rate": 9.909626685207444e-06, "loss": 1.2425, "step": 650 }, { "epoch": 0.12, "grad_norm": 2.7844674380752457, "learning_rate": 9.908235263838529e-06, "loss": 1.2628, "step": 655 }, { "epoch": 0.12, "grad_norm": 2.921537190563611, "learning_rate": 9.906833311910335e-06, "loss": 1.2918, "step": 660 }, { "epoch": 0.12, "grad_norm": 2.674269569684917, "learning_rate": 9.905420832430736e-06, "loss": 1.2317, "step": 665 }, { "epoch": 0.12, "grad_norm": 2.7972975778914493, "learning_rate": 9.903997828430188e-06, "loss": 1.2706, "step": 670 }, { "epoch": 0.13, "grad_norm": 3.0857826068674234, "learning_rate": 9.902564302961727e-06, "loss": 1.2953, "step": 675 }, { "epoch": 0.13, "grad_norm": 2.353182975912364, "learning_rate": 9.901120259100969e-06, "loss": 1.3039, "step": 680 }, { "epoch": 0.13, "grad_norm": 2.5050793033453953, "learning_rate": 9.89966569994609e-06, "loss": 1.2548, "step": 685 }, { "epoch": 0.13, "grad_norm": 2.8182846692065966, "learning_rate": 9.898200628617829e-06, "loss": 1.3022, "step": 690 }, { "epoch": 0.13, "grad_norm": 3.6265366655340845, "learning_rate": 9.896725048259478e-06, "loss": 1.241, "step": 695 }, { "epoch": 0.13, "grad_norm": 2.6434221137993625, "learning_rate": 9.895238962036878e-06, "loss": 1.275, "step": 700 }, { "epoch": 0.13, "grad_norm": 2.5173535448609283, "learning_rate": 9.893742373138408e-06, "loss": 1.2568, "step": 705 }, { "epoch": 0.13, "grad_norm": 2.7542295966472348, "learning_rate": 9.892235284774985e-06, "loss": 1.2868, "step": 710 }, { "epoch": 0.13, "grad_norm": 2.368577938425068, "learning_rate": 9.890717700180043e-06, "loss": 1.2245, "step": 715 }, { "epoch": 0.13, "grad_norm": 2.5278687487866884, "learning_rate": 9.889189622609545e-06, "loss": 1.2583, "step": 720 }, { "epoch": 0.14, "grad_norm": 2.702147632129278, "learning_rate": 9.887651055341961e-06, "loss": 1.2428, "step": 725 }, { "epoch": 0.14, "grad_norm": 2.35390476449974, "learning_rate": 9.886102001678271e-06, "loss": 1.282, "step": 730 }, { "epoch": 0.14, "grad_norm": 3.0658442781629756, "learning_rate": 9.884542464941948e-06, "loss": 1.29, "step": 735 }, { "epoch": 0.14, "grad_norm": 3.5482489583081733, "learning_rate": 9.882972448478962e-06, "loss": 1.2043, "step": 740 }, { "epoch": 0.14, "grad_norm": 2.8661887660145897, "learning_rate": 9.881391955657762e-06, "loss": 1.2325, "step": 745 }, { "epoch": 0.14, "grad_norm": 2.7066006373702063, "learning_rate": 9.879800989869277e-06, "loss": 1.2436, "step": 750 }, { "epoch": 0.14, "grad_norm": 2.7536179864413666, "learning_rate": 9.878199554526904e-06, "loss": 1.2886, "step": 755 }, { "epoch": 0.14, "grad_norm": 2.4756211385632683, "learning_rate": 9.876587653066504e-06, "loss": 1.3146, "step": 760 }, { "epoch": 0.14, "grad_norm": 2.678588724807592, "learning_rate": 9.874965288946395e-06, "loss": 1.2565, "step": 765 }, { "epoch": 0.14, "grad_norm": 2.6248055714027645, "learning_rate": 9.873332465647333e-06, "loss": 1.2838, "step": 770 }, { "epoch": 0.14, "grad_norm": 2.933089391512721, "learning_rate": 9.871689186672527e-06, "loss": 1.3046, "step": 775 }, { "epoch": 0.15, "grad_norm": 2.8054595245249194, "learning_rate": 9.870035455547607e-06, "loss": 1.3532, "step": 780 }, { "epoch": 0.15, "grad_norm": 3.029754362133909, "learning_rate": 9.868371275820636e-06, "loss": 1.2358, "step": 785 }, { "epoch": 0.15, "grad_norm": 2.381829122673505, "learning_rate": 9.866696651062092e-06, "loss": 1.2673, "step": 790 }, { "epoch": 0.15, "grad_norm": 2.9578049809020284, "learning_rate": 9.86501158486486e-06, "loss": 1.3057, "step": 795 }, { "epoch": 0.15, "grad_norm": 3.2079875550888652, "learning_rate": 9.86331608084423e-06, "loss": 1.2836, "step": 800 }, { "epoch": 0.15, "grad_norm": 2.7394380027483316, "learning_rate": 9.861610142637885e-06, "loss": 1.2967, "step": 805 }, { "epoch": 0.15, "grad_norm": 2.757020347162176, "learning_rate": 9.859893773905897e-06, "loss": 1.327, "step": 810 }, { "epoch": 0.15, "grad_norm": 2.7386946320924723, "learning_rate": 9.858166978330711e-06, "loss": 1.3324, "step": 815 }, { "epoch": 0.15, "grad_norm": 2.487600070733838, "learning_rate": 9.856429759617148e-06, "loss": 1.243, "step": 820 }, { "epoch": 0.15, "grad_norm": 2.4859094980688643, "learning_rate": 9.854682121492388e-06, "loss": 1.2884, "step": 825 }, { "epoch": 0.15, "grad_norm": 2.671185046699724, "learning_rate": 9.852924067705968e-06, "loss": 1.2814, "step": 830 }, { "epoch": 0.16, "grad_norm": 2.842082690479725, "learning_rate": 9.851155602029774e-06, "loss": 1.1763, "step": 835 }, { "epoch": 0.16, "grad_norm": 2.590003326493348, "learning_rate": 9.849376728258024e-06, "loss": 1.2732, "step": 840 }, { "epoch": 0.16, "grad_norm": 3.755687197612357, "learning_rate": 9.847587450207273e-06, "loss": 1.3001, "step": 845 }, { "epoch": 0.16, "grad_norm": 3.078220599557541, "learning_rate": 9.845787771716391e-06, "loss": 1.297, "step": 850 }, { "epoch": 0.16, "grad_norm": 2.8978172028530995, "learning_rate": 9.843977696646572e-06, "loss": 1.2412, "step": 855 }, { "epoch": 0.16, "grad_norm": 2.5403657098108647, "learning_rate": 9.842157228881305e-06, "loss": 1.2634, "step": 860 }, { "epoch": 0.16, "grad_norm": 2.95924694930617, "learning_rate": 9.840326372326386e-06, "loss": 1.2423, "step": 865 }, { "epoch": 0.16, "grad_norm": 2.709630812317427, "learning_rate": 9.838485130909892e-06, "loss": 1.3462, "step": 870 }, { "epoch": 0.16, "grad_norm": 2.8155472202298384, "learning_rate": 9.836633508582185e-06, "loss": 1.2516, "step": 875 }, { "epoch": 0.16, "grad_norm": 2.500070860342406, "learning_rate": 9.834771509315899e-06, "loss": 1.2849, "step": 880 }, { "epoch": 0.17, "grad_norm": 2.3553653961885845, "learning_rate": 9.832899137105931e-06, "loss": 1.2972, "step": 885 }, { "epoch": 0.17, "grad_norm": 2.739911552383276, "learning_rate": 9.831016395969432e-06, "loss": 1.2898, "step": 890 }, { "epoch": 0.17, "grad_norm": 2.7979814034906996, "learning_rate": 9.829123289945803e-06, "loss": 1.2352, "step": 895 }, { "epoch": 0.17, "grad_norm": 2.7756637703151266, "learning_rate": 9.827219823096677e-06, "loss": 1.2838, "step": 900 }, { "epoch": 0.17, "grad_norm": 3.147380497985643, "learning_rate": 9.82530599950592e-06, "loss": 1.3009, "step": 905 }, { "epoch": 0.17, "grad_norm": 2.6195588988426817, "learning_rate": 9.823381823279617e-06, "loss": 1.2542, "step": 910 }, { "epoch": 0.17, "grad_norm": 3.0294691695407687, "learning_rate": 9.821447298546063e-06, "loss": 1.2529, "step": 915 }, { "epoch": 0.17, "grad_norm": 3.0406219437265136, "learning_rate": 9.819502429455762e-06, "loss": 1.2788, "step": 920 }, { "epoch": 0.17, "grad_norm": 2.751447227490008, "learning_rate": 9.817547220181404e-06, "loss": 1.2474, "step": 925 }, { "epoch": 0.17, "grad_norm": 2.7920501109572946, "learning_rate": 9.815581674917866e-06, "loss": 1.2426, "step": 930 }, { "epoch": 0.17, "grad_norm": 2.702602399095983, "learning_rate": 9.813605797882204e-06, "loss": 1.2792, "step": 935 }, { "epoch": 0.18, "grad_norm": 2.5213037151582913, "learning_rate": 9.811619593313636e-06, "loss": 1.2396, "step": 940 }, { "epoch": 0.18, "grad_norm": 2.5410701879815263, "learning_rate": 9.809623065473544e-06, "loss": 1.332, "step": 945 }, { "epoch": 0.18, "grad_norm": 2.754615306176169, "learning_rate": 9.807616218645448e-06, "loss": 1.3013, "step": 950 }, { "epoch": 0.18, "grad_norm": 3.0527201906235515, "learning_rate": 9.805599057135017e-06, "loss": 1.2555, "step": 955 }, { "epoch": 0.18, "grad_norm": 2.7211770353555464, "learning_rate": 9.803571585270047e-06, "loss": 1.2898, "step": 960 }, { "epoch": 0.18, "grad_norm": 2.8451338452669073, "learning_rate": 9.801533807400455e-06, "loss": 1.3269, "step": 965 }, { "epoch": 0.18, "grad_norm": 2.375893987239719, "learning_rate": 9.799485727898265e-06, "loss": 1.2695, "step": 970 }, { "epoch": 0.18, "grad_norm": 2.749006570276283, "learning_rate": 9.79742735115761e-06, "loss": 1.2496, "step": 975 }, { "epoch": 0.18, "grad_norm": 2.505279680209459, "learning_rate": 9.795358681594712e-06, "loss": 1.2445, "step": 980 }, { "epoch": 0.18, "grad_norm": 2.582391358707717, "learning_rate": 9.793279723647874e-06, "loss": 1.2596, "step": 985 }, { "epoch": 0.18, "grad_norm": 2.5671931470502467, "learning_rate": 9.79119048177748e-06, "loss": 1.2719, "step": 990 }, { "epoch": 0.19, "grad_norm": 2.8817921609658854, "learning_rate": 9.789090960465968e-06, "loss": 1.2621, "step": 995 }, { "epoch": 0.19, "grad_norm": 3.0026356453735605, "learning_rate": 9.786981164217838e-06, "loss": 1.3099, "step": 1000 }, { "epoch": 0.19, "grad_norm": 2.575472157187206, "learning_rate": 9.784861097559632e-06, "loss": 1.255, "step": 1005 }, { "epoch": 0.19, "grad_norm": 2.8771804234712754, "learning_rate": 9.782730765039927e-06, "loss": 1.2492, "step": 1010 }, { "epoch": 0.19, "grad_norm": 2.7679377180963827, "learning_rate": 9.780590171229327e-06, "loss": 1.2866, "step": 1015 }, { "epoch": 0.19, "grad_norm": 2.616337211838346, "learning_rate": 9.778439320720448e-06, "loss": 1.2376, "step": 1020 }, { "epoch": 0.19, "grad_norm": 2.482034813414732, "learning_rate": 9.776278218127914e-06, "loss": 1.2445, "step": 1025 }, { "epoch": 0.19, "grad_norm": 2.7329915919806758, "learning_rate": 9.774106868088346e-06, "loss": 1.2521, "step": 1030 }, { "epoch": 0.19, "grad_norm": 2.680070713070951, "learning_rate": 9.771925275260348e-06, "loss": 1.3296, "step": 1035 }, { "epoch": 0.19, "grad_norm": 2.8977960820861597, "learning_rate": 9.769733444324501e-06, "loss": 1.2053, "step": 1040 }, { "epoch": 0.19, "grad_norm": 2.977796667138015, "learning_rate": 9.767531379983353e-06, "loss": 1.293, "step": 1045 }, { "epoch": 0.2, "grad_norm": 3.4348023333586823, "learning_rate": 9.765319086961405e-06, "loss": 1.2839, "step": 1050 }, { "epoch": 0.2, "grad_norm": 2.408646301347671, "learning_rate": 9.763096570005103e-06, "loss": 1.2475, "step": 1055 }, { "epoch": 0.2, "grad_norm": 2.4851841489693043, "learning_rate": 9.760863833882834e-06, "loss": 1.2663, "step": 1060 }, { "epoch": 0.2, "grad_norm": 2.862755821551652, "learning_rate": 9.758620883384906e-06, "loss": 1.3317, "step": 1065 }, { "epoch": 0.2, "grad_norm": 2.4831453533426915, "learning_rate": 9.756367723323538e-06, "loss": 1.2525, "step": 1070 }, { "epoch": 0.2, "grad_norm": 2.570370036777222, "learning_rate": 9.754104358532863e-06, "loss": 1.2796, "step": 1075 }, { "epoch": 0.2, "grad_norm": 2.583982409024191, "learning_rate": 9.7518307938689e-06, "loss": 1.3011, "step": 1080 }, { "epoch": 0.2, "grad_norm": 2.6843240127817904, "learning_rate": 9.749547034209552e-06, "loss": 1.2811, "step": 1085 }, { "epoch": 0.2, "grad_norm": 2.3089147503032557, "learning_rate": 9.747253084454601e-06, "loss": 1.2071, "step": 1090 }, { "epoch": 0.2, "grad_norm": 2.3584969392089796, "learning_rate": 9.74494894952569e-06, "loss": 1.2372, "step": 1095 }, { "epoch": 0.21, "grad_norm": 2.393274524242671, "learning_rate": 9.742634634366308e-06, "loss": 1.245, "step": 1100 }, { "epoch": 0.21, "grad_norm": 2.7920124624113787, "learning_rate": 9.740310143941792e-06, "loss": 1.3044, "step": 1105 }, { "epoch": 0.21, "grad_norm": 2.8358988005907597, "learning_rate": 9.73797548323931e-06, "loss": 1.2091, "step": 1110 }, { "epoch": 0.21, "grad_norm": 2.672057798702065, "learning_rate": 9.735630657267846e-06, "loss": 1.2464, "step": 1115 }, { "epoch": 0.21, "grad_norm": 2.9456854326651194, "learning_rate": 9.733275671058195e-06, "loss": 1.2838, "step": 1120 }, { "epoch": 0.21, "grad_norm": 2.8896784051857574, "learning_rate": 9.730910529662954e-06, "loss": 1.2381, "step": 1125 }, { "epoch": 0.21, "grad_norm": 2.769582071620601, "learning_rate": 9.728535238156504e-06, "loss": 1.3079, "step": 1130 }, { "epoch": 0.21, "grad_norm": 2.4653295633356302, "learning_rate": 9.726149801635002e-06, "loss": 1.278, "step": 1135 }, { "epoch": 0.21, "grad_norm": 2.941156988269561, "learning_rate": 9.723754225216378e-06, "loss": 1.2622, "step": 1140 }, { "epoch": 0.21, "grad_norm": 2.637713701891722, "learning_rate": 9.721348514040307e-06, "loss": 1.2717, "step": 1145 }, { "epoch": 0.21, "grad_norm": 2.7517262009540175, "learning_rate": 9.718932673268218e-06, "loss": 1.2582, "step": 1150 }, { "epoch": 0.22, "grad_norm": 2.9577500419309435, "learning_rate": 9.716506708083264e-06, "loss": 1.2748, "step": 1155 }, { "epoch": 0.22, "grad_norm": 3.090524775310035, "learning_rate": 9.714070623690326e-06, "loss": 1.2764, "step": 1160 }, { "epoch": 0.22, "grad_norm": 2.3932563623472327, "learning_rate": 9.711624425315993e-06, "loss": 1.2321, "step": 1165 }, { "epoch": 0.22, "grad_norm": 2.8559504713397175, "learning_rate": 9.709168118208553e-06, "loss": 1.2493, "step": 1170 }, { "epoch": 0.22, "grad_norm": 2.7551996217047545, "learning_rate": 9.706701707637984e-06, "loss": 1.2613, "step": 1175 }, { "epoch": 0.22, "grad_norm": 2.966463340002555, "learning_rate": 9.704225198895943e-06, "loss": 1.2348, "step": 1180 }, { "epoch": 0.22, "grad_norm": 2.5657038224244744, "learning_rate": 9.701738597295743e-06, "loss": 1.2136, "step": 1185 }, { "epoch": 0.22, "grad_norm": 2.5586347071410445, "learning_rate": 9.699241908172364e-06, "loss": 1.2914, "step": 1190 }, { "epoch": 0.22, "grad_norm": 2.5434709087968197, "learning_rate": 9.696735136882419e-06, "loss": 1.2659, "step": 1195 }, { "epoch": 0.22, "grad_norm": 2.7061026705057185, "learning_rate": 9.694218288804154e-06, "loss": 1.2609, "step": 1200 }, { "epoch": 0.22, "grad_norm": 2.396398638179384, "learning_rate": 9.691691369337439e-06, "loss": 1.2483, "step": 1205 }, { "epoch": 0.23, "grad_norm": 2.438971895626567, "learning_rate": 9.689154383903749e-06, "loss": 1.2568, "step": 1210 }, { "epoch": 0.23, "grad_norm": 2.801969219455209, "learning_rate": 9.686607337946155e-06, "loss": 1.189, "step": 1215 }, { "epoch": 0.23, "grad_norm": 2.715494965332097, "learning_rate": 9.684050236929309e-06, "loss": 1.2282, "step": 1220 }, { "epoch": 0.23, "grad_norm": 2.43406850408372, "learning_rate": 9.681483086339448e-06, "loss": 1.2654, "step": 1225 }, { "epoch": 0.23, "grad_norm": 2.637848687572683, "learning_rate": 9.678905891684359e-06, "loss": 1.2365, "step": 1230 }, { "epoch": 0.23, "grad_norm": 2.816634445745627, "learning_rate": 9.676318658493378e-06, "loss": 1.2892, "step": 1235 }, { "epoch": 0.23, "grad_norm": 2.59188291753139, "learning_rate": 9.673721392317388e-06, "loss": 1.2818, "step": 1240 }, { "epoch": 0.23, "grad_norm": 2.6589472617669214, "learning_rate": 9.67111409872879e-06, "loss": 1.2525, "step": 1245 }, { "epoch": 0.23, "grad_norm": 2.418049084852354, "learning_rate": 9.668496783321499e-06, "loss": 1.2528, "step": 1250 }, { "epoch": 0.23, "grad_norm": 2.6134422197628124, "learning_rate": 9.665869451710938e-06, "loss": 1.2489, "step": 1255 }, { "epoch": 0.23, "grad_norm": 2.8277369949630207, "learning_rate": 9.663232109534011e-06, "loss": 1.2425, "step": 1260 }, { "epoch": 0.24, "grad_norm": 2.6677083847071454, "learning_rate": 9.660584762449106e-06, "loss": 1.2528, "step": 1265 }, { "epoch": 0.24, "grad_norm": 2.901015204481418, "learning_rate": 9.657927416136072e-06, "loss": 1.2856, "step": 1270 }, { "epoch": 0.24, "grad_norm": 2.783653207223188, "learning_rate": 9.655260076296213e-06, "loss": 1.2194, "step": 1275 }, { "epoch": 0.24, "grad_norm": 2.5728825133639455, "learning_rate": 9.652582748652278e-06, "loss": 1.2669, "step": 1280 }, { "epoch": 0.24, "grad_norm": 2.730634926238819, "learning_rate": 9.649895438948438e-06, "loss": 1.2591, "step": 1285 }, { "epoch": 0.24, "grad_norm": 2.7910418990912684, "learning_rate": 9.647198152950285e-06, "loss": 1.2911, "step": 1290 }, { "epoch": 0.24, "grad_norm": 2.6279219550939485, "learning_rate": 9.644490896444812e-06, "loss": 1.2364, "step": 1295 }, { "epoch": 0.24, "grad_norm": 2.6929143816353007, "learning_rate": 9.641773675240408e-06, "loss": 1.2472, "step": 1300 }, { "epoch": 0.24, "grad_norm": 2.7126488640148536, "learning_rate": 9.639046495166833e-06, "loss": 1.2981, "step": 1305 }, { "epoch": 0.24, "grad_norm": 2.275713238425623, "learning_rate": 9.636309362075222e-06, "loss": 1.3011, "step": 1310 }, { "epoch": 0.25, "grad_norm": 2.813094238557557, "learning_rate": 9.63356228183806e-06, "loss": 1.2446, "step": 1315 }, { "epoch": 0.25, "grad_norm": 3.0356903685468164, "learning_rate": 9.630805260349176e-06, "loss": 1.3118, "step": 1320 }, { "epoch": 0.25, "grad_norm": 2.6472645381899635, "learning_rate": 9.628038303523725e-06, "loss": 1.2682, "step": 1325 }, { "epoch": 0.25, "grad_norm": 2.512006211451414, "learning_rate": 9.625261417298179e-06, "loss": 1.2472, "step": 1330 }, { "epoch": 0.25, "grad_norm": 2.488773622482499, "learning_rate": 9.622474607630315e-06, "loss": 1.2376, "step": 1335 }, { "epoch": 0.25, "grad_norm": 2.685018435556931, "learning_rate": 9.619677880499196e-06, "loss": 1.3117, "step": 1340 }, { "epoch": 0.25, "grad_norm": 2.5059232088783756, "learning_rate": 9.61687124190517e-06, "loss": 1.2424, "step": 1345 }, { "epoch": 0.25, "grad_norm": 3.0030330704239327, "learning_rate": 9.614054697869846e-06, "loss": 1.2325, "step": 1350 }, { "epoch": 0.25, "grad_norm": 2.7185700535574324, "learning_rate": 9.611228254436082e-06, "loss": 1.2709, "step": 1355 }, { "epoch": 0.25, "grad_norm": 2.6550063124078167, "learning_rate": 9.60839191766798e-06, "loss": 1.2554, "step": 1360 }, { "epoch": 0.25, "grad_norm": 2.3924990329834506, "learning_rate": 9.605545693650869e-06, "loss": 1.2411, "step": 1365 }, { "epoch": 0.26, "grad_norm": 2.9253114263002726, "learning_rate": 9.602689588491284e-06, "loss": 1.2349, "step": 1370 }, { "epoch": 0.26, "grad_norm": 3.1305134098499097, "learning_rate": 9.599823608316962e-06, "loss": 1.1982, "step": 1375 }, { "epoch": 0.26, "grad_norm": 2.494931249541086, "learning_rate": 9.596947759276835e-06, "loss": 1.2799, "step": 1380 }, { "epoch": 0.26, "grad_norm": 2.3982225168241644, "learning_rate": 9.594062047541e-06, "loss": 1.2257, "step": 1385 }, { "epoch": 0.26, "grad_norm": 2.586775286217663, "learning_rate": 9.591166479300714e-06, "loss": 1.2032, "step": 1390 }, { "epoch": 0.26, "grad_norm": 2.797391466557641, "learning_rate": 9.588261060768388e-06, "loss": 1.1992, "step": 1395 }, { "epoch": 0.26, "grad_norm": 3.2692807339475976, "learning_rate": 9.585345798177557e-06, "loss": 1.2955, "step": 1400 }, { "epoch": 0.26, "grad_norm": 2.4762035990234392, "learning_rate": 9.582420697782884e-06, "loss": 1.328, "step": 1405 }, { "epoch": 0.26, "grad_norm": 2.88773258550614, "learning_rate": 9.579485765860138e-06, "loss": 1.2713, "step": 1410 }, { "epoch": 0.26, "grad_norm": 2.537654166999561, "learning_rate": 9.576541008706177e-06, "loss": 1.2273, "step": 1415 }, { "epoch": 0.26, "grad_norm": 2.5099612429483527, "learning_rate": 9.573586432638947e-06, "loss": 1.251, "step": 1420 }, { "epoch": 0.27, "grad_norm": 2.4565468231653558, "learning_rate": 9.570622043997447e-06, "loss": 1.2887, "step": 1425 }, { "epoch": 0.27, "grad_norm": 2.7085428069726394, "learning_rate": 9.567647849141744e-06, "loss": 1.2744, "step": 1430 }, { "epoch": 0.27, "grad_norm": 2.649022261193647, "learning_rate": 9.564663854452933e-06, "loss": 1.3101, "step": 1435 }, { "epoch": 0.27, "grad_norm": 2.584187921682549, "learning_rate": 9.561670066333142e-06, "loss": 1.2482, "step": 1440 }, { "epoch": 0.27, "grad_norm": 2.6769977815059693, "learning_rate": 9.558666491205503e-06, "loss": 1.2361, "step": 1445 }, { "epoch": 0.27, "grad_norm": 2.463200002777827, "learning_rate": 9.555653135514152e-06, "loss": 1.2859, "step": 1450 }, { "epoch": 0.27, "grad_norm": 2.8051780394700536, "learning_rate": 9.552630005724209e-06, "loss": 1.2653, "step": 1455 }, { "epoch": 0.27, "grad_norm": 2.7025858761551316, "learning_rate": 9.549597108321759e-06, "loss": 1.1986, "step": 1460 }, { "epoch": 0.27, "grad_norm": 2.448947238174015, "learning_rate": 9.546554449813847e-06, "loss": 1.281, "step": 1465 }, { "epoch": 0.27, "grad_norm": 2.583763069734913, "learning_rate": 9.543502036728463e-06, "loss": 1.2564, "step": 1470 }, { "epoch": 0.28, "grad_norm": 2.8131660018770575, "learning_rate": 9.54043987561452e-06, "loss": 1.2581, "step": 1475 }, { "epoch": 0.28, "grad_norm": 2.611621933823635, "learning_rate": 9.537367973041847e-06, "loss": 1.244, "step": 1480 }, { "epoch": 0.28, "grad_norm": 2.725299231142621, "learning_rate": 9.534286335601175e-06, "loss": 1.2628, "step": 1485 }, { "epoch": 0.28, "grad_norm": 2.5387805053467734, "learning_rate": 9.53119496990412e-06, "loss": 1.2373, "step": 1490 }, { "epoch": 0.28, "grad_norm": 2.5030631132275074, "learning_rate": 9.52809388258317e-06, "loss": 1.307, "step": 1495 }, { "epoch": 0.28, "grad_norm": 2.867164940207714, "learning_rate": 9.524983080291667e-06, "loss": 1.247, "step": 1500 }, { "epoch": 0.28, "grad_norm": 2.7835274007016695, "learning_rate": 9.521862569703802e-06, "loss": 1.2873, "step": 1505 }, { "epoch": 0.28, "grad_norm": 2.4127815440210476, "learning_rate": 9.518732357514592e-06, "loss": 1.2288, "step": 1510 }, { "epoch": 0.28, "grad_norm": 2.490311452425188, "learning_rate": 9.515592450439872e-06, "loss": 1.2596, "step": 1515 }, { "epoch": 0.28, "grad_norm": 2.5853819055443448, "learning_rate": 9.512442855216274e-06, "loss": 1.2479, "step": 1520 }, { "epoch": 0.28, "grad_norm": 2.387948927852224, "learning_rate": 9.509283578601212e-06, "loss": 1.2669, "step": 1525 }, { "epoch": 0.29, "grad_norm": 3.0927210433687904, "learning_rate": 9.50611462737288e-06, "loss": 1.2585, "step": 1530 }, { "epoch": 0.29, "grad_norm": 2.467133574190019, "learning_rate": 9.502936008330222e-06, "loss": 1.2901, "step": 1535 }, { "epoch": 0.29, "grad_norm": 2.6739108212719427, "learning_rate": 9.499747728292928e-06, "loss": 1.2252, "step": 1540 }, { "epoch": 0.29, "grad_norm": 2.5205586130534834, "learning_rate": 9.496549794101413e-06, "loss": 1.2726, "step": 1545 }, { "epoch": 0.29, "grad_norm": 2.604524536073613, "learning_rate": 9.493342212616807e-06, "loss": 1.2466, "step": 1550 }, { "epoch": 0.29, "grad_norm": 2.4525722518831237, "learning_rate": 9.490124990720936e-06, "loss": 1.286, "step": 1555 }, { "epoch": 0.29, "grad_norm": 2.56958620162423, "learning_rate": 9.486898135316315e-06, "loss": 1.2298, "step": 1560 }, { "epoch": 0.29, "grad_norm": 2.9199217646471243, "learning_rate": 9.483661653326118e-06, "loss": 1.2662, "step": 1565 }, { "epoch": 0.29, "grad_norm": 2.764973994546948, "learning_rate": 9.48041555169418e-06, "loss": 1.2296, "step": 1570 }, { "epoch": 0.29, "grad_norm": 2.758405862424743, "learning_rate": 9.477159837384973e-06, "loss": 1.2885, "step": 1575 }, { "epoch": 0.29, "grad_norm": 2.8682635603750635, "learning_rate": 9.47389451738359e-06, "loss": 1.2252, "step": 1580 }, { "epoch": 0.3, "grad_norm": 2.796661944415381, "learning_rate": 9.470619598695739e-06, "loss": 1.3119, "step": 1585 }, { "epoch": 0.3, "grad_norm": 2.188147528697614, "learning_rate": 9.467335088347717e-06, "loss": 1.2391, "step": 1590 }, { "epoch": 0.3, "grad_norm": 2.779350124162447, "learning_rate": 9.4640409933864e-06, "loss": 1.2321, "step": 1595 }, { "epoch": 0.3, "grad_norm": 2.530631252303528, "learning_rate": 9.460737320879232e-06, "loss": 1.2691, "step": 1600 }, { "epoch": 0.3, "grad_norm": 2.418655276721412, "learning_rate": 9.4574240779142e-06, "loss": 1.2405, "step": 1605 }, { "epoch": 0.3, "grad_norm": 2.5254333263722573, "learning_rate": 9.454101271599823e-06, "loss": 1.2432, "step": 1610 }, { "epoch": 0.3, "grad_norm": 2.617128013213717, "learning_rate": 9.45076890906515e-06, "loss": 1.2873, "step": 1615 }, { "epoch": 0.3, "grad_norm": 2.4365283243913654, "learning_rate": 9.447426997459717e-06, "loss": 1.2496, "step": 1620 }, { "epoch": 0.3, "grad_norm": 2.659554267254664, "learning_rate": 9.444075543953559e-06, "loss": 1.2188, "step": 1625 }, { "epoch": 0.3, "grad_norm": 2.962413555940077, "learning_rate": 9.440714555737177e-06, "loss": 1.3516, "step": 1630 }, { "epoch": 0.3, "grad_norm": 2.5097728308245077, "learning_rate": 9.437344040021532e-06, "loss": 1.2091, "step": 1635 }, { "epoch": 0.31, "grad_norm": 2.659238025294085, "learning_rate": 9.433964004038023e-06, "loss": 1.2308, "step": 1640 }, { "epoch": 0.31, "grad_norm": 2.4819889409692504, "learning_rate": 9.430574455038477e-06, "loss": 1.1989, "step": 1645 }, { "epoch": 0.31, "grad_norm": 2.6088722399364057, "learning_rate": 9.427175400295132e-06, "loss": 1.2804, "step": 1650 }, { "epoch": 0.31, "grad_norm": 2.7713314708309613, "learning_rate": 9.423766847100615e-06, "loss": 1.269, "step": 1655 }, { "epoch": 0.31, "grad_norm": 2.4942039927782065, "learning_rate": 9.420348802767939e-06, "loss": 1.2649, "step": 1660 }, { "epoch": 0.31, "grad_norm": 2.5025712121307104, "learning_rate": 9.416921274630474e-06, "loss": 1.3082, "step": 1665 }, { "epoch": 0.31, "grad_norm": 2.5408229636779085, "learning_rate": 9.413484270041942e-06, "loss": 1.2438, "step": 1670 }, { "epoch": 0.31, "grad_norm": 2.945770428536271, "learning_rate": 9.410037796376395e-06, "loss": 1.2459, "step": 1675 }, { "epoch": 0.31, "grad_norm": 2.6674498043140993, "learning_rate": 9.406581861028199e-06, "loss": 1.2352, "step": 1680 }, { "epoch": 0.31, "grad_norm": 2.8933373067619743, "learning_rate": 9.40311647141202e-06, "loss": 1.2846, "step": 1685 }, { "epoch": 0.32, "grad_norm": 2.5378972208134, "learning_rate": 9.399641634962814e-06, "loss": 1.2392, "step": 1690 }, { "epoch": 0.32, "grad_norm": 2.905367330524848, "learning_rate": 9.396157359135797e-06, "loss": 1.2788, "step": 1695 }, { "epoch": 0.32, "grad_norm": 2.5530777021198086, "learning_rate": 9.392663651406442e-06, "loss": 1.2617, "step": 1700 }, { "epoch": 0.32, "grad_norm": 2.7538523238416275, "learning_rate": 9.389160519270455e-06, "loss": 1.2558, "step": 1705 }, { "epoch": 0.32, "grad_norm": 2.5680458173433918, "learning_rate": 9.385647970243765e-06, "loss": 1.27, "step": 1710 }, { "epoch": 0.32, "grad_norm": 2.980880170661701, "learning_rate": 9.382126011862504e-06, "loss": 1.2391, "step": 1715 }, { "epoch": 0.32, "grad_norm": 2.7881725751657913, "learning_rate": 9.378594651682988e-06, "loss": 1.2394, "step": 1720 }, { "epoch": 0.32, "grad_norm": 2.514336389736585, "learning_rate": 9.375053897281712e-06, "loss": 1.2701, "step": 1725 }, { "epoch": 0.32, "grad_norm": 2.5630163523861142, "learning_rate": 9.371503756255318e-06, "loss": 1.2299, "step": 1730 }, { "epoch": 0.32, "grad_norm": 2.8156056466588, "learning_rate": 9.367944236220592e-06, "loss": 1.2195, "step": 1735 }, { "epoch": 0.32, "grad_norm": 2.7751652875235866, "learning_rate": 9.36437534481444e-06, "loss": 1.2178, "step": 1740 }, { "epoch": 0.33, "grad_norm": 2.56505063504321, "learning_rate": 9.360797089693876e-06, "loss": 1.1842, "step": 1745 }, { "epoch": 0.33, "grad_norm": 2.528699332800653, "learning_rate": 9.357209478536004e-06, "loss": 1.2836, "step": 1750 }, { "epoch": 0.33, "grad_norm": 2.381664933642811, "learning_rate": 9.353612519037998e-06, "loss": 1.2564, "step": 1755 }, { "epoch": 0.33, "grad_norm": 2.2616227243734466, "learning_rate": 9.350006218917093e-06, "loss": 1.206, "step": 1760 }, { "epoch": 0.33, "grad_norm": 3.215182355715651, "learning_rate": 9.346390585910564e-06, "loss": 1.2162, "step": 1765 }, { "epoch": 0.33, "grad_norm": 2.4068984775708535, "learning_rate": 9.342765627775701e-06, "loss": 1.256, "step": 1770 }, { "epoch": 0.33, "grad_norm": 3.111790616048246, "learning_rate": 9.339131352289813e-06, "loss": 1.244, "step": 1775 }, { "epoch": 0.33, "grad_norm": 2.497956759599184, "learning_rate": 9.335487767250196e-06, "loss": 1.2527, "step": 1780 }, { "epoch": 0.33, "grad_norm": 2.8488643685890254, "learning_rate": 9.331834880474112e-06, "loss": 1.2307, "step": 1785 }, { "epoch": 0.33, "grad_norm": 2.5441328642850043, "learning_rate": 9.328172699798787e-06, "loss": 1.2508, "step": 1790 }, { "epoch": 0.33, "grad_norm": 2.905091377061353, "learning_rate": 9.32450123308139e-06, "loss": 1.2705, "step": 1795 }, { "epoch": 0.34, "grad_norm": 2.441237879674571, "learning_rate": 9.320820488199001e-06, "loss": 1.2472, "step": 1800 }, { "epoch": 0.34, "grad_norm": 2.478636338912326, "learning_rate": 9.317130473048619e-06, "loss": 1.2011, "step": 1805 }, { "epoch": 0.34, "grad_norm": 2.9292520527141965, "learning_rate": 9.313431195547125e-06, "loss": 1.2806, "step": 1810 }, { "epoch": 0.34, "grad_norm": 2.707691316832115, "learning_rate": 9.309722663631273e-06, "loss": 1.2255, "step": 1815 }, { "epoch": 0.34, "grad_norm": 2.6886924956046268, "learning_rate": 9.306004885257675e-06, "loss": 1.1939, "step": 1820 }, { "epoch": 0.34, "grad_norm": 2.537382921658179, "learning_rate": 9.302277868402778e-06, "loss": 1.2764, "step": 1825 }, { "epoch": 0.34, "grad_norm": 2.6757765536638063, "learning_rate": 9.298541621062852e-06, "loss": 1.2118, "step": 1830 }, { "epoch": 0.34, "grad_norm": 3.1550215206551084, "learning_rate": 9.29479615125397e-06, "loss": 1.2447, "step": 1835 }, { "epoch": 0.34, "grad_norm": 2.438740255473035, "learning_rate": 9.291041467011993e-06, "loss": 1.1826, "step": 1840 }, { "epoch": 0.34, "grad_norm": 2.351000695867108, "learning_rate": 9.287277576392553e-06, "loss": 1.2527, "step": 1845 }, { "epoch": 0.35, "grad_norm": 2.4930032778748723, "learning_rate": 9.283504487471025e-06, "loss": 1.2268, "step": 1850 }, { "epoch": 0.35, "grad_norm": 2.6221734670109296, "learning_rate": 9.279722208342531e-06, "loss": 1.2498, "step": 1855 }, { "epoch": 0.35, "grad_norm": 2.498523147654453, "learning_rate": 9.275930747121904e-06, "loss": 1.2466, "step": 1860 }, { "epoch": 0.35, "grad_norm": 3.063307098020864, "learning_rate": 9.272130111943677e-06, "loss": 1.2658, "step": 1865 }, { "epoch": 0.35, "grad_norm": 2.3256287817017895, "learning_rate": 9.268320310962067e-06, "loss": 1.2377, "step": 1870 }, { "epoch": 0.35, "grad_norm": 2.5407897240944775, "learning_rate": 9.264501352350956e-06, "loss": 1.2475, "step": 1875 }, { "epoch": 0.35, "grad_norm": 2.5140612955200616, "learning_rate": 9.260673244303876e-06, "loss": 1.2515, "step": 1880 }, { "epoch": 0.35, "grad_norm": 2.5361195689296188, "learning_rate": 9.25683599503398e-06, "loss": 1.2869, "step": 1885 }, { "epoch": 0.35, "grad_norm": 2.43917003129886, "learning_rate": 9.252989612774045e-06, "loss": 1.2741, "step": 1890 }, { "epoch": 0.35, "grad_norm": 2.4002807651389135, "learning_rate": 9.249134105776438e-06, "loss": 1.2317, "step": 1895 }, { "epoch": 0.35, "grad_norm": 2.8840395380192585, "learning_rate": 9.2452694823131e-06, "loss": 1.2351, "step": 1900 }, { "epoch": 0.36, "grad_norm": 2.4386304087371546, "learning_rate": 9.241395750675535e-06, "loss": 1.2785, "step": 1905 }, { "epoch": 0.36, "grad_norm": 2.6751487281863238, "learning_rate": 9.237512919174784e-06, "loss": 1.2338, "step": 1910 }, { "epoch": 0.36, "grad_norm": 2.607104816785263, "learning_rate": 9.233620996141421e-06, "loss": 1.2751, "step": 1915 }, { "epoch": 0.36, "grad_norm": 2.4848314069202497, "learning_rate": 9.229719989925516e-06, "loss": 1.2258, "step": 1920 }, { "epoch": 0.36, "grad_norm": 2.496832970791497, "learning_rate": 9.22580990889663e-06, "loss": 1.1904, "step": 1925 }, { "epoch": 0.36, "grad_norm": 2.751264104084154, "learning_rate": 9.221890761443797e-06, "loss": 1.2238, "step": 1930 }, { "epoch": 0.36, "grad_norm": 2.361121060540872, "learning_rate": 9.2179625559755e-06, "loss": 1.2844, "step": 1935 }, { "epoch": 0.36, "grad_norm": 2.336073008697271, "learning_rate": 9.214025300919654e-06, "loss": 1.2414, "step": 1940 }, { "epoch": 0.36, "grad_norm": 2.568163211952336, "learning_rate": 9.21007900472359e-06, "loss": 1.2547, "step": 1945 }, { "epoch": 0.36, "grad_norm": 2.065600385985879, "learning_rate": 9.206123675854046e-06, "loss": 1.2857, "step": 1950 }, { "epoch": 0.36, "grad_norm": 2.611039306343727, "learning_rate": 9.202159322797127e-06, "loss": 1.2271, "step": 1955 }, { "epoch": 0.37, "grad_norm": 2.5230585835313497, "learning_rate": 9.198185954058305e-06, "loss": 1.2082, "step": 1960 }, { "epoch": 0.37, "grad_norm": 2.8770021273735913, "learning_rate": 9.194203578162395e-06, "loss": 1.2284, "step": 1965 }, { "epoch": 0.37, "grad_norm": 2.981346910969795, "learning_rate": 9.190212203653536e-06, "loss": 1.2703, "step": 1970 }, { "epoch": 0.37, "grad_norm": 2.476339506574488, "learning_rate": 9.186211839095173e-06, "loss": 1.2289, "step": 1975 }, { "epoch": 0.37, "grad_norm": 2.716079606788581, "learning_rate": 9.182202493070042e-06, "loss": 1.2389, "step": 1980 }, { "epoch": 0.37, "grad_norm": 2.665652883562263, "learning_rate": 9.178184174180144e-06, "loss": 1.2321, "step": 1985 }, { "epoch": 0.37, "grad_norm": 2.523860194811177, "learning_rate": 9.174156891046735e-06, "loss": 1.2759, "step": 1990 }, { "epoch": 0.37, "grad_norm": 2.8324143392540377, "learning_rate": 9.170120652310302e-06, "loss": 1.2357, "step": 1995 }, { "epoch": 0.37, "grad_norm": 3.480803939898195, "learning_rate": 9.166075466630544e-06, "loss": 1.2853, "step": 2000 }, { "epoch": 0.37, "grad_norm": 2.534748993502781, "learning_rate": 9.16202134268636e-06, "loss": 1.2719, "step": 2005 }, { "epoch": 0.37, "grad_norm": 2.270094668164831, "learning_rate": 9.157958289175823e-06, "loss": 1.2653, "step": 2010 }, { "epoch": 0.38, "grad_norm": 2.7071336774041996, "learning_rate": 9.153886314816167e-06, "loss": 1.2794, "step": 2015 }, { "epoch": 0.38, "grad_norm": 2.5214070416956895, "learning_rate": 9.14980542834376e-06, "loss": 1.2486, "step": 2020 }, { "epoch": 0.38, "grad_norm": 2.1348372380758938, "learning_rate": 9.145715638514097e-06, "loss": 1.2588, "step": 2025 }, { "epoch": 0.38, "grad_norm": 2.3960872799493917, "learning_rate": 9.141616954101774e-06, "loss": 1.2476, "step": 2030 }, { "epoch": 0.38, "grad_norm": 2.3568100815228568, "learning_rate": 9.137509383900465e-06, "loss": 1.2592, "step": 2035 }, { "epoch": 0.38, "grad_norm": 2.7326127821248325, "learning_rate": 9.133392936722914e-06, "loss": 1.2395, "step": 2040 }, { "epoch": 0.38, "grad_norm": 2.4530898848045104, "learning_rate": 9.129267621400908e-06, "loss": 1.2348, "step": 2045 }, { "epoch": 0.38, "grad_norm": 2.5224023797948827, "learning_rate": 9.125133446785262e-06, "loss": 1.2263, "step": 2050 }, { "epoch": 0.38, "grad_norm": 2.582762768877811, "learning_rate": 9.120990421745798e-06, "loss": 1.2129, "step": 2055 }, { "epoch": 0.38, "grad_norm": 2.687243860565364, "learning_rate": 9.116838555171324e-06, "loss": 1.1914, "step": 2060 }, { "epoch": 0.39, "grad_norm": 2.541625235096613, "learning_rate": 9.112677855969621e-06, "loss": 1.2614, "step": 2065 }, { "epoch": 0.39, "grad_norm": 2.553790871372534, "learning_rate": 9.108508333067417e-06, "loss": 1.2676, "step": 2070 }, { "epoch": 0.39, "grad_norm": 2.5187101896462605, "learning_rate": 9.104329995410373e-06, "loss": 1.2615, "step": 2075 }, { "epoch": 0.39, "grad_norm": 2.472597277322467, "learning_rate": 9.100142851963065e-06, "loss": 1.2858, "step": 2080 }, { "epoch": 0.39, "grad_norm": 2.6967302147065566, "learning_rate": 9.095946911708956e-06, "loss": 1.251, "step": 2085 }, { "epoch": 0.39, "grad_norm": 2.6563237373548394, "learning_rate": 9.091742183650385e-06, "loss": 1.2245, "step": 2090 }, { "epoch": 0.39, "grad_norm": 2.8826970023720215, "learning_rate": 9.087528676808546e-06, "loss": 1.2095, "step": 2095 }, { "epoch": 0.39, "grad_norm": 2.758147091936185, "learning_rate": 9.083306400223465e-06, "loss": 1.2625, "step": 2100 }, { "epoch": 0.39, "grad_norm": 2.708953811891164, "learning_rate": 9.079075362953986e-06, "loss": 1.2543, "step": 2105 }, { "epoch": 0.39, "grad_norm": 2.7842323272866554, "learning_rate": 9.074835574077751e-06, "loss": 1.1901, "step": 2110 }, { "epoch": 0.39, "grad_norm": 2.3340312098539564, "learning_rate": 9.07058704269117e-06, "loss": 1.237, "step": 2115 }, { "epoch": 0.4, "grad_norm": 2.201022209456728, "learning_rate": 9.066329777909424e-06, "loss": 1.2393, "step": 2120 }, { "epoch": 0.4, "grad_norm": 2.9721479460201854, "learning_rate": 9.062063788866413e-06, "loss": 1.2357, "step": 2125 }, { "epoch": 0.4, "grad_norm": 2.477950514807996, "learning_rate": 9.057789084714772e-06, "loss": 1.2783, "step": 2130 }, { "epoch": 0.4, "grad_norm": 2.6801686742333914, "learning_rate": 9.053505674625824e-06, "loss": 1.2518, "step": 2135 }, { "epoch": 0.4, "grad_norm": 2.284459164256047, "learning_rate": 9.049213567789575e-06, "loss": 1.2178, "step": 2140 }, { "epoch": 0.4, "grad_norm": 2.5271344856024984, "learning_rate": 9.044912773414686e-06, "loss": 1.1993, "step": 2145 }, { "epoch": 0.4, "grad_norm": 2.4050161954609566, "learning_rate": 9.040603300728463e-06, "loss": 1.2511, "step": 2150 }, { "epoch": 0.4, "grad_norm": 2.7704594988994393, "learning_rate": 9.036285158976823e-06, "loss": 1.1996, "step": 2155 }, { "epoch": 0.4, "grad_norm": 2.468996954771303, "learning_rate": 9.031958357424288e-06, "loss": 1.2108, "step": 2160 }, { "epoch": 0.4, "grad_norm": 2.68457819828506, "learning_rate": 9.02762290535396e-06, "loss": 1.2252, "step": 2165 }, { "epoch": 0.4, "grad_norm": 2.3714659097894177, "learning_rate": 9.023278812067499e-06, "loss": 1.2419, "step": 2170 }, { "epoch": 0.41, "grad_norm": 2.8938246248834525, "learning_rate": 9.018926086885103e-06, "loss": 1.2567, "step": 2175 }, { "epoch": 0.41, "grad_norm": 2.5030355446737245, "learning_rate": 9.01456473914549e-06, "loss": 1.1975, "step": 2180 }, { "epoch": 0.41, "grad_norm": 2.5309960309803676, "learning_rate": 9.010194778205882e-06, "loss": 1.3214, "step": 2185 }, { "epoch": 0.41, "grad_norm": 2.392058513782499, "learning_rate": 9.005816213441974e-06, "loss": 1.2852, "step": 2190 }, { "epoch": 0.41, "grad_norm": 2.4868847080941237, "learning_rate": 9.001429054247926e-06, "loss": 1.2317, "step": 2195 }, { "epoch": 0.41, "grad_norm": 3.0581127639032872, "learning_rate": 8.997033310036335e-06, "loss": 1.2517, "step": 2200 }, { "epoch": 0.41, "grad_norm": 2.4248422954221525, "learning_rate": 8.992628990238215e-06, "loss": 1.2476, "step": 2205 }, { "epoch": 0.41, "grad_norm": 2.294214303014934, "learning_rate": 8.98821610430298e-06, "loss": 1.2481, "step": 2210 }, { "epoch": 0.41, "grad_norm": 2.826945710941023, "learning_rate": 8.983794661698427e-06, "loss": 1.262, "step": 2215 }, { "epoch": 0.41, "grad_norm": 2.513263172269817, "learning_rate": 8.979364671910705e-06, "loss": 1.2564, "step": 2220 }, { "epoch": 0.41, "grad_norm": 2.388911911230159, "learning_rate": 8.974926144444305e-06, "loss": 1.2869, "step": 2225 }, { "epoch": 0.42, "grad_norm": 2.776503134174594, "learning_rate": 8.970479088822032e-06, "loss": 1.2186, "step": 2230 }, { "epoch": 0.42, "grad_norm": 2.4937959196461836, "learning_rate": 8.96602351458499e-06, "loss": 1.2154, "step": 2235 }, { "epoch": 0.42, "grad_norm": 2.6962622054584355, "learning_rate": 8.961559431292562e-06, "loss": 1.2443, "step": 2240 }, { "epoch": 0.42, "grad_norm": 2.5956355309208545, "learning_rate": 8.957086848522381e-06, "loss": 1.2554, "step": 2245 }, { "epoch": 0.42, "grad_norm": 2.7594593072064195, "learning_rate": 8.952605775870323e-06, "loss": 1.2676, "step": 2250 }, { "epoch": 0.42, "grad_norm": 2.714032226160169, "learning_rate": 8.948116222950471e-06, "loss": 1.2117, "step": 2255 }, { "epoch": 0.42, "grad_norm": 2.9432371600954874, "learning_rate": 8.943618199395111e-06, "loss": 1.3193, "step": 2260 }, { "epoch": 0.42, "grad_norm": 2.5517853164601925, "learning_rate": 8.939111714854693e-06, "loss": 1.2122, "step": 2265 }, { "epoch": 0.42, "grad_norm": 2.337294494695929, "learning_rate": 8.934596778997831e-06, "loss": 1.2581, "step": 2270 }, { "epoch": 0.42, "grad_norm": 2.6000119365477756, "learning_rate": 8.93007340151126e-06, "loss": 1.2595, "step": 2275 }, { "epoch": 0.43, "grad_norm": 2.491496550690855, "learning_rate": 8.925541592099832e-06, "loss": 1.2675, "step": 2280 }, { "epoch": 0.43, "grad_norm": 2.9739305294116725, "learning_rate": 8.92100136048649e-06, "loss": 1.2279, "step": 2285 }, { "epoch": 0.43, "grad_norm": 2.697112416494898, "learning_rate": 8.916452716412248e-06, "loss": 1.2281, "step": 2290 }, { "epoch": 0.43, "grad_norm": 2.9089535182374626, "learning_rate": 8.911895669636164e-06, "loss": 1.234, "step": 2295 }, { "epoch": 0.43, "grad_norm": 2.381662264130669, "learning_rate": 8.907330229935327e-06, "loss": 1.2221, "step": 2300 }, { "epoch": 0.43, "grad_norm": 2.5219995591332633, "learning_rate": 8.902756407104836e-06, "loss": 1.2423, "step": 2305 }, { "epoch": 0.43, "grad_norm": 2.354949055377429, "learning_rate": 8.898174210957768e-06, "loss": 1.2576, "step": 2310 }, { "epoch": 0.43, "grad_norm": 2.7733201062503143, "learning_rate": 8.89358365132517e-06, "loss": 1.2588, "step": 2315 }, { "epoch": 0.43, "grad_norm": 2.7596778146792897, "learning_rate": 8.888984738056035e-06, "loss": 1.249, "step": 2320 }, { "epoch": 0.43, "grad_norm": 2.892780169742038, "learning_rate": 8.884377481017273e-06, "loss": 1.2028, "step": 2325 }, { "epoch": 0.43, "grad_norm": 2.606206110122719, "learning_rate": 8.8797618900937e-06, "loss": 1.1971, "step": 2330 }, { "epoch": 0.44, "grad_norm": 2.7230716452282238, "learning_rate": 8.875137975188008e-06, "loss": 1.2664, "step": 2335 }, { "epoch": 0.44, "grad_norm": 2.774643601960243, "learning_rate": 8.87050574622075e-06, "loss": 1.1894, "step": 2340 }, { "epoch": 0.44, "grad_norm": 2.5722147356763716, "learning_rate": 8.865865213130322e-06, "loss": 1.2562, "step": 2345 }, { "epoch": 0.44, "grad_norm": 2.5301045858382305, "learning_rate": 8.861216385872926e-06, "loss": 1.2211, "step": 2350 }, { "epoch": 0.44, "grad_norm": 2.5336960944371865, "learning_rate": 8.85655927442257e-06, "loss": 1.2428, "step": 2355 }, { "epoch": 0.44, "grad_norm": 2.9864398074133374, "learning_rate": 8.851893888771024e-06, "loss": 1.2466, "step": 2360 }, { "epoch": 0.44, "grad_norm": 2.4719774947680957, "learning_rate": 8.84722023892782e-06, "loss": 1.2146, "step": 2365 }, { "epoch": 0.44, "grad_norm": 2.7847705036136907, "learning_rate": 8.842538334920217e-06, "loss": 1.2395, "step": 2370 }, { "epoch": 0.44, "grad_norm": 2.7115968083330313, "learning_rate": 8.837848186793182e-06, "loss": 1.2483, "step": 2375 }, { "epoch": 0.44, "grad_norm": 2.498538404354106, "learning_rate": 8.833149804609372e-06, "loss": 1.2014, "step": 2380 }, { "epoch": 0.44, "grad_norm": 3.0546576296254027, "learning_rate": 8.828443198449109e-06, "loss": 1.2135, "step": 2385 }, { "epoch": 0.45, "grad_norm": 2.8485505002879803, "learning_rate": 8.82372837841036e-06, "loss": 1.2318, "step": 2390 }, { "epoch": 0.45, "grad_norm": 2.571389509311176, "learning_rate": 8.819005354608712e-06, "loss": 1.1785, "step": 2395 }, { "epoch": 0.45, "grad_norm": 2.7341603256447065, "learning_rate": 8.814274137177359e-06, "loss": 1.2882, "step": 2400 }, { "epoch": 0.45, "grad_norm": 2.38673842882881, "learning_rate": 8.809534736267068e-06, "loss": 1.2341, "step": 2405 }, { "epoch": 0.45, "grad_norm": 2.5157200762234413, "learning_rate": 8.804787162046165e-06, "loss": 1.226, "step": 2410 }, { "epoch": 0.45, "grad_norm": 2.4512225231854035, "learning_rate": 8.800031424700518e-06, "loss": 1.3022, "step": 2415 }, { "epoch": 0.45, "grad_norm": 3.018409907978064, "learning_rate": 8.795267534433497e-06, "loss": 1.2353, "step": 2420 }, { "epoch": 0.45, "grad_norm": 2.336307047503812, "learning_rate": 8.790495501465976e-06, "loss": 1.244, "step": 2425 }, { "epoch": 0.45, "grad_norm": 2.6173817118831764, "learning_rate": 8.78571533603629e-06, "loss": 1.2408, "step": 2430 }, { "epoch": 0.45, "grad_norm": 3.082589792469222, "learning_rate": 8.780927048400231e-06, "loss": 1.2414, "step": 2435 }, { "epoch": 0.46, "grad_norm": 2.6145325587133597, "learning_rate": 8.77613064883101e-06, "loss": 1.1795, "step": 2440 }, { "epoch": 0.46, "grad_norm": 2.3165502789649275, "learning_rate": 8.771326147619243e-06, "loss": 1.2152, "step": 2445 }, { "epoch": 0.46, "grad_norm": 2.5107426100104853, "learning_rate": 8.766513555072929e-06, "loss": 1.2438, "step": 2450 }, { "epoch": 0.46, "grad_norm": 2.3252364646501835, "learning_rate": 8.761692881517428e-06, "loss": 1.206, "step": 2455 }, { "epoch": 0.46, "grad_norm": 2.369573589297014, "learning_rate": 8.756864137295436e-06, "loss": 1.2475, "step": 2460 }, { "epoch": 0.46, "grad_norm": 2.4304676652220514, "learning_rate": 8.752027332766965e-06, "loss": 1.2198, "step": 2465 }, { "epoch": 0.46, "grad_norm": 3.029950181599185, "learning_rate": 8.747182478309321e-06, "loss": 1.2473, "step": 2470 }, { "epoch": 0.46, "grad_norm": 2.3276833420238536, "learning_rate": 8.742329584317082e-06, "loss": 1.2786, "step": 2475 }, { "epoch": 0.46, "grad_norm": 2.5955319528618688, "learning_rate": 8.73746866120207e-06, "loss": 1.2491, "step": 2480 }, { "epoch": 0.46, "grad_norm": 2.271857973340684, "learning_rate": 8.732599719393341e-06, "loss": 1.28, "step": 2485 }, { "epoch": 0.46, "grad_norm": 2.4977245907120675, "learning_rate": 8.727722769337147e-06, "loss": 1.2397, "step": 2490 }, { "epoch": 0.47, "grad_norm": 2.389057171322813, "learning_rate": 8.722837821496925e-06, "loss": 1.2456, "step": 2495 }, { "epoch": 0.47, "grad_norm": 2.6678877706649873, "learning_rate": 8.717944886353276e-06, "loss": 1.2869, "step": 2500 }, { "epoch": 0.47, "grad_norm": 2.4737604548224987, "learning_rate": 8.713043974403928e-06, "loss": 1.2634, "step": 2505 }, { "epoch": 0.47, "grad_norm": 2.5974619042954354, "learning_rate": 8.708135096163728e-06, "loss": 1.2797, "step": 2510 }, { "epoch": 0.47, "grad_norm": 2.4140156710174474, "learning_rate": 8.70321826216462e-06, "loss": 1.2333, "step": 2515 }, { "epoch": 0.47, "grad_norm": 2.735433147498583, "learning_rate": 8.698293482955605e-06, "loss": 1.2615, "step": 2520 }, { "epoch": 0.47, "grad_norm": 2.717654768808532, "learning_rate": 8.693360769102743e-06, "loss": 1.2237, "step": 2525 }, { "epoch": 0.47, "grad_norm": 2.436937033475975, "learning_rate": 8.688420131189111e-06, "loss": 1.2109, "step": 2530 }, { "epoch": 0.47, "grad_norm": 2.3731683394062744, "learning_rate": 8.683471579814782e-06, "loss": 1.2538, "step": 2535 }, { "epoch": 0.47, "grad_norm": 2.423610268780273, "learning_rate": 8.67851512559682e-06, "loss": 1.2546, "step": 2540 }, { "epoch": 0.47, "grad_norm": 2.484873214909234, "learning_rate": 8.673550779169236e-06, "loss": 1.1855, "step": 2545 }, { "epoch": 0.48, "grad_norm": 2.4105445019685785, "learning_rate": 8.668578551182972e-06, "loss": 1.2437, "step": 2550 }, { "epoch": 0.48, "grad_norm": 2.3073902118692793, "learning_rate": 8.663598452305884e-06, "loss": 1.1848, "step": 2555 }, { "epoch": 0.48, "grad_norm": 2.485153537624552, "learning_rate": 8.658610493222714e-06, "loss": 1.203, "step": 2560 }, { "epoch": 0.48, "grad_norm": 2.552191735957582, "learning_rate": 8.65361468463507e-06, "loss": 1.2728, "step": 2565 }, { "epoch": 0.48, "grad_norm": 2.517770218713454, "learning_rate": 8.648611037261397e-06, "loss": 1.2595, "step": 2570 }, { "epoch": 0.48, "grad_norm": 2.520499429177203, "learning_rate": 8.643599561836958e-06, "loss": 1.2534, "step": 2575 }, { "epoch": 0.48, "grad_norm": 2.682134328033429, "learning_rate": 8.638580269113817e-06, "loss": 1.2171, "step": 2580 }, { "epoch": 0.48, "grad_norm": 2.855280608865029, "learning_rate": 8.6335531698608e-06, "loss": 1.3234, "step": 2585 }, { "epoch": 0.48, "grad_norm": 2.4942587653229733, "learning_rate": 8.628518274863494e-06, "loss": 1.1928, "step": 2590 }, { "epoch": 0.48, "grad_norm": 2.4240939844890934, "learning_rate": 8.623475594924202e-06, "loss": 1.2482, "step": 2595 }, { "epoch": 0.48, "grad_norm": 2.643886198160013, "learning_rate": 8.61842514086193e-06, "loss": 1.2556, "step": 2600 }, { "epoch": 0.49, "grad_norm": 3.0430371576127198, "learning_rate": 8.61336692351237e-06, "loss": 1.2242, "step": 2605 }, { "epoch": 0.49, "grad_norm": 2.914966940001587, "learning_rate": 8.608300953727865e-06, "loss": 1.2378, "step": 2610 }, { "epoch": 0.49, "grad_norm": 2.0564710269311353, "learning_rate": 8.603227242377392e-06, "loss": 1.2633, "step": 2615 }, { "epoch": 0.49, "grad_norm": 2.664747765412154, "learning_rate": 8.598145800346537e-06, "loss": 1.2316, "step": 2620 }, { "epoch": 0.49, "grad_norm": 2.590698026636923, "learning_rate": 8.593056638537472e-06, "loss": 1.2533, "step": 2625 }, { "epoch": 0.49, "grad_norm": 2.720013435674094, "learning_rate": 8.58795976786893e-06, "loss": 1.2326, "step": 2630 }, { "epoch": 0.49, "grad_norm": 2.587306223223574, "learning_rate": 8.582855199276189e-06, "loss": 1.2626, "step": 2635 }, { "epoch": 0.49, "grad_norm": 2.396094522080749, "learning_rate": 8.577742943711037e-06, "loss": 1.2044, "step": 2640 }, { "epoch": 0.49, "grad_norm": 2.632459801832667, "learning_rate": 8.572623012141756e-06, "loss": 1.2194, "step": 2645 }, { "epoch": 0.49, "grad_norm": 2.5802264347271486, "learning_rate": 8.5674954155531e-06, "loss": 1.2498, "step": 2650 }, { "epoch": 0.5, "grad_norm": 2.326993564179618, "learning_rate": 8.562360164946261e-06, "loss": 1.2104, "step": 2655 }, { "epoch": 0.5, "grad_norm": 2.4358311112489166, "learning_rate": 8.55721727133886e-06, "loss": 1.2509, "step": 2660 }, { "epoch": 0.5, "grad_norm": 2.432075090103631, "learning_rate": 8.552066745764911e-06, "loss": 1.1837, "step": 2665 }, { "epoch": 0.5, "grad_norm": 2.3984144854914065, "learning_rate": 8.546908599274807e-06, "loss": 1.2726, "step": 2670 }, { "epoch": 0.5, "grad_norm": 2.5690105626973687, "learning_rate": 8.541742842935286e-06, "loss": 1.2625, "step": 2675 }, { "epoch": 0.5, "grad_norm": 2.5583136858764464, "learning_rate": 8.536569487829416e-06, "loss": 1.263, "step": 2680 }, { "epoch": 0.5, "grad_norm": 2.429437667412415, "learning_rate": 8.531388545056569e-06, "loss": 1.2279, "step": 2685 }, { "epoch": 0.5, "grad_norm": 2.6678935500473546, "learning_rate": 8.526200025732395e-06, "loss": 1.2647, "step": 2690 }, { "epoch": 0.5, "grad_norm": 2.6772674553916387, "learning_rate": 8.521003940988796e-06, "loss": 1.2381, "step": 2695 }, { "epoch": 0.5, "grad_norm": 2.6232131162921357, "learning_rate": 8.515800301973916e-06, "loss": 1.2192, "step": 2700 }, { "epoch": 0.5, "grad_norm": 2.508584977632273, "learning_rate": 8.51058911985209e-06, "loss": 1.2486, "step": 2705 }, { "epoch": 0.51, "grad_norm": 2.4972313904885626, "learning_rate": 8.505370405803854e-06, "loss": 1.2121, "step": 2710 }, { "epoch": 0.51, "grad_norm": 2.876194235114713, "learning_rate": 8.50014417102589e-06, "loss": 1.2434, "step": 2715 }, { "epoch": 0.51, "grad_norm": 2.4126645230813786, "learning_rate": 8.494910426731026e-06, "loss": 1.1677, "step": 2720 }, { "epoch": 0.51, "grad_norm": 2.842765954994177, "learning_rate": 8.489669184148196e-06, "loss": 1.3138, "step": 2725 }, { "epoch": 0.51, "grad_norm": 2.7561841143747348, "learning_rate": 8.48442045452242e-06, "loss": 1.2539, "step": 2730 }, { "epoch": 0.51, "grad_norm": 2.4533831653730243, "learning_rate": 8.479164249114787e-06, "loss": 1.21, "step": 2735 }, { "epoch": 0.51, "grad_norm": 3.0366686445653324, "learning_rate": 8.473900579202419e-06, "loss": 1.262, "step": 2740 }, { "epoch": 0.51, "grad_norm": 2.511447651995392, "learning_rate": 8.468629456078456e-06, "loss": 1.2612, "step": 2745 }, { "epoch": 0.51, "grad_norm": 2.5858036767898906, "learning_rate": 8.463350891052032e-06, "loss": 1.2127, "step": 2750 }, { "epoch": 0.51, "grad_norm": 2.856917866346965, "learning_rate": 8.458064895448242e-06, "loss": 1.2439, "step": 2755 }, { "epoch": 0.51, "grad_norm": 2.8503466560810464, "learning_rate": 8.452771480608126e-06, "loss": 1.2404, "step": 2760 }, { "epoch": 0.52, "grad_norm": 2.724946308949215, "learning_rate": 8.447470657888642e-06, "loss": 1.2191, "step": 2765 }, { "epoch": 0.52, "grad_norm": 3.0293778238961124, "learning_rate": 8.442162438662637e-06, "loss": 1.2632, "step": 2770 }, { "epoch": 0.52, "grad_norm": 2.3428716544118227, "learning_rate": 8.436846834318836e-06, "loss": 1.2473, "step": 2775 }, { "epoch": 0.52, "grad_norm": 2.4162322848129127, "learning_rate": 8.431523856261803e-06, "loss": 1.2763, "step": 2780 }, { "epoch": 0.52, "grad_norm": 2.3173281978902582, "learning_rate": 8.426193515911922e-06, "loss": 1.2339, "step": 2785 }, { "epoch": 0.52, "grad_norm": 2.41881861854618, "learning_rate": 8.420855824705372e-06, "loss": 1.2428, "step": 2790 }, { "epoch": 0.52, "grad_norm": 2.4540386228110567, "learning_rate": 8.415510794094108e-06, "loss": 1.251, "step": 2795 }, { "epoch": 0.52, "grad_norm": 2.7791688006237814, "learning_rate": 8.410158435545825e-06, "loss": 1.2333, "step": 2800 }, { "epoch": 0.52, "grad_norm": 2.2050497673913547, "learning_rate": 8.404798760543947e-06, "loss": 1.2943, "step": 2805 }, { "epoch": 0.52, "grad_norm": 2.513060240203504, "learning_rate": 8.399431780587588e-06, "loss": 1.2542, "step": 2810 }, { "epoch": 0.52, "grad_norm": 2.4705441230982337, "learning_rate": 8.39405750719154e-06, "loss": 1.2266, "step": 2815 }, { "epoch": 0.53, "grad_norm": 2.3602531470830628, "learning_rate": 8.388675951886242e-06, "loss": 1.1894, "step": 2820 }, { "epoch": 0.53, "grad_norm": 2.5625025420117247, "learning_rate": 8.383287126217756e-06, "loss": 1.2414, "step": 2825 }, { "epoch": 0.53, "grad_norm": 2.270771349307043, "learning_rate": 8.377891041747742e-06, "loss": 1.2786, "step": 2830 }, { "epoch": 0.53, "grad_norm": 2.457830364346707, "learning_rate": 8.372487710053433e-06, "loss": 1.275, "step": 2835 }, { "epoch": 0.53, "grad_norm": 2.5075337353947913, "learning_rate": 8.367077142727612e-06, "loss": 1.2273, "step": 2840 }, { "epoch": 0.53, "grad_norm": 2.864871208968174, "learning_rate": 8.361659351378585e-06, "loss": 1.1916, "step": 2845 }, { "epoch": 0.53, "grad_norm": 2.478234689243336, "learning_rate": 8.35623434763016e-06, "loss": 1.2507, "step": 2850 }, { "epoch": 0.53, "grad_norm": 2.6409866161664124, "learning_rate": 8.350802143121614e-06, "loss": 1.1816, "step": 2855 }, { "epoch": 0.53, "grad_norm": 2.708641058912023, "learning_rate": 8.345362749507677e-06, "loss": 1.1989, "step": 2860 }, { "epoch": 0.53, "grad_norm": 2.512060087338582, "learning_rate": 8.339916178458502e-06, "loss": 1.2063, "step": 2865 }, { "epoch": 0.54, "grad_norm": 2.312677281400599, "learning_rate": 8.334462441659644e-06, "loss": 1.2793, "step": 2870 }, { "epoch": 0.54, "grad_norm": 2.583586314697303, "learning_rate": 8.329001550812024e-06, "loss": 1.2189, "step": 2875 }, { "epoch": 0.54, "grad_norm": 2.463821721761164, "learning_rate": 8.32353351763192e-06, "loss": 1.2308, "step": 2880 }, { "epoch": 0.54, "grad_norm": 2.3329831134769887, "learning_rate": 8.31805835385093e-06, "loss": 1.3068, "step": 2885 }, { "epoch": 0.54, "grad_norm": 2.3890079406752887, "learning_rate": 8.312576071215954e-06, "loss": 1.2802, "step": 2890 }, { "epoch": 0.54, "grad_norm": 2.4337592685605864, "learning_rate": 8.30708668148916e-06, "loss": 1.2084, "step": 2895 }, { "epoch": 0.54, "grad_norm": 2.979980545845011, "learning_rate": 8.301590196447968e-06, "loss": 1.2169, "step": 2900 }, { "epoch": 0.54, "grad_norm": 2.4713288910964226, "learning_rate": 8.296086627885022e-06, "loss": 1.24, "step": 2905 }, { "epoch": 0.54, "grad_norm": 2.57678209683771, "learning_rate": 8.29057598760816e-06, "loss": 1.2183, "step": 2910 }, { "epoch": 0.54, "grad_norm": 2.388983411954069, "learning_rate": 8.285058287440393e-06, "loss": 1.2159, "step": 2915 }, { "epoch": 0.54, "grad_norm": 2.5856899395384207, "learning_rate": 8.279533539219881e-06, "loss": 1.2423, "step": 2920 }, { "epoch": 0.55, "grad_norm": 2.656085681106573, "learning_rate": 8.274001754799902e-06, "loss": 1.2425, "step": 2925 }, { "epoch": 0.55, "grad_norm": 2.687407245664408, "learning_rate": 8.268462946048839e-06, "loss": 1.1885, "step": 2930 }, { "epoch": 0.55, "grad_norm": 2.602375749001676, "learning_rate": 8.262917124850132e-06, "loss": 1.2539, "step": 2935 }, { "epoch": 0.55, "grad_norm": 2.5032059088059926, "learning_rate": 8.257364303102275e-06, "loss": 1.2215, "step": 2940 }, { "epoch": 0.55, "grad_norm": 2.1354807044438715, "learning_rate": 8.251804492718781e-06, "loss": 1.1698, "step": 2945 }, { "epoch": 0.55, "grad_norm": 2.8821604335776976, "learning_rate": 8.246237705628156e-06, "loss": 1.2858, "step": 2950 }, { "epoch": 0.55, "grad_norm": 2.808397596971899, "learning_rate": 8.240663953773874e-06, "loss": 1.2568, "step": 2955 }, { "epoch": 0.55, "grad_norm": 2.5791493113582433, "learning_rate": 8.23508324911435e-06, "loss": 1.2067, "step": 2960 }, { "epoch": 0.55, "grad_norm": 2.7351925838304054, "learning_rate": 8.229495603622922e-06, "loss": 1.2393, "step": 2965 }, { "epoch": 0.55, "grad_norm": 2.937782581862869, "learning_rate": 8.223901029287811e-06, "loss": 1.219, "step": 2970 }, { "epoch": 0.55, "grad_norm": 2.5782397077136476, "learning_rate": 8.218299538112113e-06, "loss": 1.2006, "step": 2975 }, { "epoch": 0.56, "grad_norm": 2.6098996864666257, "learning_rate": 8.212691142113759e-06, "loss": 1.2895, "step": 2980 }, { "epoch": 0.56, "grad_norm": 2.3838592097882865, "learning_rate": 8.20707585332549e-06, "loss": 1.2064, "step": 2985 }, { "epoch": 0.56, "grad_norm": 2.9170817488913077, "learning_rate": 8.201453683794844e-06, "loss": 1.2256, "step": 2990 }, { "epoch": 0.56, "grad_norm": 2.7415094542817506, "learning_rate": 8.195824645584115e-06, "loss": 1.2449, "step": 2995 }, { "epoch": 0.56, "grad_norm": 3.020101156694181, "learning_rate": 8.19018875077034e-06, "loss": 1.196, "step": 3000 }, { "epoch": 0.56, "grad_norm": 3.5256819382363416, "learning_rate": 8.184546011445262e-06, "loss": 1.2331, "step": 3005 }, { "epoch": 0.56, "grad_norm": 2.489265808403552, "learning_rate": 8.178896439715308e-06, "loss": 1.2924, "step": 3010 }, { "epoch": 0.56, "grad_norm": 2.4229165953443896, "learning_rate": 8.173240047701566e-06, "loss": 1.2201, "step": 3015 }, { "epoch": 0.56, "grad_norm": 2.2691244384078706, "learning_rate": 8.167576847539756e-06, "loss": 1.2383, "step": 3020 }, { "epoch": 0.56, "grad_norm": 2.811907668419909, "learning_rate": 8.161906851380209e-06, "loss": 1.2604, "step": 3025 }, { "epoch": 0.57, "grad_norm": 2.259428143639486, "learning_rate": 8.156230071387829e-06, "loss": 1.2208, "step": 3030 }, { "epoch": 0.57, "grad_norm": 2.8094443482735585, "learning_rate": 8.15054651974208e-06, "loss": 1.2036, "step": 3035 }, { "epoch": 0.57, "grad_norm": 2.7986050897572725, "learning_rate": 8.144856208636953e-06, "loss": 1.1986, "step": 3040 }, { "epoch": 0.57, "grad_norm": 2.3585431474571235, "learning_rate": 8.139159150280942e-06, "loss": 1.1997, "step": 3045 }, { "epoch": 0.57, "grad_norm": 2.555014380109738, "learning_rate": 8.133455356897017e-06, "loss": 1.1899, "step": 3050 }, { "epoch": 0.57, "grad_norm": 2.5930294103690983, "learning_rate": 8.127744840722596e-06, "loss": 1.2354, "step": 3055 }, { "epoch": 0.57, "grad_norm": 2.4164140212051173, "learning_rate": 8.122027614009525e-06, "loss": 1.1931, "step": 3060 }, { "epoch": 0.57, "grad_norm": 2.4544780551977095, "learning_rate": 8.11630368902404e-06, "loss": 1.2359, "step": 3065 }, { "epoch": 0.57, "grad_norm": 2.6112887233948108, "learning_rate": 8.11057307804676e-06, "loss": 1.2432, "step": 3070 }, { "epoch": 0.57, "grad_norm": 2.550549949346128, "learning_rate": 8.104835793372634e-06, "loss": 1.2374, "step": 3075 }, { "epoch": 0.57, "grad_norm": 2.5394631647393333, "learning_rate": 8.09909184731094e-06, "loss": 1.2879, "step": 3080 }, { "epoch": 0.58, "grad_norm": 2.448831569746075, "learning_rate": 8.093341252185244e-06, "loss": 1.2475, "step": 3085 }, { "epoch": 0.58, "grad_norm": 2.262631971760282, "learning_rate": 8.087584020333378e-06, "loss": 1.2495, "step": 3090 }, { "epoch": 0.58, "grad_norm": 2.3580312532472107, "learning_rate": 8.081820164107417e-06, "loss": 1.2066, "step": 3095 }, { "epoch": 0.58, "grad_norm": 2.280906364318545, "learning_rate": 8.076049695873637e-06, "loss": 1.2622, "step": 3100 }, { "epoch": 0.58, "grad_norm": 2.717321347834505, "learning_rate": 8.070272628012517e-06, "loss": 1.2626, "step": 3105 }, { "epoch": 0.58, "grad_norm": 2.745888808242339, "learning_rate": 8.064488972918678e-06, "loss": 1.2383, "step": 3110 }, { "epoch": 0.58, "grad_norm": 2.4430976141986522, "learning_rate": 8.058698743000885e-06, "loss": 1.2229, "step": 3115 }, { "epoch": 0.58, "grad_norm": 2.2964872809414674, "learning_rate": 8.052901950682007e-06, "loss": 1.1775, "step": 3120 }, { "epoch": 0.58, "grad_norm": 2.601409079456989, "learning_rate": 8.047098608398991e-06, "loss": 1.2243, "step": 3125 }, { "epoch": 0.58, "grad_norm": 2.6030241980938924, "learning_rate": 8.041288728602838e-06, "loss": 1.2544, "step": 3130 }, { "epoch": 0.58, "grad_norm": 2.4914596853603257, "learning_rate": 8.035472323758572e-06, "loss": 1.2529, "step": 3135 }, { "epoch": 0.59, "grad_norm": 2.5770189045893295, "learning_rate": 8.029649406345223e-06, "loss": 1.1977, "step": 3140 }, { "epoch": 0.59, "grad_norm": 2.631376359022823, "learning_rate": 8.023819988855788e-06, "loss": 1.1996, "step": 3145 }, { "epoch": 0.59, "grad_norm": 2.1749751309010317, "learning_rate": 8.01798408379721e-06, "loss": 1.178, "step": 3150 }, { "epoch": 0.59, "grad_norm": 2.7989592102512493, "learning_rate": 8.012141703690356e-06, "loss": 1.2529, "step": 3155 }, { "epoch": 0.59, "grad_norm": 2.4294484853415645, "learning_rate": 8.006292861069977e-06, "loss": 1.2502, "step": 3160 }, { "epoch": 0.59, "grad_norm": 2.5272009149226746, "learning_rate": 8.0004375684847e-06, "loss": 1.2616, "step": 3165 }, { "epoch": 0.59, "grad_norm": 2.5718843710944066, "learning_rate": 7.994575838496979e-06, "loss": 1.2543, "step": 3170 }, { "epoch": 0.59, "grad_norm": 2.4527756457949077, "learning_rate": 7.988707683683088e-06, "loss": 1.2024, "step": 3175 }, { "epoch": 0.59, "grad_norm": 2.2354347723383285, "learning_rate": 7.98283311663308e-06, "loss": 1.2409, "step": 3180 }, { "epoch": 0.59, "grad_norm": 2.526326729932712, "learning_rate": 7.976952149950768e-06, "loss": 1.2212, "step": 3185 }, { "epoch": 0.59, "grad_norm": 2.372828024861733, "learning_rate": 7.971064796253695e-06, "loss": 1.2728, "step": 3190 }, { "epoch": 0.6, "grad_norm": 2.487614641155206, "learning_rate": 7.965171068173108e-06, "loss": 1.2172, "step": 3195 }, { "epoch": 0.6, "grad_norm": 2.567329023677344, "learning_rate": 7.959270978353928e-06, "loss": 1.1692, "step": 3200 }, { "epoch": 0.6, "grad_norm": 2.157718620065115, "learning_rate": 7.953364539454724e-06, "loss": 1.2415, "step": 3205 }, { "epoch": 0.6, "grad_norm": 2.8667488031824813, "learning_rate": 7.947451764147694e-06, "loss": 1.2713, "step": 3210 }, { "epoch": 0.6, "grad_norm": 2.599094361368479, "learning_rate": 7.941532665118621e-06, "loss": 1.2505, "step": 3215 }, { "epoch": 0.6, "grad_norm": 2.4541586898227084, "learning_rate": 7.935607255066867e-06, "loss": 1.2747, "step": 3220 }, { "epoch": 0.6, "grad_norm": 2.3108705882965324, "learning_rate": 7.929675546705318e-06, "loss": 1.2838, "step": 3225 }, { "epoch": 0.6, "grad_norm": 2.6954248407554617, "learning_rate": 7.923737552760389e-06, "loss": 1.2476, "step": 3230 }, { "epoch": 0.6, "grad_norm": 2.5820318599293217, "learning_rate": 7.917793285971972e-06, "loss": 1.1464, "step": 3235 }, { "epoch": 0.6, "grad_norm": 2.5389808963239773, "learning_rate": 7.91184275909342e-06, "loss": 1.2377, "step": 3240 }, { "epoch": 0.61, "grad_norm": 2.4877167669233318, "learning_rate": 7.905885984891514e-06, "loss": 1.2184, "step": 3245 }, { "epoch": 0.61, "grad_norm": 2.892266466172481, "learning_rate": 7.899922976146446e-06, "loss": 1.2468, "step": 3250 }, { "epoch": 0.61, "grad_norm": 2.463904687153827, "learning_rate": 7.893953745651774e-06, "loss": 1.2562, "step": 3255 }, { "epoch": 0.61, "grad_norm": 2.5775003087085153, "learning_rate": 7.887978306214412e-06, "loss": 1.2282, "step": 3260 }, { "epoch": 0.61, "grad_norm": 2.5006547798451093, "learning_rate": 7.881996670654593e-06, "loss": 1.172, "step": 3265 }, { "epoch": 0.61, "grad_norm": 2.5863204612482757, "learning_rate": 7.876008851805841e-06, "loss": 1.3017, "step": 3270 }, { "epoch": 0.61, "grad_norm": 2.4859534188574264, "learning_rate": 7.870014862514954e-06, "loss": 1.2004, "step": 3275 }, { "epoch": 0.61, "grad_norm": 2.773160463069392, "learning_rate": 7.864014715641958e-06, "loss": 1.3237, "step": 3280 }, { "epoch": 0.61, "grad_norm": 2.787324849208085, "learning_rate": 7.858008424060098e-06, "loss": 1.2493, "step": 3285 }, { "epoch": 0.61, "grad_norm": 2.5342367849847203, "learning_rate": 7.8519960006558e-06, "loss": 1.1987, "step": 3290 }, { "epoch": 0.61, "grad_norm": 2.613544873848067, "learning_rate": 7.845977458328641e-06, "loss": 1.2352, "step": 3295 }, { "epoch": 0.62, "grad_norm": 2.331618258739045, "learning_rate": 7.83995280999134e-06, "loss": 1.2313, "step": 3300 }, { "epoch": 0.62, "grad_norm": 2.474266299250581, "learning_rate": 7.833922068569699e-06, "loss": 1.2009, "step": 3305 }, { "epoch": 0.62, "grad_norm": 2.480106249358808, "learning_rate": 7.827885247002604e-06, "loss": 1.1864, "step": 3310 }, { "epoch": 0.62, "grad_norm": 2.3278379337755157, "learning_rate": 7.82184235824198e-06, "loss": 1.1984, "step": 3315 }, { "epoch": 0.62, "grad_norm": 2.61292859257405, "learning_rate": 7.815793415252776e-06, "loss": 1.1849, "step": 3320 }, { "epoch": 0.62, "grad_norm": 2.292538348374208, "learning_rate": 7.809738431012922e-06, "loss": 1.2256, "step": 3325 }, { "epoch": 0.62, "grad_norm": 2.481732800973384, "learning_rate": 7.803677418513314e-06, "loss": 1.221, "step": 3330 }, { "epoch": 0.62, "grad_norm": 2.6567638646549647, "learning_rate": 7.797610390757781e-06, "loss": 1.2608, "step": 3335 }, { "epoch": 0.62, "grad_norm": 2.7081940886811267, "learning_rate": 7.79153736076306e-06, "loss": 1.2218, "step": 3340 }, { "epoch": 0.62, "grad_norm": 2.6863450330567775, "learning_rate": 7.785458341558759e-06, "loss": 1.271, "step": 3345 }, { "epoch": 0.62, "grad_norm": 2.4418830984759747, "learning_rate": 7.779373346187343e-06, "loss": 1.2338, "step": 3350 }, { "epoch": 0.63, "grad_norm": 2.5393616241756836, "learning_rate": 7.773282387704095e-06, "loss": 1.2133, "step": 3355 }, { "epoch": 0.63, "grad_norm": 2.1693849381009347, "learning_rate": 7.767185479177092e-06, "loss": 1.2466, "step": 3360 }, { "epoch": 0.63, "grad_norm": 2.3461814758289066, "learning_rate": 7.76108263368718e-06, "loss": 1.2427, "step": 3365 }, { "epoch": 0.63, "grad_norm": 2.483322016918312, "learning_rate": 7.754973864327934e-06, "loss": 1.2986, "step": 3370 }, { "epoch": 0.63, "grad_norm": 2.327833315357301, "learning_rate": 7.748859184205653e-06, "loss": 1.1808, "step": 3375 }, { "epoch": 0.63, "grad_norm": 2.529809653474871, "learning_rate": 7.742738606439304e-06, "loss": 1.2513, "step": 3380 }, { "epoch": 0.63, "grad_norm": 2.9085797113080685, "learning_rate": 7.73661214416051e-06, "loss": 1.2094, "step": 3385 }, { "epoch": 0.63, "grad_norm": 2.3983677621797654, "learning_rate": 7.730479810513527e-06, "loss": 1.2382, "step": 3390 }, { "epoch": 0.63, "grad_norm": 2.3326323104199544, "learning_rate": 7.724341618655201e-06, "loss": 1.2556, "step": 3395 }, { "epoch": 0.63, "grad_norm": 2.564858311955747, "learning_rate": 7.718197581754947e-06, "loss": 1.2206, "step": 3400 }, { "epoch": 0.63, "grad_norm": 2.5755523825131617, "learning_rate": 7.712047712994722e-06, "loss": 1.2795, "step": 3405 }, { "epoch": 0.64, "grad_norm": 2.734228499086126, "learning_rate": 7.705892025568996e-06, "loss": 1.2526, "step": 3410 }, { "epoch": 0.64, "grad_norm": 2.618647964492142, "learning_rate": 7.69973053268472e-06, "loss": 1.2267, "step": 3415 }, { "epoch": 0.64, "grad_norm": 2.4211951168322323, "learning_rate": 7.693563247561302e-06, "loss": 1.1891, "step": 3420 }, { "epoch": 0.64, "grad_norm": 2.956630395418801, "learning_rate": 7.68739018343058e-06, "loss": 1.2446, "step": 3425 }, { "epoch": 0.64, "grad_norm": 2.3367616810534484, "learning_rate": 7.681211353536787e-06, "loss": 1.2206, "step": 3430 }, { "epoch": 0.64, "grad_norm": 2.375902773902396, "learning_rate": 7.675026771136528e-06, "loss": 1.2322, "step": 3435 }, { "epoch": 0.64, "grad_norm": 3.5013401997908384, "learning_rate": 7.668836449498749e-06, "loss": 1.196, "step": 3440 }, { "epoch": 0.64, "grad_norm": 2.7544164936127697, "learning_rate": 7.662640401904709e-06, "loss": 1.2775, "step": 3445 }, { "epoch": 0.64, "grad_norm": 2.33908987713843, "learning_rate": 7.656438641647956e-06, "loss": 1.2146, "step": 3450 }, { "epoch": 0.64, "grad_norm": 2.427538531906867, "learning_rate": 7.650231182034289e-06, "loss": 1.2559, "step": 3455 }, { "epoch": 0.65, "grad_norm": 2.850404299913776, "learning_rate": 7.64401803638174e-06, "loss": 1.1813, "step": 3460 }, { "epoch": 0.65, "grad_norm": 2.3597140412277695, "learning_rate": 7.637799218020533e-06, "loss": 1.1822, "step": 3465 }, { "epoch": 0.65, "grad_norm": 2.917103092601735, "learning_rate": 7.631574740293073e-06, "loss": 1.2315, "step": 3470 }, { "epoch": 0.65, "grad_norm": 2.682869811949341, "learning_rate": 7.625344616553897e-06, "loss": 1.2308, "step": 3475 }, { "epoch": 0.65, "grad_norm": 2.515419590465464, "learning_rate": 7.6191088601696615e-06, "loss": 1.2428, "step": 3480 }, { "epoch": 0.65, "grad_norm": 2.233078474751225, "learning_rate": 7.612867484519107e-06, "loss": 1.2121, "step": 3485 }, { "epoch": 0.65, "grad_norm": 2.23514466267859, "learning_rate": 7.6066205029930294e-06, "loss": 1.2103, "step": 3490 }, { "epoch": 0.65, "grad_norm": 2.831634837573794, "learning_rate": 7.600367928994252e-06, "loss": 1.303, "step": 3495 }, { "epoch": 0.65, "grad_norm": 2.7631860527584355, "learning_rate": 7.594109775937595e-06, "loss": 1.2173, "step": 3500 }, { "epoch": 0.65, "grad_norm": 2.5870696417201717, "learning_rate": 7.58784605724985e-06, "loss": 1.2141, "step": 3505 }, { "epoch": 0.65, "grad_norm": 2.34215008768121, "learning_rate": 7.581576786369752e-06, "loss": 1.2298, "step": 3510 }, { "epoch": 0.66, "grad_norm": 2.1049228123123487, "learning_rate": 7.575301976747944e-06, "loss": 1.2195, "step": 3515 }, { "epoch": 0.66, "grad_norm": 2.8693762742995124, "learning_rate": 7.569021641846954e-06, "loss": 1.2206, "step": 3520 }, { "epoch": 0.66, "grad_norm": 2.551123234988086, "learning_rate": 7.562735795141163e-06, "loss": 1.185, "step": 3525 }, { "epoch": 0.66, "grad_norm": 2.4250229123964004, "learning_rate": 7.556444450116782e-06, "loss": 1.1781, "step": 3530 }, { "epoch": 0.66, "grad_norm": 2.4284380080427117, "learning_rate": 7.550147620271812e-06, "loss": 1.2299, "step": 3535 }, { "epoch": 0.66, "grad_norm": 2.3780483099018777, "learning_rate": 7.54384531911603e-06, "loss": 1.2534, "step": 3540 }, { "epoch": 0.66, "grad_norm": 2.7264232568865983, "learning_rate": 7.537537560170941e-06, "loss": 1.1865, "step": 3545 }, { "epoch": 0.66, "grad_norm": 2.5294854459143767, "learning_rate": 7.53122435696977e-06, "loss": 1.2579, "step": 3550 }, { "epoch": 0.66, "grad_norm": 2.2660161661863563, "learning_rate": 7.5249057230574155e-06, "loss": 1.2119, "step": 3555 }, { "epoch": 0.66, "grad_norm": 3.2233305983948206, "learning_rate": 7.51858167199043e-06, "loss": 1.1961, "step": 3560 }, { "epoch": 0.66, "grad_norm": 2.3319770150325603, "learning_rate": 7.512252217336989e-06, "loss": 1.2339, "step": 3565 }, { "epoch": 0.67, "grad_norm": 3.071379491685929, "learning_rate": 7.505917372676863e-06, "loss": 1.2428, "step": 3570 }, { "epoch": 0.67, "grad_norm": 2.5041481151057856, "learning_rate": 7.4995771516013815e-06, "loss": 1.2326, "step": 3575 }, { "epoch": 0.67, "grad_norm": 2.4707186038211626, "learning_rate": 7.4932315677134105e-06, "loss": 1.2469, "step": 3580 }, { "epoch": 0.67, "grad_norm": 2.4784712914427525, "learning_rate": 7.486880634627328e-06, "loss": 1.2581, "step": 3585 }, { "epoch": 0.67, "grad_norm": 2.8602362997757402, "learning_rate": 7.480524365968981e-06, "loss": 1.2216, "step": 3590 }, { "epoch": 0.67, "grad_norm": 2.5401997745842486, "learning_rate": 7.474162775375667e-06, "loss": 1.2347, "step": 3595 }, { "epoch": 0.67, "grad_norm": 2.3472339553819674, "learning_rate": 7.467795876496101e-06, "loss": 1.2321, "step": 3600 }, { "epoch": 0.67, "grad_norm": 2.7093269313095365, "learning_rate": 7.4614236829903894e-06, "loss": 1.2661, "step": 3605 }, { "epoch": 0.67, "grad_norm": 2.7812103887052766, "learning_rate": 7.455046208529996e-06, "loss": 1.1655, "step": 3610 }, { "epoch": 0.67, "grad_norm": 3.376729838029134, "learning_rate": 7.448663466797713e-06, "loss": 1.1841, "step": 3615 }, { "epoch": 0.68, "grad_norm": 2.5747589195664333, "learning_rate": 7.442275471487636e-06, "loss": 1.2376, "step": 3620 }, { "epoch": 0.68, "grad_norm": 2.451020523839669, "learning_rate": 7.435882236305133e-06, "loss": 1.206, "step": 3625 }, { "epoch": 0.68, "grad_norm": 2.245081634477842, "learning_rate": 7.4294837749668114e-06, "loss": 1.1919, "step": 3630 }, { "epoch": 0.68, "grad_norm": 2.4840846987847653, "learning_rate": 7.4230801012004925e-06, "loss": 1.2257, "step": 3635 }, { "epoch": 0.68, "grad_norm": 2.278915066123373, "learning_rate": 7.416671228745181e-06, "loss": 1.1602, "step": 3640 }, { "epoch": 0.68, "grad_norm": 2.4499028758210275, "learning_rate": 7.410257171351034e-06, "loss": 1.221, "step": 3645 }, { "epoch": 0.68, "grad_norm": 3.6861762079284337, "learning_rate": 7.403837942779337e-06, "loss": 1.2513, "step": 3650 }, { "epoch": 0.68, "grad_norm": 2.8481514636316234, "learning_rate": 7.397413556802464e-06, "loss": 1.1957, "step": 3655 }, { "epoch": 0.68, "grad_norm": 2.2520010448465255, "learning_rate": 7.390984027203858e-06, "loss": 1.2156, "step": 3660 }, { "epoch": 0.68, "grad_norm": 2.2144209219898223, "learning_rate": 7.3845493677779975e-06, "loss": 1.1903, "step": 3665 }, { "epoch": 0.68, "grad_norm": 2.5534532157329597, "learning_rate": 7.378109592330367e-06, "loss": 1.2322, "step": 3670 }, { "epoch": 0.69, "grad_norm": 2.3019311785840046, "learning_rate": 7.371664714677424e-06, "loss": 1.212, "step": 3675 }, { "epoch": 0.69, "grad_norm": 2.635004055428559, "learning_rate": 7.365214748646577e-06, "loss": 1.2254, "step": 3680 }, { "epoch": 0.69, "grad_norm": 4.04336948549475, "learning_rate": 7.3587597080761505e-06, "loss": 1.1767, "step": 3685 }, { "epoch": 0.69, "grad_norm": 2.521375182264338, "learning_rate": 7.352299606815356e-06, "loss": 1.2269, "step": 3690 }, { "epoch": 0.69, "grad_norm": 2.3893615350098143, "learning_rate": 7.345834458724262e-06, "loss": 1.2566, "step": 3695 }, { "epoch": 0.69, "grad_norm": 3.0395949427272115, "learning_rate": 7.3393642776737625e-06, "loss": 1.2476, "step": 3700 }, { "epoch": 0.69, "grad_norm": 2.739571762112554, "learning_rate": 7.332889077545558e-06, "loss": 1.1958, "step": 3705 }, { "epoch": 0.69, "grad_norm": 2.65164618555974, "learning_rate": 7.326408872232107e-06, "loss": 1.26, "step": 3710 }, { "epoch": 0.69, "grad_norm": 2.526691816795283, "learning_rate": 7.319923675636615e-06, "loss": 1.2448, "step": 3715 }, { "epoch": 0.69, "grad_norm": 2.77656114577537, "learning_rate": 7.313433501672991e-06, "loss": 1.2472, "step": 3720 }, { "epoch": 0.69, "grad_norm": 2.8625605756818553, "learning_rate": 7.3069383642658245e-06, "loss": 1.2494, "step": 3725 }, { "epoch": 0.7, "grad_norm": 2.560276503666187, "learning_rate": 7.300438277350354e-06, "loss": 1.2785, "step": 3730 }, { "epoch": 0.7, "grad_norm": 2.4990702759200243, "learning_rate": 7.293933254872439e-06, "loss": 1.2291, "step": 3735 }, { "epoch": 0.7, "grad_norm": 3.0835335467596074, "learning_rate": 7.287423310788524e-06, "loss": 1.2341, "step": 3740 }, { "epoch": 0.7, "grad_norm": 2.1231234217056913, "learning_rate": 7.2809084590656174e-06, "loss": 1.2133, "step": 3745 }, { "epoch": 0.7, "grad_norm": 2.5855027096845027, "learning_rate": 7.274388713681252e-06, "loss": 1.2473, "step": 3750 }, { "epoch": 0.7, "grad_norm": 2.3812570368674146, "learning_rate": 7.267864088623465e-06, "loss": 1.1988, "step": 3755 }, { "epoch": 0.7, "grad_norm": 2.396924022356612, "learning_rate": 7.261334597890759e-06, "loss": 1.1919, "step": 3760 }, { "epoch": 0.7, "grad_norm": 2.293490431649686, "learning_rate": 7.254800255492078e-06, "loss": 1.196, "step": 3765 }, { "epoch": 0.7, "grad_norm": 3.116322647421966, "learning_rate": 7.248261075446773e-06, "loss": 1.2607, "step": 3770 }, { "epoch": 0.7, "grad_norm": 2.511617024768492, "learning_rate": 7.241717071784576e-06, "loss": 1.2038, "step": 3775 }, { "epoch": 0.7, "grad_norm": 2.4100645815926804, "learning_rate": 7.235168258545569e-06, "loss": 1.213, "step": 3780 }, { "epoch": 0.71, "grad_norm": 2.699345731526474, "learning_rate": 7.228614649780151e-06, "loss": 1.2615, "step": 3785 }, { "epoch": 0.71, "grad_norm": 2.8315172335251644, "learning_rate": 7.222056259549009e-06, "loss": 1.2952, "step": 3790 }, { "epoch": 0.71, "grad_norm": 2.671653052341253, "learning_rate": 7.21549310192309e-06, "loss": 1.2385, "step": 3795 }, { "epoch": 0.71, "grad_norm": 2.4881987224843463, "learning_rate": 7.208925190983571e-06, "loss": 1.1953, "step": 3800 }, { "epoch": 0.71, "grad_norm": 5.397308968654964, "learning_rate": 7.202352540821823e-06, "loss": 1.188, "step": 3805 }, { "epoch": 0.71, "grad_norm": 2.315595168564112, "learning_rate": 7.195775165539389e-06, "loss": 1.1864, "step": 3810 }, { "epoch": 0.71, "grad_norm": 2.7725671060957984, "learning_rate": 7.1891930792479466e-06, "loss": 1.2379, "step": 3815 }, { "epoch": 0.71, "grad_norm": 2.5781468482007175, "learning_rate": 7.182606296069282e-06, "loss": 1.2162, "step": 3820 }, { "epoch": 0.71, "grad_norm": 2.3949068618768985, "learning_rate": 7.176014830135262e-06, "loss": 1.2382, "step": 3825 }, { "epoch": 0.71, "grad_norm": 2.306974962429036, "learning_rate": 7.169418695587791e-06, "loss": 1.209, "step": 3830 }, { "epoch": 0.72, "grad_norm": 2.4234118244741993, "learning_rate": 7.1628179065788e-06, "loss": 1.2339, "step": 3835 }, { "epoch": 0.72, "grad_norm": 2.6049010934618946, "learning_rate": 7.1562124772702014e-06, "loss": 1.2113, "step": 3840 }, { "epoch": 0.72, "grad_norm": 2.320735650008491, "learning_rate": 7.1496024218338645e-06, "loss": 1.2364, "step": 3845 }, { "epoch": 0.72, "grad_norm": 2.483396009383728, "learning_rate": 7.14298775445158e-06, "loss": 1.2469, "step": 3850 }, { "epoch": 0.72, "grad_norm": 2.518323908064992, "learning_rate": 7.13636848931504e-06, "loss": 1.2442, "step": 3855 }, { "epoch": 0.72, "grad_norm": 3.17206675979895, "learning_rate": 7.129744640625793e-06, "loss": 1.2622, "step": 3860 }, { "epoch": 0.72, "grad_norm": 2.9675291147438307, "learning_rate": 7.123116222595233e-06, "loss": 1.1703, "step": 3865 }, { "epoch": 0.72, "grad_norm": 2.2495951739460156, "learning_rate": 7.116483249444543e-06, "loss": 1.2185, "step": 3870 }, { "epoch": 0.72, "grad_norm": 2.4475476932021563, "learning_rate": 7.109845735404693e-06, "loss": 1.2649, "step": 3875 }, { "epoch": 0.72, "grad_norm": 2.453807043549154, "learning_rate": 7.103203694716387e-06, "loss": 1.1565, "step": 3880 }, { "epoch": 0.72, "grad_norm": 2.8207713989621284, "learning_rate": 7.0965571416300394e-06, "loss": 1.2525, "step": 3885 }, { "epoch": 0.73, "grad_norm": 3.276570824599666, "learning_rate": 7.089906090405754e-06, "loss": 1.2386, "step": 3890 }, { "epoch": 0.73, "grad_norm": 3.0267611954698816, "learning_rate": 7.083250555313276e-06, "loss": 1.1895, "step": 3895 }, { "epoch": 0.73, "grad_norm": 2.4575239109756386, "learning_rate": 7.076590550631979e-06, "loss": 1.2019, "step": 3900 }, { "epoch": 0.73, "grad_norm": 2.567815585392051, "learning_rate": 7.069926090650821e-06, "loss": 1.2415, "step": 3905 }, { "epoch": 0.73, "grad_norm": 2.470581883290022, "learning_rate": 7.063257189668321e-06, "loss": 1.235, "step": 3910 }, { "epoch": 0.73, "grad_norm": 2.786238810744643, "learning_rate": 7.056583861992523e-06, "loss": 1.2203, "step": 3915 }, { "epoch": 0.73, "grad_norm": 2.1595794451729184, "learning_rate": 7.049906121940974e-06, "loss": 1.2336, "step": 3920 }, { "epoch": 0.73, "grad_norm": 2.7786181362023474, "learning_rate": 7.043223983840681e-06, "loss": 1.2226, "step": 3925 }, { "epoch": 0.73, "grad_norm": 2.2829956340398945, "learning_rate": 7.036537462028093e-06, "loss": 1.2451, "step": 3930 }, { "epoch": 0.73, "grad_norm": 2.425946589100039, "learning_rate": 7.029846570849059e-06, "loss": 1.1942, "step": 3935 }, { "epoch": 0.73, "grad_norm": 3.3968924559272717, "learning_rate": 7.023151324658807e-06, "loss": 1.2156, "step": 3940 }, { "epoch": 0.74, "grad_norm": 2.162913532948155, "learning_rate": 7.016451737821907e-06, "loss": 1.2059, "step": 3945 }, { "epoch": 0.74, "grad_norm": 2.0812857304137697, "learning_rate": 7.009747824712238e-06, "loss": 1.1948, "step": 3950 }, { "epoch": 0.74, "grad_norm": 2.708378845612042, "learning_rate": 7.003039599712968e-06, "loss": 1.1979, "step": 3955 }, { "epoch": 0.74, "grad_norm": 2.391502232822183, "learning_rate": 6.996327077216509e-06, "loss": 1.2545, "step": 3960 }, { "epoch": 0.74, "grad_norm": 2.788548039051949, "learning_rate": 6.989610271624501e-06, "loss": 1.2301, "step": 3965 }, { "epoch": 0.74, "grad_norm": 2.166345384579877, "learning_rate": 6.982889197347764e-06, "loss": 1.246, "step": 3970 }, { "epoch": 0.74, "grad_norm": 2.3258307219501893, "learning_rate": 6.9761638688062864e-06, "loss": 1.1612, "step": 3975 }, { "epoch": 0.74, "grad_norm": 2.430041825610154, "learning_rate": 6.969434300429176e-06, "loss": 1.2322, "step": 3980 }, { "epoch": 0.74, "grad_norm": 2.4302309246907416, "learning_rate": 6.962700506654641e-06, "loss": 1.1925, "step": 3985 }, { "epoch": 0.74, "grad_norm": 2.444079835086675, "learning_rate": 6.9559625019299535e-06, "loss": 1.2203, "step": 3990 }, { "epoch": 0.75, "grad_norm": 2.7073060094127217, "learning_rate": 6.949220300711422e-06, "loss": 1.2291, "step": 3995 }, { "epoch": 0.75, "grad_norm": 2.8700411507881056, "learning_rate": 6.942473917464359e-06, "loss": 1.2168, "step": 4000 }, { "epoch": 0.75, "grad_norm": 2.719479119750876, "learning_rate": 6.9357233666630445e-06, "loss": 1.2352, "step": 4005 }, { "epoch": 0.75, "grad_norm": 2.3178114215528183, "learning_rate": 6.9289686627907045e-06, "loss": 1.2019, "step": 4010 }, { "epoch": 0.75, "grad_norm": 2.5183702507345362, "learning_rate": 6.9222098203394764e-06, "loss": 1.2316, "step": 4015 }, { "epoch": 0.75, "grad_norm": 2.345595014646884, "learning_rate": 6.915446853810373e-06, "loss": 1.1935, "step": 4020 }, { "epoch": 0.75, "grad_norm": 2.75761648382353, "learning_rate": 6.908679777713256e-06, "loss": 1.189, "step": 4025 }, { "epoch": 0.75, "grad_norm": 2.4061222491813727, "learning_rate": 6.90190860656681e-06, "loss": 1.1825, "step": 4030 }, { "epoch": 0.75, "grad_norm": 2.5927441284141244, "learning_rate": 6.8951333548984935e-06, "loss": 1.1896, "step": 4035 }, { "epoch": 0.75, "grad_norm": 2.606025902673968, "learning_rate": 6.888354037244533e-06, "loss": 1.1845, "step": 4040 }, { "epoch": 0.75, "grad_norm": 2.503320901473635, "learning_rate": 6.881570668149868e-06, "loss": 1.1908, "step": 4045 }, { "epoch": 0.76, "grad_norm": 2.580339195095967, "learning_rate": 6.874783262168138e-06, "loss": 1.1778, "step": 4050 }, { "epoch": 0.76, "grad_norm": 2.490610410423907, "learning_rate": 6.867991833861638e-06, "loss": 1.2201, "step": 4055 }, { "epoch": 0.76, "grad_norm": 2.704097620844027, "learning_rate": 6.861196397801297e-06, "loss": 1.1642, "step": 4060 }, { "epoch": 0.76, "grad_norm": 2.438950075594202, "learning_rate": 6.854396968566639e-06, "loss": 1.2412, "step": 4065 }, { "epoch": 0.76, "grad_norm": 2.3645119920317783, "learning_rate": 6.847593560745758e-06, "loss": 1.2257, "step": 4070 }, { "epoch": 0.76, "grad_norm": 2.5375149761967726, "learning_rate": 6.840786188935282e-06, "loss": 1.2149, "step": 4075 }, { "epoch": 0.76, "grad_norm": 2.911196562829905, "learning_rate": 6.833974867740347e-06, "loss": 1.1989, "step": 4080 }, { "epoch": 0.76, "grad_norm": 2.5478507571563096, "learning_rate": 6.827159611774557e-06, "loss": 1.2376, "step": 4085 }, { "epoch": 0.76, "grad_norm": 2.5288802803835604, "learning_rate": 6.820340435659962e-06, "loss": 1.2257, "step": 4090 }, { "epoch": 0.76, "grad_norm": 2.998542518592259, "learning_rate": 6.813517354027025e-06, "loss": 1.1785, "step": 4095 }, { "epoch": 0.76, "grad_norm": 2.3206933604689834, "learning_rate": 6.806690381514581e-06, "loss": 1.2461, "step": 4100 }, { "epoch": 0.77, "grad_norm": 2.5445568697444902, "learning_rate": 6.799859532769818e-06, "loss": 1.256, "step": 4105 }, { "epoch": 0.77, "grad_norm": 2.608304464852376, "learning_rate": 6.7930248224482385e-06, "loss": 1.1701, "step": 4110 }, { "epoch": 0.77, "grad_norm": 2.659114117064667, "learning_rate": 6.786186265213633e-06, "loss": 1.1906, "step": 4115 }, { "epoch": 0.77, "grad_norm": 2.545472949009353, "learning_rate": 6.779343875738042e-06, "loss": 1.1745, "step": 4120 }, { "epoch": 0.77, "grad_norm": 2.120427823947941, "learning_rate": 6.772497668701727e-06, "loss": 1.1989, "step": 4125 }, { "epoch": 0.77, "grad_norm": 2.310856981052136, "learning_rate": 6.7656476587931455e-06, "loss": 1.2058, "step": 4130 }, { "epoch": 0.77, "grad_norm": 2.454517067939305, "learning_rate": 6.758793860708908e-06, "loss": 1.2168, "step": 4135 }, { "epoch": 0.77, "grad_norm": 2.2246318729776347, "learning_rate": 6.751936289153759e-06, "loss": 1.2466, "step": 4140 }, { "epoch": 0.77, "grad_norm": 2.337677050118351, "learning_rate": 6.7450749588405294e-06, "loss": 1.2283, "step": 4145 }, { "epoch": 0.77, "grad_norm": 2.6016517303715685, "learning_rate": 6.7382098844901254e-06, "loss": 1.2219, "step": 4150 }, { "epoch": 0.77, "grad_norm": 2.39640045650889, "learning_rate": 6.731341080831478e-06, "loss": 1.1366, "step": 4155 }, { "epoch": 0.78, "grad_norm": 2.332603496703322, "learning_rate": 6.724468562601526e-06, "loss": 1.2843, "step": 4160 }, { "epoch": 0.78, "grad_norm": 2.599943943825198, "learning_rate": 6.717592344545168e-06, "loss": 1.242, "step": 4165 }, { "epoch": 0.78, "grad_norm": 2.4915493854484976, "learning_rate": 6.710712441415252e-06, "loss": 1.1962, "step": 4170 }, { "epoch": 0.78, "grad_norm": 2.1027225494150255, "learning_rate": 6.7038288679725224e-06, "loss": 1.2139, "step": 4175 }, { "epoch": 0.78, "grad_norm": 2.356586756973305, "learning_rate": 6.6969416389856065e-06, "loss": 1.2431, "step": 4180 }, { "epoch": 0.78, "grad_norm": 2.156520158907269, "learning_rate": 6.690050769230968e-06, "loss": 1.1878, "step": 4185 }, { "epoch": 0.78, "grad_norm": 2.5483349132387425, "learning_rate": 6.683156273492884e-06, "loss": 1.1834, "step": 4190 }, { "epoch": 0.78, "grad_norm": 2.283013595177182, "learning_rate": 6.676258166563414e-06, "loss": 1.259, "step": 4195 }, { "epoch": 0.78, "grad_norm": 2.4877649502496584, "learning_rate": 6.6693564632423626e-06, "loss": 1.1508, "step": 4200 }, { "epoch": 0.78, "grad_norm": 2.4403429188173584, "learning_rate": 6.6624511783372494e-06, "loss": 1.2324, "step": 4205 }, { "epoch": 0.79, "grad_norm": 2.5999418377938555, "learning_rate": 6.655542326663278e-06, "loss": 1.2015, "step": 4210 }, { "epoch": 0.79, "grad_norm": 2.3817244646066533, "learning_rate": 6.648629923043308e-06, "loss": 1.1421, "step": 4215 }, { "epoch": 0.79, "grad_norm": 2.7861597820205075, "learning_rate": 6.641713982307819e-06, "loss": 1.2487, "step": 4220 }, { "epoch": 0.79, "grad_norm": 2.604737390676733, "learning_rate": 6.634794519294876e-06, "loss": 1.1909, "step": 4225 }, { "epoch": 0.79, "grad_norm": 2.1490298175211113, "learning_rate": 6.627871548850101e-06, "loss": 1.2128, "step": 4230 }, { "epoch": 0.79, "grad_norm": 2.7478682969064128, "learning_rate": 6.620945085826648e-06, "loss": 1.2724, "step": 4235 }, { "epoch": 0.79, "grad_norm": 2.4473416505881747, "learning_rate": 6.614015145085157e-06, "loss": 1.1413, "step": 4240 }, { "epoch": 0.79, "grad_norm": 2.664153209181916, "learning_rate": 6.607081741493731e-06, "loss": 1.1926, "step": 4245 }, { "epoch": 0.79, "grad_norm": 2.664772133407704, "learning_rate": 6.6001448899279045e-06, "loss": 1.2905, "step": 4250 }, { "epoch": 0.79, "grad_norm": 2.463824076819037, "learning_rate": 6.593204605270608e-06, "loss": 1.1638, "step": 4255 }, { "epoch": 0.79, "grad_norm": 2.754141316699185, "learning_rate": 6.58626090241214e-06, "loss": 1.2046, "step": 4260 }, { "epoch": 0.8, "grad_norm": 2.2989864973374483, "learning_rate": 6.579313796250127e-06, "loss": 1.1798, "step": 4265 }, { "epoch": 0.8, "grad_norm": 2.560534271429197, "learning_rate": 6.572363301689504e-06, "loss": 1.2023, "step": 4270 }, { "epoch": 0.8, "grad_norm": 2.278717838505588, "learning_rate": 6.565409433642471e-06, "loss": 1.1969, "step": 4275 }, { "epoch": 0.8, "grad_norm": 2.4833739605923917, "learning_rate": 6.55845220702847e-06, "loss": 1.1965, "step": 4280 }, { "epoch": 0.8, "grad_norm": 2.430188408103723, "learning_rate": 6.551491636774145e-06, "loss": 1.2309, "step": 4285 }, { "epoch": 0.8, "grad_norm": 2.6975259952464294, "learning_rate": 6.544527737813315e-06, "loss": 1.192, "step": 4290 }, { "epoch": 0.8, "grad_norm": 2.4624516425401946, "learning_rate": 6.5375605250869434e-06, "loss": 1.2492, "step": 4295 }, { "epoch": 0.8, "grad_norm": 2.562999083675579, "learning_rate": 6.530590013543096e-06, "loss": 1.1914, "step": 4300 }, { "epoch": 0.8, "grad_norm": 2.1022978731614415, "learning_rate": 6.523616218136924e-06, "loss": 1.1834, "step": 4305 }, { "epoch": 0.8, "grad_norm": 2.2686047736925516, "learning_rate": 6.51663915383062e-06, "loss": 1.2209, "step": 4310 }, { "epoch": 0.8, "grad_norm": 2.5183595374337884, "learning_rate": 6.509658835593391e-06, "loss": 1.1786, "step": 4315 }, { "epoch": 0.81, "grad_norm": 2.4391880597280107, "learning_rate": 6.502675278401426e-06, "loss": 1.1716, "step": 4320 }, { "epoch": 0.81, "grad_norm": 2.484001041771429, "learning_rate": 6.495688497237861e-06, "loss": 1.2571, "step": 4325 }, { "epoch": 0.81, "grad_norm": 2.2961665977309793, "learning_rate": 6.48869850709275e-06, "loss": 1.1841, "step": 4330 }, { "epoch": 0.81, "grad_norm": 2.816933957342352, "learning_rate": 6.481705322963035e-06, "loss": 1.1983, "step": 4335 }, { "epoch": 0.81, "grad_norm": 2.417764873329051, "learning_rate": 6.474708959852504e-06, "loss": 1.1827, "step": 4340 }, { "epoch": 0.81, "grad_norm": 2.30617051197943, "learning_rate": 6.467709432771772e-06, "loss": 1.2341, "step": 4345 }, { "epoch": 0.81, "grad_norm": 2.4526028616620983, "learning_rate": 6.4607067567382385e-06, "loss": 1.2723, "step": 4350 }, { "epoch": 0.81, "grad_norm": 3.2024751951646775, "learning_rate": 6.4537009467760605e-06, "loss": 1.2418, "step": 4355 }, { "epoch": 0.81, "grad_norm": 2.334451571090072, "learning_rate": 6.446692017916118e-06, "loss": 1.1688, "step": 4360 }, { "epoch": 0.81, "grad_norm": 2.6791856005998147, "learning_rate": 6.4396799851959815e-06, "loss": 1.1688, "step": 4365 }, { "epoch": 0.81, "grad_norm": 3.4087530077465655, "learning_rate": 6.432664863659886e-06, "loss": 1.229, "step": 4370 }, { "epoch": 0.82, "grad_norm": 2.4259591534025837, "learning_rate": 6.425646668358686e-06, "loss": 1.2277, "step": 4375 }, { "epoch": 0.82, "grad_norm": 2.495323027270627, "learning_rate": 6.418625414349837e-06, "loss": 1.2399, "step": 4380 }, { "epoch": 0.82, "grad_norm": 2.896598795176137, "learning_rate": 6.411601116697352e-06, "loss": 1.2328, "step": 4385 }, { "epoch": 0.82, "grad_norm": 2.458597929411267, "learning_rate": 6.404573790471778e-06, "loss": 1.197, "step": 4390 }, { "epoch": 0.82, "grad_norm": 2.4345585639841105, "learning_rate": 6.39754345075016e-06, "loss": 1.268, "step": 4395 }, { "epoch": 0.82, "grad_norm": 3.1164297658059166, "learning_rate": 6.390510112616004e-06, "loss": 1.2478, "step": 4400 }, { "epoch": 0.82, "grad_norm": 2.3126599056846207, "learning_rate": 6.383473791159252e-06, "loss": 1.1894, "step": 4405 }, { "epoch": 0.82, "grad_norm": 2.6416467416762117, "learning_rate": 6.376434501476246e-06, "loss": 1.2318, "step": 4410 }, { "epoch": 0.82, "grad_norm": 2.5167282692335986, "learning_rate": 6.3693922586697e-06, "loss": 1.1838, "step": 4415 }, { "epoch": 0.82, "grad_norm": 2.4469171349202745, "learning_rate": 6.362347077848656e-06, "loss": 1.1867, "step": 4420 }, { "epoch": 0.83, "grad_norm": 2.7546951334155323, "learning_rate": 6.355298974128467e-06, "loss": 1.2411, "step": 4425 }, { "epoch": 0.83, "grad_norm": 2.2263388765183314, "learning_rate": 6.348247962630752e-06, "loss": 1.2784, "step": 4430 }, { "epoch": 0.83, "grad_norm": 2.4165973613827636, "learning_rate": 6.341194058483375e-06, "loss": 1.1628, "step": 4435 }, { "epoch": 0.83, "grad_norm": 2.5473458867918954, "learning_rate": 6.3341372768203965e-06, "loss": 1.199, "step": 4440 }, { "epoch": 0.83, "grad_norm": 2.5383914426106053, "learning_rate": 6.3270776327820606e-06, "loss": 1.1682, "step": 4445 }, { "epoch": 0.83, "grad_norm": 2.4476722364188306, "learning_rate": 6.320015141514744e-06, "loss": 1.227, "step": 4450 }, { "epoch": 0.83, "grad_norm": 2.5620633286885006, "learning_rate": 6.31294981817094e-06, "loss": 1.1892, "step": 4455 }, { "epoch": 0.83, "grad_norm": 2.371789840507076, "learning_rate": 6.305881677909211e-06, "loss": 1.23, "step": 4460 }, { "epoch": 0.83, "grad_norm": 2.316049180608952, "learning_rate": 6.298810735894167e-06, "loss": 1.2697, "step": 4465 }, { "epoch": 0.83, "grad_norm": 2.6693599514255, "learning_rate": 6.29173700729643e-06, "loss": 1.2239, "step": 4470 }, { "epoch": 0.83, "grad_norm": 2.5293838325304963, "learning_rate": 6.284660507292599e-06, "loss": 1.2408, "step": 4475 }, { "epoch": 0.84, "grad_norm": 2.3739300942695007, "learning_rate": 6.277581251065217e-06, "loss": 1.2146, "step": 4480 }, { "epoch": 0.84, "grad_norm": 2.1861207671387173, "learning_rate": 6.270499253802742e-06, "loss": 1.1965, "step": 4485 }, { "epoch": 0.84, "grad_norm": 2.4626377264331505, "learning_rate": 6.263414530699516e-06, "loss": 1.2275, "step": 4490 }, { "epoch": 0.84, "grad_norm": 2.4393045656196857, "learning_rate": 6.256327096955726e-06, "loss": 1.2562, "step": 4495 }, { "epoch": 0.84, "grad_norm": 2.420788009586598, "learning_rate": 6.249236967777373e-06, "loss": 1.2532, "step": 4500 }, { "epoch": 0.84, "grad_norm": 2.3682500570454206, "learning_rate": 6.242144158376245e-06, "loss": 1.2035, "step": 4505 }, { "epoch": 0.84, "grad_norm": 2.617981119739426, "learning_rate": 6.235048683969879e-06, "loss": 1.2266, "step": 4510 }, { "epoch": 0.84, "grad_norm": 2.6458940270866633, "learning_rate": 6.227950559781528e-06, "loss": 1.228, "step": 4515 }, { "epoch": 0.84, "grad_norm": 2.589007218774909, "learning_rate": 6.220849801040133e-06, "loss": 1.2056, "step": 4520 }, { "epoch": 0.84, "grad_norm": 2.877953150570889, "learning_rate": 6.2137464229802835e-06, "loss": 1.2288, "step": 4525 }, { "epoch": 0.84, "grad_norm": 2.8622664846448544, "learning_rate": 6.206640440842193e-06, "loss": 1.2137, "step": 4530 }, { "epoch": 0.85, "grad_norm": 2.633031891709981, "learning_rate": 6.199531869871661e-06, "loss": 1.2116, "step": 4535 }, { "epoch": 0.85, "grad_norm": 2.524845529953313, "learning_rate": 6.192420725320038e-06, "loss": 1.1503, "step": 4540 }, { "epoch": 0.85, "grad_norm": 2.1461215608130306, "learning_rate": 6.185307022444198e-06, "loss": 1.2836, "step": 4545 }, { "epoch": 0.85, "grad_norm": 2.3233940868358087, "learning_rate": 6.178190776506507e-06, "loss": 1.2276, "step": 4550 }, { "epoch": 0.85, "grad_norm": 2.571444746747503, "learning_rate": 6.1710720027747835e-06, "loss": 1.2365, "step": 4555 }, { "epoch": 0.85, "grad_norm": 2.442325328268483, "learning_rate": 6.163950716522268e-06, "loss": 1.199, "step": 4560 }, { "epoch": 0.85, "grad_norm": 2.8973344425831624, "learning_rate": 6.1568269330275974e-06, "loss": 1.2247, "step": 4565 }, { "epoch": 0.85, "grad_norm": 2.265438975600916, "learning_rate": 6.149700667574761e-06, "loss": 1.2674, "step": 4570 }, { "epoch": 0.85, "grad_norm": 2.427805906563997, "learning_rate": 6.142571935453074e-06, "loss": 1.2244, "step": 4575 }, { "epoch": 0.85, "grad_norm": 2.71037186034558, "learning_rate": 6.135440751957147e-06, "loss": 1.2107, "step": 4580 }, { "epoch": 0.86, "grad_norm": 2.47935744731514, "learning_rate": 6.128307132386846e-06, "loss": 1.2215, "step": 4585 }, { "epoch": 0.86, "grad_norm": 2.4534851470879913, "learning_rate": 6.121171092047267e-06, "loss": 1.2313, "step": 4590 }, { "epoch": 0.86, "grad_norm": 2.4955695058785343, "learning_rate": 6.114032646248697e-06, "loss": 1.1893, "step": 4595 }, { "epoch": 0.86, "grad_norm": 2.7299946810100377, "learning_rate": 6.106891810306584e-06, "loss": 1.2553, "step": 4600 }, { "epoch": 0.86, "grad_norm": 2.4519592083822843, "learning_rate": 6.0997485995415065e-06, "loss": 1.1517, "step": 4605 }, { "epoch": 0.86, "grad_norm": 2.8237625939623467, "learning_rate": 6.092603029279136e-06, "loss": 1.2106, "step": 4610 }, { "epoch": 0.86, "grad_norm": 2.2919033004960423, "learning_rate": 6.0854551148502085e-06, "loss": 1.2014, "step": 4615 }, { "epoch": 0.86, "grad_norm": 2.353224396529788, "learning_rate": 6.078304871590485e-06, "loss": 1.2138, "step": 4620 }, { "epoch": 0.86, "grad_norm": 2.5985774084094504, "learning_rate": 6.071152314840726e-06, "loss": 1.2418, "step": 4625 }, { "epoch": 0.86, "grad_norm": 2.6678992574870835, "learning_rate": 6.0639974599466585e-06, "loss": 1.2188, "step": 4630 }, { "epoch": 0.86, "grad_norm": 2.4496687988011967, "learning_rate": 6.056840322258934e-06, "loss": 1.1627, "step": 4635 }, { "epoch": 0.87, "grad_norm": 2.4241155011469644, "learning_rate": 6.049680917133106e-06, "loss": 1.245, "step": 4640 }, { "epoch": 0.87, "grad_norm": 2.381657027399795, "learning_rate": 6.04251925992959e-06, "loss": 1.188, "step": 4645 }, { "epoch": 0.87, "grad_norm": 2.832348257790817, "learning_rate": 6.035355366013638e-06, "loss": 1.207, "step": 4650 }, { "epoch": 0.87, "grad_norm": 2.5691320543151273, "learning_rate": 6.028189250755293e-06, "loss": 1.2605, "step": 4655 }, { "epoch": 0.87, "grad_norm": 2.9545303881123983, "learning_rate": 6.021020929529371e-06, "loss": 1.1689, "step": 4660 }, { "epoch": 0.87, "grad_norm": 2.651237945956081, "learning_rate": 6.013850417715419e-06, "loss": 1.24, "step": 4665 }, { "epoch": 0.87, "grad_norm": 2.674374005358842, "learning_rate": 6.006677730697682e-06, "loss": 1.1851, "step": 4670 }, { "epoch": 0.87, "grad_norm": 2.360244102364902, "learning_rate": 5.999502883865073e-06, "loss": 1.2477, "step": 4675 }, { "epoch": 0.87, "grad_norm": 2.6191630744555634, "learning_rate": 5.992325892611139e-06, "loss": 1.2689, "step": 4680 }, { "epoch": 0.87, "grad_norm": 2.342487040862795, "learning_rate": 5.9851467723340295e-06, "loss": 1.2413, "step": 4685 }, { "epoch": 0.87, "grad_norm": 2.4245312125321803, "learning_rate": 5.977965538436458e-06, "loss": 1.2348, "step": 4690 }, { "epoch": 0.88, "grad_norm": 2.2720239156004025, "learning_rate": 5.9707822063256745e-06, "loss": 1.1965, "step": 4695 }, { "epoch": 0.88, "grad_norm": 2.5605489902324408, "learning_rate": 5.9635967914134316e-06, "loss": 1.2057, "step": 4700 }, { "epoch": 0.88, "grad_norm": 2.55093673845062, "learning_rate": 5.956409309115952e-06, "loss": 1.2399, "step": 4705 }, { "epoch": 0.88, "grad_norm": 2.482655438785984, "learning_rate": 5.94921977485389e-06, "loss": 1.2163, "step": 4710 }, { "epoch": 0.88, "grad_norm": 2.92841769520834, "learning_rate": 5.942028204052304e-06, "loss": 1.1962, "step": 4715 }, { "epoch": 0.88, "grad_norm": 2.403145984699388, "learning_rate": 5.934834612140621e-06, "loss": 1.2121, "step": 4720 }, { "epoch": 0.88, "grad_norm": 2.5665557751164987, "learning_rate": 5.927639014552608e-06, "loss": 1.1943, "step": 4725 }, { "epoch": 0.88, "grad_norm": 2.4345512205304614, "learning_rate": 5.920441426726331e-06, "loss": 1.2182, "step": 4730 }, { "epoch": 0.88, "grad_norm": 2.4570381781809294, "learning_rate": 5.9132418641041256e-06, "loss": 1.1541, "step": 4735 }, { "epoch": 0.88, "grad_norm": 2.42607286783389, "learning_rate": 5.90604034213257e-06, "loss": 1.2285, "step": 4740 }, { "epoch": 0.88, "grad_norm": 2.1059221387409184, "learning_rate": 5.898836876262439e-06, "loss": 1.2695, "step": 4745 }, { "epoch": 0.89, "grad_norm": 2.389763480309045, "learning_rate": 5.891631481948684e-06, "loss": 1.2433, "step": 4750 }, { "epoch": 0.89, "grad_norm": 2.4520356682997058, "learning_rate": 5.88442417465039e-06, "loss": 1.2204, "step": 4755 }, { "epoch": 0.89, "grad_norm": 2.466763311510685, "learning_rate": 5.877214969830746e-06, "loss": 1.1942, "step": 4760 }, { "epoch": 0.89, "grad_norm": 2.456029969725007, "learning_rate": 5.870003882957015e-06, "loss": 1.224, "step": 4765 }, { "epoch": 0.89, "grad_norm": 2.3061090049071393, "learning_rate": 5.862790929500497e-06, "loss": 1.1981, "step": 4770 }, { "epoch": 0.89, "grad_norm": 2.4913818728157646, "learning_rate": 5.855576124936496e-06, "loss": 1.2408, "step": 4775 }, { "epoch": 0.89, "grad_norm": 2.2335777978602813, "learning_rate": 5.848359484744286e-06, "loss": 1.1723, "step": 4780 }, { "epoch": 0.89, "grad_norm": 2.347554354081274, "learning_rate": 5.841141024407083e-06, "loss": 1.2121, "step": 4785 }, { "epoch": 0.89, "grad_norm": 2.3692356900549623, "learning_rate": 5.833920759412006e-06, "loss": 1.2553, "step": 4790 }, { "epoch": 0.89, "grad_norm": 2.5382367232442076, "learning_rate": 5.826698705250047e-06, "loss": 1.2015, "step": 4795 }, { "epoch": 0.9, "grad_norm": 2.2459980708439216, "learning_rate": 5.819474877416031e-06, "loss": 1.2305, "step": 4800 }, { "epoch": 0.9, "grad_norm": 2.546675162324036, "learning_rate": 5.8122492914086e-06, "loss": 1.2121, "step": 4805 }, { "epoch": 0.9, "grad_norm": 2.677752571152061, "learning_rate": 5.805021962730155e-06, "loss": 1.2224, "step": 4810 }, { "epoch": 0.9, "grad_norm": 2.212925811044383, "learning_rate": 5.797792906886847e-06, "loss": 1.1743, "step": 4815 }, { "epoch": 0.9, "grad_norm": 2.6409343572042823, "learning_rate": 5.790562139388522e-06, "loss": 1.1881, "step": 4820 }, { "epoch": 0.9, "grad_norm": 2.3756770154413114, "learning_rate": 5.783329675748711e-06, "loss": 1.2045, "step": 4825 }, { "epoch": 0.9, "grad_norm": 2.443951055189872, "learning_rate": 5.776095531484574e-06, "loss": 1.1974, "step": 4830 }, { "epoch": 0.9, "grad_norm": 2.5850671647243453, "learning_rate": 5.768859722116876e-06, "loss": 1.2254, "step": 4835 }, { "epoch": 0.9, "grad_norm": 2.9605499293438924, "learning_rate": 5.761622263169964e-06, "loss": 1.2438, "step": 4840 }, { "epoch": 0.9, "grad_norm": 2.56592411129834, "learning_rate": 5.754383170171716e-06, "loss": 1.2011, "step": 4845 }, { "epoch": 0.9, "grad_norm": 2.4821775141535216, "learning_rate": 5.747142458653518e-06, "loss": 1.2443, "step": 4850 }, { "epoch": 0.91, "grad_norm": 2.4637437926975387, "learning_rate": 5.7399001441502285e-06, "loss": 1.2813, "step": 4855 }, { "epoch": 0.91, "grad_norm": 2.7814434455793617, "learning_rate": 5.732656242200148e-06, "loss": 1.2608, "step": 4860 }, { "epoch": 0.91, "grad_norm": 2.6231313656942885, "learning_rate": 5.725410768344977e-06, "loss": 1.1879, "step": 4865 }, { "epoch": 0.91, "grad_norm": 2.455722926219591, "learning_rate": 5.718163738129796e-06, "loss": 1.191, "step": 4870 }, { "epoch": 0.91, "grad_norm": 2.4432078663934735, "learning_rate": 5.710915167103015e-06, "loss": 1.2193, "step": 4875 }, { "epoch": 0.91, "grad_norm": 2.424558271609936, "learning_rate": 5.703665070816361e-06, "loss": 1.19, "step": 4880 }, { "epoch": 0.91, "grad_norm": 2.6356126763167746, "learning_rate": 5.696413464824824e-06, "loss": 1.2016, "step": 4885 }, { "epoch": 0.91, "grad_norm": 2.449397519305924, "learning_rate": 5.689160364686637e-06, "loss": 1.1989, "step": 4890 }, { "epoch": 0.91, "grad_norm": 2.459503260258635, "learning_rate": 5.681905785963241e-06, "loss": 1.1441, "step": 4895 }, { "epoch": 0.91, "grad_norm": 2.3464241399602597, "learning_rate": 5.6746497442192425e-06, "loss": 1.2284, "step": 4900 }, { "epoch": 0.91, "grad_norm": 3.1074072930659193, "learning_rate": 5.667392255022396e-06, "loss": 1.2021, "step": 4905 }, { "epoch": 0.92, "grad_norm": 2.7749582202860004, "learning_rate": 5.660133333943552e-06, "loss": 1.1882, "step": 4910 }, { "epoch": 0.92, "grad_norm": 2.9527764645138235, "learning_rate": 5.652872996556643e-06, "loss": 1.2202, "step": 4915 }, { "epoch": 0.92, "grad_norm": 3.3891739848268343, "learning_rate": 5.645611258438629e-06, "loss": 1.2491, "step": 4920 }, { "epoch": 0.92, "grad_norm": 2.7067471875098437, "learning_rate": 5.638348135169486e-06, "loss": 1.1965, "step": 4925 }, { "epoch": 0.92, "grad_norm": 2.3502698720472175, "learning_rate": 5.631083642332154e-06, "loss": 1.1626, "step": 4930 }, { "epoch": 0.92, "grad_norm": 2.6969350244271646, "learning_rate": 5.623817795512515e-06, "loss": 1.2453, "step": 4935 }, { "epoch": 0.92, "grad_norm": 2.400025323822739, "learning_rate": 5.616550610299355e-06, "loss": 1.2092, "step": 4940 }, { "epoch": 0.92, "grad_norm": 2.6363089823661365, "learning_rate": 5.609282102284335e-06, "loss": 1.1759, "step": 4945 }, { "epoch": 0.92, "grad_norm": 2.4198122938368276, "learning_rate": 5.602012287061945e-06, "loss": 1.2302, "step": 4950 }, { "epoch": 0.92, "grad_norm": 2.6128455829596833, "learning_rate": 5.594741180229491e-06, "loss": 1.1694, "step": 4955 }, { "epoch": 0.92, "grad_norm": 2.4408008785514927, "learning_rate": 5.58746879738704e-06, "loss": 1.1767, "step": 4960 }, { "epoch": 0.93, "grad_norm": 2.686534311568624, "learning_rate": 5.580195154137405e-06, "loss": 1.2308, "step": 4965 }, { "epoch": 0.93, "grad_norm": 2.358550700692953, "learning_rate": 5.5729202660860984e-06, "loss": 1.2395, "step": 4970 }, { "epoch": 0.93, "grad_norm": 2.4836834824616965, "learning_rate": 5.565644148841303e-06, "loss": 1.1877, "step": 4975 }, { "epoch": 0.93, "grad_norm": 2.9208432064602214, "learning_rate": 5.558366818013842e-06, "loss": 1.157, "step": 4980 }, { "epoch": 0.93, "grad_norm": 2.864971854169532, "learning_rate": 5.551088289217139e-06, "loss": 1.2356, "step": 4985 }, { "epoch": 0.93, "grad_norm": 2.1570568875709517, "learning_rate": 5.54380857806719e-06, "loss": 1.1621, "step": 4990 }, { "epoch": 0.93, "grad_norm": 2.825445528201766, "learning_rate": 5.536527700182526e-06, "loss": 1.2424, "step": 4995 }, { "epoch": 0.93, "grad_norm": 2.541720844778179, "learning_rate": 5.5292456711841845e-06, "loss": 1.2044, "step": 5000 }, { "epoch": 0.93, "grad_norm": 2.9093417603501726, "learning_rate": 5.521962506695671e-06, "loss": 1.2173, "step": 5005 }, { "epoch": 0.93, "grad_norm": 2.385465021207973, "learning_rate": 5.514678222342922e-06, "loss": 1.1977, "step": 5010 }, { "epoch": 0.94, "grad_norm": 2.647317844287573, "learning_rate": 5.507392833754285e-06, "loss": 1.1785, "step": 5015 }, { "epoch": 0.94, "grad_norm": 2.473551842843763, "learning_rate": 5.5001063565604705e-06, "loss": 1.2015, "step": 5020 }, { "epoch": 0.94, "grad_norm": 2.7241247663506782, "learning_rate": 5.492818806394532e-06, "loss": 1.2281, "step": 5025 }, { "epoch": 0.94, "grad_norm": 2.518896282402336, "learning_rate": 5.485530198891814e-06, "loss": 1.2264, "step": 5030 }, { "epoch": 0.94, "grad_norm": 2.4126046214585672, "learning_rate": 5.47824054968994e-06, "loss": 1.206, "step": 5035 }, { "epoch": 0.94, "grad_norm": 2.724587523366145, "learning_rate": 5.47094987442876e-06, "loss": 1.1847, "step": 5040 }, { "epoch": 0.94, "grad_norm": 2.1906266563183596, "learning_rate": 5.463658188750333e-06, "loss": 1.1167, "step": 5045 }, { "epoch": 0.94, "grad_norm": 2.636133430468642, "learning_rate": 5.456365508298882e-06, "loss": 1.1843, "step": 5050 }, { "epoch": 0.94, "grad_norm": 2.498460580189244, "learning_rate": 5.449071848720762e-06, "loss": 1.1861, "step": 5055 }, { "epoch": 0.94, "grad_norm": 2.6456079430925814, "learning_rate": 5.441777225664434e-06, "loss": 1.2144, "step": 5060 }, { "epoch": 0.94, "grad_norm": 2.635032475206284, "learning_rate": 5.434481654780421e-06, "loss": 1.1949, "step": 5065 }, { "epoch": 0.95, "grad_norm": 2.414349835457233, "learning_rate": 5.427185151721284e-06, "loss": 1.1837, "step": 5070 }, { "epoch": 0.95, "grad_norm": 2.517171242137346, "learning_rate": 5.4198877321415785e-06, "loss": 1.1937, "step": 5075 }, { "epoch": 0.95, "grad_norm": 2.3322860590179593, "learning_rate": 5.412589411697835e-06, "loss": 1.221, "step": 5080 }, { "epoch": 0.95, "grad_norm": 2.954235361194853, "learning_rate": 5.405290206048508e-06, "loss": 1.2411, "step": 5085 }, { "epoch": 0.95, "grad_norm": 2.468404065206049, "learning_rate": 5.397990130853955e-06, "loss": 1.1789, "step": 5090 }, { "epoch": 0.95, "grad_norm": 2.355727263917547, "learning_rate": 5.390689201776399e-06, "loss": 1.1613, "step": 5095 }, { "epoch": 0.95, "grad_norm": 2.2426355580536663, "learning_rate": 5.383387434479895e-06, "loss": 1.208, "step": 5100 }, { "epoch": 0.95, "grad_norm": 2.380680152812773, "learning_rate": 5.376084844630298e-06, "loss": 1.2055, "step": 5105 }, { "epoch": 0.95, "grad_norm": 2.569418503583293, "learning_rate": 5.3687814478952235e-06, "loss": 1.184, "step": 5110 }, { "epoch": 0.95, "grad_norm": 2.526665130298868, "learning_rate": 5.361477259944022e-06, "loss": 1.2064, "step": 5115 }, { "epoch": 0.95, "grad_norm": 2.369294746436644, "learning_rate": 5.35417229644774e-06, "loss": 1.2072, "step": 5120 }, { "epoch": 0.96, "grad_norm": 2.720314631994901, "learning_rate": 5.346866573079088e-06, "loss": 1.2119, "step": 5125 }, { "epoch": 0.96, "grad_norm": 2.3314723731875633, "learning_rate": 5.339560105512406e-06, "loss": 1.1816, "step": 5130 }, { "epoch": 0.96, "grad_norm": 2.26140189167418, "learning_rate": 5.332252909423634e-06, "loss": 1.1715, "step": 5135 }, { "epoch": 0.96, "grad_norm": 2.7679372661959385, "learning_rate": 5.324945000490271e-06, "loss": 1.2743, "step": 5140 }, { "epoch": 0.96, "grad_norm": 2.411111902010626, "learning_rate": 5.317636394391346e-06, "loss": 1.2343, "step": 5145 }, { "epoch": 0.96, "grad_norm": 3.389608636178554, "learning_rate": 5.310327106807387e-06, "loss": 1.1983, "step": 5150 }, { "epoch": 0.96, "grad_norm": 2.5290136942085546, "learning_rate": 5.303017153420382e-06, "loss": 1.2139, "step": 5155 }, { "epoch": 0.96, "grad_norm": 2.5697341004928056, "learning_rate": 5.295706549913746e-06, "loss": 1.1951, "step": 5160 }, { "epoch": 0.96, "grad_norm": 2.6841860442942402, "learning_rate": 5.288395311972291e-06, "loss": 1.21, "step": 5165 }, { "epoch": 0.96, "grad_norm": 2.427048051792136, "learning_rate": 5.281083455282187e-06, "loss": 1.1969, "step": 5170 }, { "epoch": 0.97, "grad_norm": 2.382376197299203, "learning_rate": 5.273770995530938e-06, "loss": 1.1831, "step": 5175 }, { "epoch": 0.97, "grad_norm": 2.5132668182963886, "learning_rate": 5.266457948407336e-06, "loss": 1.2145, "step": 5180 }, { "epoch": 0.97, "grad_norm": 2.3797244199180527, "learning_rate": 5.259144329601434e-06, "loss": 1.1751, "step": 5185 }, { "epoch": 0.97, "grad_norm": 2.3611741368911194, "learning_rate": 5.251830154804515e-06, "loss": 1.2064, "step": 5190 }, { "epoch": 0.97, "grad_norm": 2.3567756326205287, "learning_rate": 5.244515439709048e-06, "loss": 1.2077, "step": 5195 }, { "epoch": 0.97, "grad_norm": 2.224791750500246, "learning_rate": 5.23720020000867e-06, "loss": 1.2123, "step": 5200 }, { "epoch": 0.97, "grad_norm": 2.4136633237187897, "learning_rate": 5.229884451398137e-06, "loss": 1.2132, "step": 5205 }, { "epoch": 0.97, "grad_norm": 2.304370257710667, "learning_rate": 5.2225682095733e-06, "loss": 1.2289, "step": 5210 }, { "epoch": 0.97, "grad_norm": 2.5814542726053635, "learning_rate": 5.215251490231066e-06, "loss": 1.272, "step": 5215 }, { "epoch": 0.97, "grad_norm": 2.3550014377964037, "learning_rate": 5.207934309069369e-06, "loss": 1.2217, "step": 5220 }, { "epoch": 0.97, "grad_norm": 2.4860052514393702, "learning_rate": 5.2006166817871304e-06, "loss": 1.227, "step": 5225 }, { "epoch": 0.98, "grad_norm": 2.463931792592068, "learning_rate": 5.193298624084233e-06, "loss": 1.2201, "step": 5230 }, { "epoch": 0.98, "grad_norm": 2.6186162346655157, "learning_rate": 5.185980151661479e-06, "loss": 1.2194, "step": 5235 }, { "epoch": 0.98, "grad_norm": 2.4191065869581587, "learning_rate": 5.178661280220566e-06, "loss": 1.1958, "step": 5240 }, { "epoch": 0.98, "grad_norm": 2.5133518806542865, "learning_rate": 5.1713420254640405e-06, "loss": 1.1761, "step": 5245 }, { "epoch": 0.98, "grad_norm": 2.325396160641914, "learning_rate": 5.164022403095274e-06, "loss": 1.178, "step": 5250 }, { "epoch": 0.98, "grad_norm": 2.54773375649405, "learning_rate": 5.156702428818431e-06, "loss": 1.2196, "step": 5255 }, { "epoch": 0.98, "grad_norm": 2.4900490503130928, "learning_rate": 5.149382118338426e-06, "loss": 1.2077, "step": 5260 }, { "epoch": 0.98, "grad_norm": 2.252151812435276, "learning_rate": 5.142061487360896e-06, "loss": 1.23, "step": 5265 }, { "epoch": 0.98, "grad_norm": 2.4776213894416377, "learning_rate": 5.134740551592165e-06, "loss": 1.2005, "step": 5270 }, { "epoch": 0.98, "grad_norm": 2.4952786157824662, "learning_rate": 5.127419326739216e-06, "loss": 1.1988, "step": 5275 }, { "epoch": 0.98, "grad_norm": 2.144004636814163, "learning_rate": 5.120097828509645e-06, "loss": 1.2339, "step": 5280 }, { "epoch": 0.99, "grad_norm": 2.3998029555969547, "learning_rate": 5.112776072611638e-06, "loss": 1.1466, "step": 5285 }, { "epoch": 0.99, "grad_norm": 2.37290798802621, "learning_rate": 5.105454074753933e-06, "loss": 1.1743, "step": 5290 }, { "epoch": 0.99, "grad_norm": 2.6835621702851475, "learning_rate": 5.09813185064579e-06, "loss": 1.1698, "step": 5295 }, { "epoch": 0.99, "grad_norm": 2.512506207520888, "learning_rate": 5.09080941599695e-06, "loss": 1.1823, "step": 5300 }, { "epoch": 0.99, "grad_norm": 2.137533406863869, "learning_rate": 5.083486786517608e-06, "loss": 1.1922, "step": 5305 }, { "epoch": 0.99, "grad_norm": 4.104624434210896, "learning_rate": 5.076163977918379e-06, "loss": 1.2323, "step": 5310 }, { "epoch": 0.99, "grad_norm": 2.683564602708311, "learning_rate": 5.068841005910256e-06, "loss": 1.2063, "step": 5315 }, { "epoch": 0.99, "grad_norm": 2.7608580905626847, "learning_rate": 5.061517886204592e-06, "loss": 1.2186, "step": 5320 }, { "epoch": 0.99, "grad_norm": 2.4056499009625663, "learning_rate": 5.054194634513047e-06, "loss": 1.1975, "step": 5325 }, { "epoch": 0.99, "grad_norm": 2.2975592932071107, "learning_rate": 5.046871266547569e-06, "loss": 1.2494, "step": 5330 }, { "epoch": 0.99, "grad_norm": 2.741698022771611, "learning_rate": 5.039547798020358e-06, "loss": 1.235, "step": 5335 }, { "epoch": 1.0, "grad_norm": 2.5059532665114173, "learning_rate": 5.0322242446438265e-06, "loss": 1.2242, "step": 5340 }, { "epoch": 1.0, "grad_norm": 2.52273255127118, "learning_rate": 5.0249006221305675e-06, "loss": 1.2645, "step": 5345 }, { "epoch": 1.0, "grad_norm": 2.513615769567221, "learning_rate": 5.017576946193325e-06, "loss": 1.1787, "step": 5350 }, { "epoch": 1.0, "grad_norm": 2.3227647148793498, "learning_rate": 5.01025323254496e-06, "loss": 1.144, "step": 5355 }, { "epoch": 1.0, "grad_norm": 2.3674913562659863, "learning_rate": 5.002929496898407e-06, "loss": 1.2229, "step": 5360 }, { "epoch": 1.0, "grad_norm": 2.681463697074552, "learning_rate": 4.995605754966652e-06, "loss": 1.0945, "step": 5365 }, { "epoch": 1.0, "grad_norm": 2.3815705086639753, "learning_rate": 4.988282022462699e-06, "loss": 1.0172, "step": 5370 }, { "epoch": 1.0, "grad_norm": 2.419736712855131, "learning_rate": 4.980958315099525e-06, "loss": 1.0478, "step": 5375 }, { "epoch": 1.0, "grad_norm": 2.4222079854149547, "learning_rate": 4.973634648590055e-06, "loss": 1.0388, "step": 5380 }, { "epoch": 1.0, "grad_norm": 2.5561149360584605, "learning_rate": 4.9663110386471245e-06, "loss": 1.026, "step": 5385 }, { "epoch": 1.01, "grad_norm": 2.197462219377321, "learning_rate": 4.958987500983455e-06, "loss": 1.0145, "step": 5390 }, { "epoch": 1.01, "grad_norm": 2.5821217339392653, "learning_rate": 4.951664051311604e-06, "loss": 0.9705, "step": 5395 }, { "epoch": 1.01, "grad_norm": 2.7055799291037737, "learning_rate": 4.944340705343947e-06, "loss": 1.0235, "step": 5400 }, { "epoch": 1.01, "grad_norm": 2.3518618011207413, "learning_rate": 4.9370174787926284e-06, "loss": 0.9809, "step": 5405 }, { "epoch": 1.01, "grad_norm": 2.41358216404054, "learning_rate": 4.929694387369548e-06, "loss": 1.0209, "step": 5410 }, { "epoch": 1.01, "grad_norm": 2.1864823466664203, "learning_rate": 4.922371446786308e-06, "loss": 1.0187, "step": 5415 }, { "epoch": 1.01, "grad_norm": 2.4422953264549574, "learning_rate": 4.915048672754189e-06, "loss": 1.0231, "step": 5420 }, { "epoch": 1.01, "grad_norm": 2.4734873276063487, "learning_rate": 4.907726080984112e-06, "loss": 1.0291, "step": 5425 }, { "epoch": 1.01, "grad_norm": 2.7353098849284017, "learning_rate": 4.900403687186607e-06, "loss": 1.0135, "step": 5430 }, { "epoch": 1.01, "grad_norm": 2.2107790506597755, "learning_rate": 4.893081507071788e-06, "loss": 1.0397, "step": 5435 }, { "epoch": 1.01, "grad_norm": 2.4179695684073854, "learning_rate": 4.8857595563492996e-06, "loss": 1.0093, "step": 5440 }, { "epoch": 1.02, "grad_norm": 2.3001310602664566, "learning_rate": 4.878437850728298e-06, "loss": 1.0419, "step": 5445 }, { "epoch": 1.02, "grad_norm": 2.1598933391908686, "learning_rate": 4.871116405917413e-06, "loss": 1.0052, "step": 5450 }, { "epoch": 1.02, "grad_norm": 2.4913258463261476, "learning_rate": 4.863795237624719e-06, "loss": 1.0391, "step": 5455 }, { "epoch": 1.02, "grad_norm": 2.5724129547310493, "learning_rate": 4.856474361557692e-06, "loss": 1.0534, "step": 5460 }, { "epoch": 1.02, "grad_norm": 2.3554782257142155, "learning_rate": 4.849153793423182e-06, "loss": 1.0422, "step": 5465 }, { "epoch": 1.02, "grad_norm": 2.2282999376603057, "learning_rate": 4.841833548927379e-06, "loss": 1.0285, "step": 5470 }, { "epoch": 1.02, "grad_norm": 2.4711423811860342, "learning_rate": 4.834513643775784e-06, "loss": 0.9753, "step": 5475 }, { "epoch": 1.02, "grad_norm": 2.669156173349385, "learning_rate": 4.827194093673158e-06, "loss": 1.0262, "step": 5480 }, { "epoch": 1.02, "grad_norm": 2.6338258701202966, "learning_rate": 4.81987491432351e-06, "loss": 1.0041, "step": 5485 }, { "epoch": 1.02, "grad_norm": 2.3801846206028876, "learning_rate": 4.812556121430051e-06, "loss": 1.0225, "step": 5490 }, { "epoch": 1.02, "grad_norm": 2.6627376022318034, "learning_rate": 4.805237730695161e-06, "loss": 0.9694, "step": 5495 }, { "epoch": 1.03, "grad_norm": 2.7291035336136678, "learning_rate": 4.7979197578203606e-06, "loss": 1.0077, "step": 5500 }, { "epoch": 1.03, "grad_norm": 2.463838481125147, "learning_rate": 4.790602218506271e-06, "loss": 1.0493, "step": 5505 }, { "epoch": 1.03, "grad_norm": 2.433765492278771, "learning_rate": 4.783285128452584e-06, "loss": 1.0118, "step": 5510 }, { "epoch": 1.03, "grad_norm": 2.607628992304912, "learning_rate": 4.775968503358026e-06, "loss": 1.0695, "step": 5515 }, { "epoch": 1.03, "grad_norm": 2.5295928086957846, "learning_rate": 4.76865235892033e-06, "loss": 1.0166, "step": 5520 }, { "epoch": 1.03, "grad_norm": 2.203736263547884, "learning_rate": 4.761336710836195e-06, "loss": 0.9946, "step": 5525 }, { "epoch": 1.03, "grad_norm": 2.869557388787276, "learning_rate": 4.7540215748012545e-06, "loss": 0.985, "step": 5530 }, { "epoch": 1.03, "grad_norm": 2.531865966904552, "learning_rate": 4.746706966510043e-06, "loss": 0.9826, "step": 5535 }, { "epoch": 1.03, "grad_norm": 2.7954245771234665, "learning_rate": 4.739392901655966e-06, "loss": 1.0198, "step": 5540 }, { "epoch": 1.03, "grad_norm": 2.282486161494159, "learning_rate": 4.732079395931261e-06, "loss": 1.0174, "step": 5545 }, { "epoch": 1.04, "grad_norm": 2.5815196933075706, "learning_rate": 4.724766465026965e-06, "loss": 1.0575, "step": 5550 }, { "epoch": 1.04, "grad_norm": 2.5209180513093745, "learning_rate": 4.717454124632883e-06, "loss": 1.0617, "step": 5555 }, { "epoch": 1.04, "grad_norm": 2.375427874388073, "learning_rate": 4.710142390437551e-06, "loss": 0.9906, "step": 5560 }, { "epoch": 1.04, "grad_norm": 2.6600796567133838, "learning_rate": 4.702831278128209e-06, "loss": 1.03, "step": 5565 }, { "epoch": 1.04, "grad_norm": 2.9237698514498924, "learning_rate": 4.695520803390758e-06, "loss": 1.0186, "step": 5570 }, { "epoch": 1.04, "grad_norm": 2.759992337244157, "learning_rate": 4.688210981909734e-06, "loss": 0.9961, "step": 5575 }, { "epoch": 1.04, "grad_norm": 2.536540198344336, "learning_rate": 4.680901829368268e-06, "loss": 1.075, "step": 5580 }, { "epoch": 1.04, "grad_norm": 2.604142431895161, "learning_rate": 4.673593361448062e-06, "loss": 1.0397, "step": 5585 }, { "epoch": 1.04, "grad_norm": 2.3833711820364543, "learning_rate": 4.666285593829343e-06, "loss": 0.9868, "step": 5590 }, { "epoch": 1.04, "grad_norm": 2.6825074384656213, "learning_rate": 4.658978542190839e-06, "loss": 0.9972, "step": 5595 }, { "epoch": 1.04, "grad_norm": 2.7437235400270397, "learning_rate": 4.651672222209738e-06, "loss": 1.0479, "step": 5600 }, { "epoch": 1.05, "grad_norm": 2.140569839724797, "learning_rate": 4.6443666495616646e-06, "loss": 0.9903, "step": 5605 }, { "epoch": 1.05, "grad_norm": 2.5891585356017837, "learning_rate": 4.637061839920634e-06, "loss": 1.0201, "step": 5610 }, { "epoch": 1.05, "grad_norm": 2.3985815097913132, "learning_rate": 4.62975780895903e-06, "loss": 0.9745, "step": 5615 }, { "epoch": 1.05, "grad_norm": 2.4716522327814774, "learning_rate": 4.622454572347558e-06, "loss": 1.0602, "step": 5620 }, { "epoch": 1.05, "grad_norm": 2.4858109653750677, "learning_rate": 4.615152145755224e-06, "loss": 1.0142, "step": 5625 }, { "epoch": 1.05, "grad_norm": 2.65100239334809, "learning_rate": 4.6078505448492985e-06, "loss": 0.9855, "step": 5630 }, { "epoch": 1.05, "grad_norm": 2.622925290593546, "learning_rate": 4.600549785295278e-06, "loss": 0.9852, "step": 5635 }, { "epoch": 1.05, "grad_norm": 2.3714256985145266, "learning_rate": 4.5932498827568525e-06, "loss": 0.9897, "step": 5640 }, { "epoch": 1.05, "grad_norm": 2.7272449228012237, "learning_rate": 4.585950852895872e-06, "loss": 1.0032, "step": 5645 }, { "epoch": 1.05, "grad_norm": 2.291278250033059, "learning_rate": 4.578652711372321e-06, "loss": 1.0166, "step": 5650 }, { "epoch": 1.05, "grad_norm": 2.2818866479353472, "learning_rate": 4.571355473844271e-06, "loss": 1.0199, "step": 5655 }, { "epoch": 1.06, "grad_norm": 2.81859930508086, "learning_rate": 4.564059155967857e-06, "loss": 1.0655, "step": 5660 }, { "epoch": 1.06, "grad_norm": 2.4198976668946934, "learning_rate": 4.556763773397242e-06, "loss": 0.9985, "step": 5665 }, { "epoch": 1.06, "grad_norm": 2.6075117504033107, "learning_rate": 4.549469341784578e-06, "loss": 1.0957, "step": 5670 }, { "epoch": 1.06, "grad_norm": 2.394284879916922, "learning_rate": 4.542175876779982e-06, "loss": 1.0545, "step": 5675 }, { "epoch": 1.06, "grad_norm": 2.4984055889771932, "learning_rate": 4.5348833940314945e-06, "loss": 1.0149, "step": 5680 }, { "epoch": 1.06, "grad_norm": 2.2787926703782895, "learning_rate": 4.527591909185049e-06, "loss": 0.9748, "step": 5685 }, { "epoch": 1.06, "grad_norm": 2.3430908487524498, "learning_rate": 4.520301437884436e-06, "loss": 1.0172, "step": 5690 }, { "epoch": 1.06, "grad_norm": 2.555392414885646, "learning_rate": 4.5130119957712746e-06, "loss": 1.1048, "step": 5695 }, { "epoch": 1.06, "grad_norm": 2.40399252374117, "learning_rate": 4.5057235984849765e-06, "loss": 0.9381, "step": 5700 }, { "epoch": 1.06, "grad_norm": 2.507638615828214, "learning_rate": 4.498436261662707e-06, "loss": 1.0001, "step": 5705 }, { "epoch": 1.06, "grad_norm": 2.955757388241879, "learning_rate": 4.491150000939358e-06, "loss": 1.0098, "step": 5710 }, { "epoch": 1.07, "grad_norm": 2.3178496259731642, "learning_rate": 4.483864831947518e-06, "loss": 1.06, "step": 5715 }, { "epoch": 1.07, "grad_norm": 2.861349825430097, "learning_rate": 4.4765807703174256e-06, "loss": 1.0323, "step": 5720 }, { "epoch": 1.07, "grad_norm": 2.4567917565647823, "learning_rate": 4.469297831676947e-06, "loss": 1.0381, "step": 5725 }, { "epoch": 1.07, "grad_norm": 2.3765296344065683, "learning_rate": 4.4620160316515395e-06, "loss": 1.001, "step": 5730 }, { "epoch": 1.07, "grad_norm": 2.851117308902495, "learning_rate": 4.454735385864215e-06, "loss": 1.0239, "step": 5735 }, { "epoch": 1.07, "grad_norm": 2.2475477463435363, "learning_rate": 4.447455909935513e-06, "loss": 1.006, "step": 5740 }, { "epoch": 1.07, "grad_norm": 2.3691317072426736, "learning_rate": 4.4401776194834615e-06, "loss": 0.9884, "step": 5745 }, { "epoch": 1.07, "grad_norm": 2.7912385165600986, "learning_rate": 4.432900530123543e-06, "loss": 1.0358, "step": 5750 }, { "epoch": 1.07, "grad_norm": 2.532523308925846, "learning_rate": 4.425624657468662e-06, "loss": 0.9825, "step": 5755 }, { "epoch": 1.07, "grad_norm": 2.5901674352213346, "learning_rate": 4.41835001712912e-06, "loss": 0.9793, "step": 5760 }, { "epoch": 1.08, "grad_norm": 2.526653070020505, "learning_rate": 4.411076624712567e-06, "loss": 1.0093, "step": 5765 }, { "epoch": 1.08, "grad_norm": 2.5107555130617225, "learning_rate": 4.403804495823979e-06, "loss": 1.0016, "step": 5770 }, { "epoch": 1.08, "grad_norm": 2.3247381049982936, "learning_rate": 4.396533646065618e-06, "loss": 1.0014, "step": 5775 }, { "epoch": 1.08, "grad_norm": 2.750111283951606, "learning_rate": 4.3892640910370085e-06, "loss": 1.0422, "step": 5780 }, { "epoch": 1.08, "grad_norm": 2.585259483582809, "learning_rate": 4.381995846334891e-06, "loss": 1.0446, "step": 5785 }, { "epoch": 1.08, "grad_norm": 2.8484814547628146, "learning_rate": 4.374728927553195e-06, "loss": 1.0186, "step": 5790 }, { "epoch": 1.08, "grad_norm": 2.5484297678172694, "learning_rate": 4.367463350283008e-06, "loss": 1.033, "step": 5795 }, { "epoch": 1.08, "grad_norm": 3.2517629474206915, "learning_rate": 4.3601991301125345e-06, "loss": 1.0305, "step": 5800 }, { "epoch": 1.08, "grad_norm": 2.317879031880154, "learning_rate": 4.352936282627076e-06, "loss": 1.009, "step": 5805 }, { "epoch": 1.08, "grad_norm": 2.873136359968325, "learning_rate": 4.345674823408983e-06, "loss": 1.0396, "step": 5810 }, { "epoch": 1.08, "grad_norm": 2.6184270537855627, "learning_rate": 4.338414768037625e-06, "loss": 1.0268, "step": 5815 }, { "epoch": 1.09, "grad_norm": 2.5891458972953485, "learning_rate": 4.3311561320893635e-06, "loss": 1.0537, "step": 5820 }, { "epoch": 1.09, "grad_norm": 2.3900157941023097, "learning_rate": 4.3238989311375165e-06, "loss": 0.9919, "step": 5825 }, { "epoch": 1.09, "grad_norm": 2.188987069441718, "learning_rate": 4.316643180752321e-06, "loss": 0.9548, "step": 5830 }, { "epoch": 1.09, "grad_norm": 2.7746815671456826, "learning_rate": 4.309388896500899e-06, "loss": 1.0482, "step": 5835 }, { "epoch": 1.09, "grad_norm": 2.6451781748141037, "learning_rate": 4.3021360939472296e-06, "loss": 1.0075, "step": 5840 }, { "epoch": 1.09, "grad_norm": 2.7182175432694016, "learning_rate": 4.294884788652111e-06, "loss": 1.0218, "step": 5845 }, { "epoch": 1.09, "grad_norm": 2.7398738561495937, "learning_rate": 4.2876349961731375e-06, "loss": 1.031, "step": 5850 }, { "epoch": 1.09, "grad_norm": 2.8602106485650522, "learning_rate": 4.280386732064645e-06, "loss": 0.9569, "step": 5855 }, { "epoch": 1.09, "grad_norm": 2.5471621565131817, "learning_rate": 4.273140011877698e-06, "loss": 1.0346, "step": 5860 }, { "epoch": 1.09, "grad_norm": 2.599772140063331, "learning_rate": 4.265894851160047e-06, "loss": 1.0129, "step": 5865 }, { "epoch": 1.09, "grad_norm": 2.5023497756964166, "learning_rate": 4.258651265456096e-06, "loss": 0.9783, "step": 5870 }, { "epoch": 1.1, "grad_norm": 2.3315837831174955, "learning_rate": 4.251409270306872e-06, "loss": 1.0385, "step": 5875 }, { "epoch": 1.1, "grad_norm": 2.2339673161919738, "learning_rate": 4.244168881249986e-06, "loss": 0.9796, "step": 5880 }, { "epoch": 1.1, "grad_norm": 2.9198639199308145, "learning_rate": 4.236930113819605e-06, "loss": 1.0418, "step": 5885 }, { "epoch": 1.1, "grad_norm": 2.2996710537906098, "learning_rate": 4.229692983546418e-06, "loss": 0.992, "step": 5890 }, { "epoch": 1.1, "grad_norm": 2.2304355168240186, "learning_rate": 4.2224575059576e-06, "loss": 1.0011, "step": 5895 }, { "epoch": 1.1, "grad_norm": 2.5260445789366557, "learning_rate": 4.215223696576781e-06, "loss": 0.99, "step": 5900 }, { "epoch": 1.1, "grad_norm": 2.465790598030769, "learning_rate": 4.2079915709240095e-06, "loss": 1.0134, "step": 5905 }, { "epoch": 1.1, "grad_norm": 2.653095522617236, "learning_rate": 4.200761144515724e-06, "loss": 0.9633, "step": 5910 }, { "epoch": 1.1, "grad_norm": 2.411929291599091, "learning_rate": 4.193532432864718e-06, "loss": 1.0315, "step": 5915 }, { "epoch": 1.1, "grad_norm": 2.2292387604409547, "learning_rate": 4.186305451480104e-06, "loss": 1.0199, "step": 5920 }, { "epoch": 1.1, "grad_norm": 2.7973446623576033, "learning_rate": 4.179080215867282e-06, "loss": 0.9596, "step": 5925 }, { "epoch": 1.11, "grad_norm": 2.4745090701678882, "learning_rate": 4.1718567415279085e-06, "loss": 0.8889, "step": 5930 }, { "epoch": 1.11, "grad_norm": 2.428997596511232, "learning_rate": 4.164635043959861e-06, "loss": 0.9836, "step": 5935 }, { "epoch": 1.11, "grad_norm": 2.439369458922769, "learning_rate": 4.157415138657203e-06, "loss": 1.0524, "step": 5940 }, { "epoch": 1.11, "grad_norm": 2.879991571996203, "learning_rate": 4.150197041110154e-06, "loss": 1.0329, "step": 5945 }, { "epoch": 1.11, "grad_norm": 2.441256974326701, "learning_rate": 4.142980766805055e-06, "loss": 0.9827, "step": 5950 }, { "epoch": 1.11, "grad_norm": 2.2621481444945166, "learning_rate": 4.135766331224334e-06, "loss": 1.0164, "step": 5955 }, { "epoch": 1.11, "grad_norm": 2.4153632637114977, "learning_rate": 4.128553749846477e-06, "loss": 1.0018, "step": 5960 }, { "epoch": 1.11, "grad_norm": 2.4291621713233456, "learning_rate": 4.12134303814599e-06, "loss": 1.0382, "step": 5965 }, { "epoch": 1.11, "grad_norm": 2.4020903183200004, "learning_rate": 4.114134211593366e-06, "loss": 1.0146, "step": 5970 }, { "epoch": 1.11, "grad_norm": 2.392997196335174, "learning_rate": 4.106927285655052e-06, "loss": 1.0068, "step": 5975 }, { "epoch": 1.12, "grad_norm": 2.3982209610639917, "learning_rate": 4.099722275793427e-06, "loss": 0.9805, "step": 5980 }, { "epoch": 1.12, "grad_norm": 2.4610808257505177, "learning_rate": 4.0925191974667485e-06, "loss": 0.9476, "step": 5985 }, { "epoch": 1.12, "grad_norm": 2.410262471073752, "learning_rate": 4.085318066129137e-06, "loss": 0.9853, "step": 5990 }, { "epoch": 1.12, "grad_norm": 2.520389162835538, "learning_rate": 4.0781188972305245e-06, "loss": 1.0039, "step": 5995 }, { "epoch": 1.12, "grad_norm": 2.484693819518564, "learning_rate": 4.07092170621665e-06, "loss": 0.9892, "step": 6000 }, { "epoch": 1.12, "grad_norm": 2.5500391170036405, "learning_rate": 4.063726508528995e-06, "loss": 1.0229, "step": 6005 }, { "epoch": 1.12, "grad_norm": 2.620556442354383, "learning_rate": 4.05653331960477e-06, "loss": 1.0512, "step": 6010 }, { "epoch": 1.12, "grad_norm": 2.8079283170008646, "learning_rate": 4.049342154876871e-06, "loss": 0.9743, "step": 6015 }, { "epoch": 1.12, "grad_norm": 2.2753916987815677, "learning_rate": 4.042153029773861e-06, "loss": 0.9979, "step": 6020 }, { "epoch": 1.12, "grad_norm": 3.294635777323174, "learning_rate": 4.034965959719919e-06, "loss": 0.9607, "step": 6025 }, { "epoch": 1.12, "grad_norm": 2.6078387298892918, "learning_rate": 4.027780960134813e-06, "loss": 0.9941, "step": 6030 }, { "epoch": 1.13, "grad_norm": 2.7700921852331244, "learning_rate": 4.0205980464338765e-06, "loss": 0.9948, "step": 6035 }, { "epoch": 1.13, "grad_norm": 2.6699866883313113, "learning_rate": 4.01341723402796e-06, "loss": 0.9995, "step": 6040 }, { "epoch": 1.13, "grad_norm": 2.194966684039299, "learning_rate": 4.006238538323415e-06, "loss": 1.005, "step": 6045 }, { "epoch": 1.13, "grad_norm": 2.6389273378201152, "learning_rate": 3.999061974722041e-06, "loss": 1.0018, "step": 6050 }, { "epoch": 1.13, "grad_norm": 2.288324978564677, "learning_rate": 3.991887558621071e-06, "loss": 1.0326, "step": 6055 }, { "epoch": 1.13, "grad_norm": 2.382363146274824, "learning_rate": 3.984715305413125e-06, "loss": 0.9227, "step": 6060 }, { "epoch": 1.13, "grad_norm": 2.545591262311767, "learning_rate": 3.97754523048619e-06, "loss": 0.9955, "step": 6065 }, { "epoch": 1.13, "grad_norm": 2.696133141462619, "learning_rate": 3.970377349223572e-06, "loss": 1.0089, "step": 6070 }, { "epoch": 1.13, "grad_norm": 2.5249857811313383, "learning_rate": 3.963211677003872e-06, "loss": 1.06, "step": 6075 }, { "epoch": 1.13, "grad_norm": 2.6974517008498076, "learning_rate": 3.956048229200956e-06, "loss": 1.0025, "step": 6080 }, { "epoch": 1.13, "grad_norm": 2.708829049326595, "learning_rate": 3.948887021183911e-06, "loss": 0.9934, "step": 6085 }, { "epoch": 1.14, "grad_norm": 2.753223542699741, "learning_rate": 3.941728068317026e-06, "loss": 1.0309, "step": 6090 }, { "epoch": 1.14, "grad_norm": 2.5571783646544533, "learning_rate": 3.934571385959745e-06, "loss": 1.0078, "step": 6095 }, { "epoch": 1.14, "grad_norm": 2.9619368449595704, "learning_rate": 3.927416989466644e-06, "loss": 1.0263, "step": 6100 }, { "epoch": 1.14, "grad_norm": 2.527413370302673, "learning_rate": 3.920264894187393e-06, "loss": 1.0192, "step": 6105 }, { "epoch": 1.14, "grad_norm": 2.290679414980372, "learning_rate": 3.913115115466728e-06, "loss": 1.0123, "step": 6110 }, { "epoch": 1.14, "grad_norm": 2.1791472017494864, "learning_rate": 3.9059676686444095e-06, "loss": 1.0294, "step": 6115 }, { "epoch": 1.14, "grad_norm": 2.4477872352744097, "learning_rate": 3.898822569055201e-06, "loss": 0.9835, "step": 6120 }, { "epoch": 1.14, "grad_norm": 2.4553291419944574, "learning_rate": 3.8916798320288254e-06, "loss": 0.9878, "step": 6125 }, { "epoch": 1.14, "grad_norm": 2.5549208400564756, "learning_rate": 3.884539472889938e-06, "loss": 1.0245, "step": 6130 }, { "epoch": 1.14, "grad_norm": 2.485934742620887, "learning_rate": 3.877401506958095e-06, "loss": 0.9945, "step": 6135 }, { "epoch": 1.15, "grad_norm": 2.2578183339754023, "learning_rate": 3.8702659495477144e-06, "loss": 1.0209, "step": 6140 }, { "epoch": 1.15, "grad_norm": 2.3465134445411304, "learning_rate": 3.863132815968048e-06, "loss": 1.0666, "step": 6145 }, { "epoch": 1.15, "grad_norm": 2.29022378683888, "learning_rate": 3.856002121523147e-06, "loss": 1.0055, "step": 6150 }, { "epoch": 1.15, "grad_norm": 2.660614384744464, "learning_rate": 3.848873881511831e-06, "loss": 1.0101, "step": 6155 }, { "epoch": 1.15, "grad_norm": 2.615319592937916, "learning_rate": 3.841748111227652e-06, "loss": 0.9582, "step": 6160 }, { "epoch": 1.15, "grad_norm": 2.817918853778393, "learning_rate": 3.834624825958864e-06, "loss": 0.9969, "step": 6165 }, { "epoch": 1.15, "grad_norm": 2.4439689869979455, "learning_rate": 3.827504040988388e-06, "loss": 0.9914, "step": 6170 }, { "epoch": 1.15, "grad_norm": 2.4582126352422438, "learning_rate": 3.8203857715937845e-06, "loss": 1.0226, "step": 6175 }, { "epoch": 1.15, "grad_norm": 2.9601365316898867, "learning_rate": 3.8132700330472124e-06, "loss": 0.9979, "step": 6180 }, { "epoch": 1.15, "grad_norm": 2.1744301885466704, "learning_rate": 3.8061568406154035e-06, "loss": 0.9677, "step": 6185 }, { "epoch": 1.15, "grad_norm": 2.9345644472209083, "learning_rate": 3.799046209559623e-06, "loss": 0.9926, "step": 6190 }, { "epoch": 1.16, "grad_norm": 2.487795059019343, "learning_rate": 3.7919381551356478e-06, "loss": 1.0199, "step": 6195 }, { "epoch": 1.16, "grad_norm": 2.7762118273919603, "learning_rate": 3.7848326925937207e-06, "loss": 0.985, "step": 6200 }, { "epoch": 1.16, "grad_norm": 2.385619014287349, "learning_rate": 3.7777298371785257e-06, "loss": 1.0049, "step": 6205 }, { "epoch": 1.16, "grad_norm": 3.1858905013848773, "learning_rate": 3.770629604129153e-06, "loss": 1.0067, "step": 6210 }, { "epoch": 1.16, "grad_norm": 2.646520353336217, "learning_rate": 3.7635320086790635e-06, "loss": 1.0203, "step": 6215 }, { "epoch": 1.16, "grad_norm": 2.703518348813667, "learning_rate": 3.7564370660560665e-06, "loss": 0.9956, "step": 6220 }, { "epoch": 1.16, "grad_norm": 2.6488264930358074, "learning_rate": 3.7493447914822766e-06, "loss": 1.0232, "step": 6225 }, { "epoch": 1.16, "grad_norm": 3.0881970088316533, "learning_rate": 3.7422552001740782e-06, "loss": 0.9966, "step": 6230 }, { "epoch": 1.16, "grad_norm": 2.6057449190922015, "learning_rate": 3.7351683073421036e-06, "loss": 1.0195, "step": 6235 }, { "epoch": 1.16, "grad_norm": 2.442247586855112, "learning_rate": 3.7280841281912007e-06, "loss": 0.999, "step": 6240 }, { "epoch": 1.16, "grad_norm": 2.6614307097799346, "learning_rate": 3.7210026779203863e-06, "loss": 1.0111, "step": 6245 }, { "epoch": 1.17, "grad_norm": 2.883103898286734, "learning_rate": 3.713923971722826e-06, "loss": 0.989, "step": 6250 }, { "epoch": 1.17, "grad_norm": 2.3731982209108873, "learning_rate": 3.7068480247857975e-06, "loss": 1.077, "step": 6255 }, { "epoch": 1.17, "grad_norm": 2.5547604391314667, "learning_rate": 3.6997748522906627e-06, "loss": 1.0154, "step": 6260 }, { "epoch": 1.17, "grad_norm": 2.6226187885824643, "learning_rate": 3.6927044694128255e-06, "loss": 1.0549, "step": 6265 }, { "epoch": 1.17, "grad_norm": 3.0423886957703217, "learning_rate": 3.685636891321706e-06, "loss": 0.9967, "step": 6270 }, { "epoch": 1.17, "grad_norm": 2.689767779138463, "learning_rate": 3.678572133180708e-06, "loss": 0.9555, "step": 6275 }, { "epoch": 1.17, "grad_norm": 2.5322314585255947, "learning_rate": 3.671510210147183e-06, "loss": 0.9532, "step": 6280 }, { "epoch": 1.17, "grad_norm": 2.5502386618866866, "learning_rate": 3.6644511373724035e-06, "loss": 1.0262, "step": 6285 }, { "epoch": 1.17, "grad_norm": 2.9597133665829354, "learning_rate": 3.657394930001524e-06, "loss": 1.0865, "step": 6290 }, { "epoch": 1.17, "grad_norm": 2.4548956067402226, "learning_rate": 3.650341603173552e-06, "loss": 1.0551, "step": 6295 }, { "epoch": 1.17, "grad_norm": 2.415307407799765, "learning_rate": 3.6432911720213127e-06, "loss": 1.0286, "step": 6300 }, { "epoch": 1.18, "grad_norm": 2.518703622236714, "learning_rate": 3.636243651671424e-06, "loss": 0.9453, "step": 6305 }, { "epoch": 1.18, "grad_norm": 2.9236092035869587, "learning_rate": 3.6291990572442527e-06, "loss": 1.0284, "step": 6310 }, { "epoch": 1.18, "grad_norm": 2.7290628973336393, "learning_rate": 3.6221574038538926e-06, "loss": 0.9858, "step": 6315 }, { "epoch": 1.18, "grad_norm": 2.746796572631986, "learning_rate": 3.615118706608125e-06, "loss": 0.9979, "step": 6320 }, { "epoch": 1.18, "grad_norm": 2.2102817767160086, "learning_rate": 3.6080829806083885e-06, "loss": 1.0017, "step": 6325 }, { "epoch": 1.18, "grad_norm": 2.3433097259852578, "learning_rate": 3.6010502409497493e-06, "loss": 0.9991, "step": 6330 }, { "epoch": 1.18, "grad_norm": 2.669305876106401, "learning_rate": 3.594020502720865e-06, "loss": 1.0773, "step": 6335 }, { "epoch": 1.18, "grad_norm": 2.5602011856012203, "learning_rate": 3.586993781003954e-06, "loss": 1.003, "step": 6340 }, { "epoch": 1.18, "grad_norm": 2.3053351276904617, "learning_rate": 3.5799700908747607e-06, "loss": 0.9884, "step": 6345 }, { "epoch": 1.18, "grad_norm": 2.3995150259514064, "learning_rate": 3.5729494474025296e-06, "loss": 1.0127, "step": 6350 }, { "epoch": 1.19, "grad_norm": 2.3227336825063083, "learning_rate": 3.565931865649965e-06, "loss": 1.0194, "step": 6355 }, { "epoch": 1.19, "grad_norm": 2.5258564235269643, "learning_rate": 3.5589173606732042e-06, "loss": 1.0305, "step": 6360 }, { "epoch": 1.19, "grad_norm": 2.5245519835366306, "learning_rate": 3.551905947521781e-06, "loss": 1.0075, "step": 6365 }, { "epoch": 1.19, "grad_norm": 2.6202125604841493, "learning_rate": 3.5448976412385994e-06, "loss": 1.0026, "step": 6370 }, { "epoch": 1.19, "grad_norm": 2.2985573829337143, "learning_rate": 3.537892456859895e-06, "loss": 1.0377, "step": 6375 }, { "epoch": 1.19, "grad_norm": 2.5445427577379225, "learning_rate": 3.5308904094152047e-06, "loss": 1.0336, "step": 6380 }, { "epoch": 1.19, "grad_norm": 2.587984616767967, "learning_rate": 3.5238915139273387e-06, "loss": 0.9534, "step": 6385 }, { "epoch": 1.19, "grad_norm": 2.4587427920639082, "learning_rate": 3.5168957854123386e-06, "loss": 1.041, "step": 6390 }, { "epoch": 1.19, "grad_norm": 2.5776496842048604, "learning_rate": 3.5099032388794596e-06, "loss": 1.0119, "step": 6395 }, { "epoch": 1.19, "grad_norm": 2.9999317197733704, "learning_rate": 3.5029138893311245e-06, "loss": 1.0094, "step": 6400 }, { "epoch": 1.19, "grad_norm": 2.370429876481651, "learning_rate": 3.4959277517628953e-06, "loss": 0.9934, "step": 6405 }, { "epoch": 1.2, "grad_norm": 2.3397878191899757, "learning_rate": 3.4889448411634465e-06, "loss": 1.0449, "step": 6410 }, { "epoch": 1.2, "grad_norm": 2.9633064271669465, "learning_rate": 3.4819651725145303e-06, "loss": 1.0328, "step": 6415 }, { "epoch": 1.2, "grad_norm": 2.838930291488574, "learning_rate": 3.4749887607909416e-06, "loss": 0.9886, "step": 6420 }, { "epoch": 1.2, "grad_norm": 2.662756964502569, "learning_rate": 3.468015620960484e-06, "loss": 0.9936, "step": 6425 }, { "epoch": 1.2, "grad_norm": 2.5081288815673592, "learning_rate": 3.4610457679839445e-06, "loss": 1.0417, "step": 6430 }, { "epoch": 1.2, "grad_norm": 2.1688593673892846, "learning_rate": 3.4540792168150618e-06, "loss": 0.988, "step": 6435 }, { "epoch": 1.2, "grad_norm": 2.549976740705903, "learning_rate": 3.447115982400485e-06, "loss": 0.9867, "step": 6440 }, { "epoch": 1.2, "grad_norm": 2.70924169526118, "learning_rate": 3.440156079679749e-06, "loss": 1.0214, "step": 6445 }, { "epoch": 1.2, "grad_norm": 2.201236785717843, "learning_rate": 3.4331995235852406e-06, "loss": 0.996, "step": 6450 }, { "epoch": 1.2, "grad_norm": 2.3856197789691813, "learning_rate": 3.4262463290421654e-06, "loss": 0.9932, "step": 6455 }, { "epoch": 1.2, "grad_norm": 2.4317340460371484, "learning_rate": 3.4192965109685215e-06, "loss": 1.0217, "step": 6460 }, { "epoch": 1.21, "grad_norm": 2.2660839453173676, "learning_rate": 3.412350084275057e-06, "loss": 0.995, "step": 6465 }, { "epoch": 1.21, "grad_norm": 2.8062486187830142, "learning_rate": 3.4054070638652458e-06, "loss": 0.9988, "step": 6470 }, { "epoch": 1.21, "grad_norm": 2.8224885243520745, "learning_rate": 3.3984674646352543e-06, "loss": 1.0084, "step": 6475 }, { "epoch": 1.21, "grad_norm": 2.355107056771508, "learning_rate": 3.39153130147391e-06, "loss": 1.0043, "step": 6480 }, { "epoch": 1.21, "grad_norm": 2.319540034779808, "learning_rate": 3.3845985892626654e-06, "loss": 0.9765, "step": 6485 }, { "epoch": 1.21, "grad_norm": 2.4007112000801794, "learning_rate": 3.3776693428755714e-06, "loss": 1.0303, "step": 6490 }, { "epoch": 1.21, "grad_norm": 2.7687023865947022, "learning_rate": 3.3707435771792417e-06, "loss": 1.0198, "step": 6495 }, { "epoch": 1.21, "grad_norm": 2.2967682488576187, "learning_rate": 3.3638213070328223e-06, "loss": 1.0012, "step": 6500 }, { "epoch": 1.21, "grad_norm": 3.00503160474256, "learning_rate": 3.356902547287961e-06, "loss": 1.0415, "step": 6505 }, { "epoch": 1.21, "grad_norm": 2.581469591673954, "learning_rate": 3.3499873127887726e-06, "loss": 1.0051, "step": 6510 }, { "epoch": 1.21, "grad_norm": 2.6784260311780703, "learning_rate": 3.343075618371808e-06, "loss": 0.968, "step": 6515 }, { "epoch": 1.22, "grad_norm": 2.742914883406482, "learning_rate": 3.336167478866024e-06, "loss": 1.054, "step": 6520 }, { "epoch": 1.22, "grad_norm": 2.5967700561103912, "learning_rate": 3.329262909092752e-06, "loss": 0.9289, "step": 6525 }, { "epoch": 1.22, "grad_norm": 2.932847442630695, "learning_rate": 3.322361923865661e-06, "loss": 1.0103, "step": 6530 }, { "epoch": 1.22, "grad_norm": 4.336241175871444, "learning_rate": 3.3154645379907315e-06, "loss": 0.9699, "step": 6535 }, { "epoch": 1.22, "grad_norm": 2.2502099078873927, "learning_rate": 3.3085707662662208e-06, "loss": 1.0008, "step": 6540 }, { "epoch": 1.22, "grad_norm": 2.3994266512691262, "learning_rate": 3.3016806234826336e-06, "loss": 0.9688, "step": 6545 }, { "epoch": 1.22, "grad_norm": 2.6533685347490947, "learning_rate": 3.294794124422688e-06, "loss": 1.0142, "step": 6550 }, { "epoch": 1.22, "grad_norm": 2.516906851667742, "learning_rate": 3.2879112838612837e-06, "loss": 0.9894, "step": 6555 }, { "epoch": 1.22, "grad_norm": 2.372071290525371, "learning_rate": 3.281032116565473e-06, "loss": 1.0566, "step": 6560 }, { "epoch": 1.22, "grad_norm": 2.6515335583185022, "learning_rate": 3.274156637294421e-06, "loss": 1.0129, "step": 6565 }, { "epoch": 1.23, "grad_norm": 2.316592337770323, "learning_rate": 3.2672848607993913e-06, "loss": 0.9844, "step": 6570 }, { "epoch": 1.23, "grad_norm": 2.7973489389722634, "learning_rate": 3.260416801823694e-06, "loss": 1.0009, "step": 6575 }, { "epoch": 1.23, "grad_norm": 2.348401957786061, "learning_rate": 3.253552475102668e-06, "loss": 1.0305, "step": 6580 }, { "epoch": 1.23, "grad_norm": 2.365305340735876, "learning_rate": 3.2466918953636394e-06, "loss": 0.9931, "step": 6585 }, { "epoch": 1.23, "grad_norm": 2.536643326010143, "learning_rate": 3.2398350773259035e-06, "loss": 1.031, "step": 6590 }, { "epoch": 1.23, "grad_norm": 2.6324859382475863, "learning_rate": 3.232982035700678e-06, "loss": 1.0175, "step": 6595 }, { "epoch": 1.23, "grad_norm": 2.3943506218465296, "learning_rate": 3.2261327851910827e-06, "loss": 1.0144, "step": 6600 }, { "epoch": 1.23, "grad_norm": 2.3642765166995945, "learning_rate": 3.2192873404920966e-06, "loss": 0.9929, "step": 6605 }, { "epoch": 1.23, "grad_norm": 2.357721061823941, "learning_rate": 3.212445716290543e-06, "loss": 1.0205, "step": 6610 }, { "epoch": 1.23, "grad_norm": 3.0330694342984508, "learning_rate": 3.205607927265044e-06, "loss": 1.031, "step": 6615 }, { "epoch": 1.23, "grad_norm": 2.6767493180451973, "learning_rate": 3.1987739880859902e-06, "loss": 1.0337, "step": 6620 }, { "epoch": 1.24, "grad_norm": 2.595396432430348, "learning_rate": 3.1919439134155173e-06, "loss": 1.0096, "step": 6625 }, { "epoch": 1.24, "grad_norm": 2.4346297592857926, "learning_rate": 3.1851177179074645e-06, "loss": 1.0022, "step": 6630 }, { "epoch": 1.24, "grad_norm": 2.7420307620549025, "learning_rate": 3.1782954162073566e-06, "loss": 1.0477, "step": 6635 }, { "epoch": 1.24, "grad_norm": 3.026839985368973, "learning_rate": 3.1714770229523563e-06, "loss": 1.0093, "step": 6640 }, { "epoch": 1.24, "grad_norm": 2.408559067936416, "learning_rate": 3.1646625527712426e-06, "loss": 0.963, "step": 6645 }, { "epoch": 1.24, "grad_norm": 2.656230938630765, "learning_rate": 3.1578520202843806e-06, "loss": 1.0291, "step": 6650 }, { "epoch": 1.24, "grad_norm": 3.1429634200338072, "learning_rate": 3.151045440103685e-06, "loss": 1.061, "step": 6655 }, { "epoch": 1.24, "grad_norm": 3.0525857382979384, "learning_rate": 3.1442428268325915e-06, "loss": 1.0265, "step": 6660 }, { "epoch": 1.24, "grad_norm": 2.5262738969825596, "learning_rate": 3.137444195066023e-06, "loss": 1.0584, "step": 6665 }, { "epoch": 1.24, "grad_norm": 2.8656853508515683, "learning_rate": 3.130649559390362e-06, "loss": 1.0356, "step": 6670 }, { "epoch": 1.24, "grad_norm": 2.678563207407218, "learning_rate": 3.123858934383418e-06, "loss": 1.0051, "step": 6675 }, { "epoch": 1.25, "grad_norm": 2.6967742942369313, "learning_rate": 3.1170723346143936e-06, "loss": 0.9988, "step": 6680 }, { "epoch": 1.25, "grad_norm": 2.6334943332730294, "learning_rate": 3.110289774643857e-06, "loss": 0.9862, "step": 6685 }, { "epoch": 1.25, "grad_norm": 2.5356006656026158, "learning_rate": 3.1035112690237086e-06, "loss": 1.0316, "step": 6690 }, { "epoch": 1.25, "grad_norm": 2.507316947200383, "learning_rate": 3.096736832297148e-06, "loss": 0.9558, "step": 6695 }, { "epoch": 1.25, "grad_norm": 3.15448089703757, "learning_rate": 3.08996647899865e-06, "loss": 0.9614, "step": 6700 }, { "epoch": 1.25, "grad_norm": 2.792658317811039, "learning_rate": 3.0832002236539243e-06, "loss": 0.9971, "step": 6705 }, { "epoch": 1.25, "grad_norm": 2.680576302235998, "learning_rate": 3.07643808077989e-06, "loss": 1.0216, "step": 6710 }, { "epoch": 1.25, "grad_norm": 2.5763702681537897, "learning_rate": 3.069680064884641e-06, "loss": 0.9948, "step": 6715 }, { "epoch": 1.25, "grad_norm": 2.5684486308454457, "learning_rate": 3.0629261904674206e-06, "loss": 0.9914, "step": 6720 }, { "epoch": 1.25, "grad_norm": 2.456176378605224, "learning_rate": 3.056176472018584e-06, "loss": 1.0467, "step": 6725 }, { "epoch": 1.26, "grad_norm": 2.3265015455090543, "learning_rate": 3.0494309240195706e-06, "loss": 1.0284, "step": 6730 }, { "epoch": 1.26, "grad_norm": 2.5090802418299103, "learning_rate": 3.0426895609428713e-06, "loss": 0.9892, "step": 6735 }, { "epoch": 1.26, "grad_norm": 2.4337598655357797, "learning_rate": 3.0359523972519976e-06, "loss": 1.0318, "step": 6740 }, { "epoch": 1.26, "grad_norm": 2.9534700024492158, "learning_rate": 3.029219447401456e-06, "loss": 0.9617, "step": 6745 }, { "epoch": 1.26, "grad_norm": 2.5169294067657986, "learning_rate": 3.0224907258367063e-06, "loss": 1.0083, "step": 6750 }, { "epoch": 1.26, "grad_norm": 2.5412138380242717, "learning_rate": 3.0157662469941394e-06, "loss": 1.0024, "step": 6755 }, { "epoch": 1.26, "grad_norm": 2.383419470697286, "learning_rate": 3.009046025301042e-06, "loss": 1.0007, "step": 6760 }, { "epoch": 1.26, "grad_norm": 2.346170762298227, "learning_rate": 3.0023300751755713e-06, "loss": 1.0101, "step": 6765 }, { "epoch": 1.26, "grad_norm": 2.6426094982234685, "learning_rate": 2.995618411026715e-06, "loss": 1.0129, "step": 6770 }, { "epoch": 1.26, "grad_norm": 2.6651384968183742, "learning_rate": 2.988911047254268e-06, "loss": 0.9953, "step": 6775 }, { "epoch": 1.26, "grad_norm": 2.2908540342398376, "learning_rate": 2.982207998248795e-06, "loss": 0.9605, "step": 6780 }, { "epoch": 1.27, "grad_norm": 2.605274696824992, "learning_rate": 2.9755092783916106e-06, "loss": 1.0554, "step": 6785 }, { "epoch": 1.27, "grad_norm": 2.54625674691826, "learning_rate": 2.968814902054735e-06, "loss": 0.9603, "step": 6790 }, { "epoch": 1.27, "grad_norm": 2.353611332509148, "learning_rate": 2.962124883600874e-06, "loss": 1.0114, "step": 6795 }, { "epoch": 1.27, "grad_norm": 2.3484072442486625, "learning_rate": 2.955439237383377e-06, "loss": 0.9552, "step": 6800 }, { "epoch": 1.27, "grad_norm": 2.538093611992046, "learning_rate": 2.9487579777462165e-06, "loss": 1.0189, "step": 6805 }, { "epoch": 1.27, "grad_norm": 2.404444291570621, "learning_rate": 2.94208111902396e-06, "loss": 0.9687, "step": 6810 }, { "epoch": 1.27, "grad_norm": 2.5355302453879744, "learning_rate": 2.9354086755417226e-06, "loss": 0.993, "step": 6815 }, { "epoch": 1.27, "grad_norm": 2.63694242852118, "learning_rate": 2.9287406616151513e-06, "loss": 1.0192, "step": 6820 }, { "epoch": 1.27, "grad_norm": 2.2105266714422642, "learning_rate": 2.9220770915503884e-06, "loss": 1.0121, "step": 6825 }, { "epoch": 1.27, "grad_norm": 2.5519970083541343, "learning_rate": 2.9154179796440463e-06, "loss": 1.0066, "step": 6830 }, { "epoch": 1.27, "grad_norm": 2.5838265435049017, "learning_rate": 2.9087633401831654e-06, "loss": 1.036, "step": 6835 }, { "epoch": 1.28, "grad_norm": 2.5274982625597473, "learning_rate": 2.9021131874451957e-06, "loss": 1.0009, "step": 6840 }, { "epoch": 1.28, "grad_norm": 2.594414889982177, "learning_rate": 2.8954675356979566e-06, "loss": 1.0087, "step": 6845 }, { "epoch": 1.28, "grad_norm": 2.5135656552653973, "learning_rate": 2.8888263991996172e-06, "loss": 1.0323, "step": 6850 }, { "epoch": 1.28, "grad_norm": 2.4202739645255775, "learning_rate": 2.882189792198654e-06, "loss": 0.9964, "step": 6855 }, { "epoch": 1.28, "grad_norm": 2.5258407501718967, "learning_rate": 2.8755577289338267e-06, "loss": 0.9941, "step": 6860 }, { "epoch": 1.28, "grad_norm": 2.8800933954985335, "learning_rate": 2.868930223634149e-06, "loss": 1.0738, "step": 6865 }, { "epoch": 1.28, "grad_norm": 2.536918280979007, "learning_rate": 2.862307290518846e-06, "loss": 1.0208, "step": 6870 }, { "epoch": 1.28, "grad_norm": 2.371938641774631, "learning_rate": 2.855688943797348e-06, "loss": 1.0219, "step": 6875 }, { "epoch": 1.28, "grad_norm": 2.33992110293397, "learning_rate": 2.8490751976692345e-06, "loss": 0.9992, "step": 6880 }, { "epoch": 1.28, "grad_norm": 2.635082343200966, "learning_rate": 2.8424660663242178e-06, "loss": 1.0006, "step": 6885 }, { "epoch": 1.28, "grad_norm": 2.865808841306876, "learning_rate": 2.835861563942107e-06, "loss": 1.0357, "step": 6890 }, { "epoch": 1.29, "grad_norm": 2.4160110151856866, "learning_rate": 2.829261704692787e-06, "loss": 0.9538, "step": 6895 }, { "epoch": 1.29, "grad_norm": 2.544168845714422, "learning_rate": 2.8226665027361753e-06, "loss": 1.0393, "step": 6900 }, { "epoch": 1.29, "grad_norm": 2.467913497990881, "learning_rate": 2.8160759722221942e-06, "loss": 1.0535, "step": 6905 }, { "epoch": 1.29, "grad_norm": 2.9028252530293353, "learning_rate": 2.809490127290746e-06, "loss": 1.0147, "step": 6910 }, { "epoch": 1.29, "grad_norm": 2.6008808280831097, "learning_rate": 2.802908982071685e-06, "loss": 1.047, "step": 6915 }, { "epoch": 1.29, "grad_norm": 2.6288368611864863, "learning_rate": 2.796332550684778e-06, "loss": 1.0448, "step": 6920 }, { "epoch": 1.29, "grad_norm": 2.448524250695845, "learning_rate": 2.7897608472396776e-06, "loss": 1.0377, "step": 6925 }, { "epoch": 1.29, "grad_norm": 2.4677282962158085, "learning_rate": 2.783193885835894e-06, "loss": 0.9857, "step": 6930 }, { "epoch": 1.29, "grad_norm": 2.7995182741898494, "learning_rate": 2.7766316805627623e-06, "loss": 1.0324, "step": 6935 }, { "epoch": 1.29, "grad_norm": 2.6020088847197256, "learning_rate": 2.7700742454994195e-06, "loss": 0.9802, "step": 6940 }, { "epoch": 1.3, "grad_norm": 2.584249913609, "learning_rate": 2.7635215947147574e-06, "loss": 1.0669, "step": 6945 }, { "epoch": 1.3, "grad_norm": 2.6987668383796994, "learning_rate": 2.7569737422674103e-06, "loss": 1.0323, "step": 6950 }, { "epoch": 1.3, "grad_norm": 2.784482570862148, "learning_rate": 2.7504307022057152e-06, "loss": 0.9833, "step": 6955 }, { "epoch": 1.3, "grad_norm": 2.47770812516728, "learning_rate": 2.7438924885676886e-06, "loss": 0.9966, "step": 6960 }, { "epoch": 1.3, "grad_norm": 2.73598006360337, "learning_rate": 2.7373591153809864e-06, "loss": 1.0233, "step": 6965 }, { "epoch": 1.3, "grad_norm": 2.690482453892539, "learning_rate": 2.7308305966628823e-06, "loss": 0.9964, "step": 6970 }, { "epoch": 1.3, "grad_norm": 2.4336117130705035, "learning_rate": 2.724306946420234e-06, "loss": 0.9767, "step": 6975 }, { "epoch": 1.3, "grad_norm": 2.6576192606668587, "learning_rate": 2.7177881786494538e-06, "loss": 0.995, "step": 6980 }, { "epoch": 1.3, "grad_norm": 2.142004404838875, "learning_rate": 2.711274307336479e-06, "loss": 0.9556, "step": 6985 }, { "epoch": 1.3, "grad_norm": 2.533080687394775, "learning_rate": 2.7047653464567416e-06, "loss": 1.0542, "step": 6990 }, { "epoch": 1.3, "grad_norm": 2.7053823200502127, "learning_rate": 2.6982613099751376e-06, "loss": 0.963, "step": 6995 }, { "epoch": 1.31, "grad_norm": 2.597526805111531, "learning_rate": 2.6917622118459975e-06, "loss": 1.0127, "step": 7000 }, { "epoch": 1.31, "grad_norm": 2.987091945981233, "learning_rate": 2.6852680660130616e-06, "loss": 1.0293, "step": 7005 }, { "epoch": 1.31, "grad_norm": 2.3621389957330217, "learning_rate": 2.678778886409438e-06, "loss": 1.0367, "step": 7010 }, { "epoch": 1.31, "grad_norm": 2.8142933423345666, "learning_rate": 2.6722946869575837e-06, "loss": 0.9871, "step": 7015 }, { "epoch": 1.31, "grad_norm": 2.8818765360063967, "learning_rate": 2.6658154815692693e-06, "loss": 0.9571, "step": 7020 }, { "epoch": 1.31, "grad_norm": 2.470646940527506, "learning_rate": 2.659341284145553e-06, "loss": 0.975, "step": 7025 }, { "epoch": 1.31, "grad_norm": 3.5524754035548893, "learning_rate": 2.6528721085767457e-06, "loss": 1.0368, "step": 7030 }, { "epoch": 1.31, "grad_norm": 2.612626897766346, "learning_rate": 2.646407968742385e-06, "loss": 0.9813, "step": 7035 }, { "epoch": 1.31, "grad_norm": 2.4800999564613626, "learning_rate": 2.6399488785112055e-06, "loss": 1.0118, "step": 7040 }, { "epoch": 1.31, "grad_norm": 2.5510537924141943, "learning_rate": 2.633494851741104e-06, "loss": 1.0442, "step": 7045 }, { "epoch": 1.31, "grad_norm": 2.636748644407911, "learning_rate": 2.6270459022791218e-06, "loss": 1.0212, "step": 7050 }, { "epoch": 1.32, "grad_norm": 2.7034201669908917, "learning_rate": 2.6206020439613988e-06, "loss": 1.0306, "step": 7055 }, { "epoch": 1.32, "grad_norm": 2.337916153105764, "learning_rate": 2.6141632906131578e-06, "loss": 1.0701, "step": 7060 }, { "epoch": 1.32, "grad_norm": 2.7302392511630447, "learning_rate": 2.60772965604866e-06, "loss": 1.0052, "step": 7065 }, { "epoch": 1.32, "grad_norm": 2.622516557603338, "learning_rate": 2.6013011540711954e-06, "loss": 1.0121, "step": 7070 }, { "epoch": 1.32, "grad_norm": 2.457751042922651, "learning_rate": 2.5948777984730354e-06, "loss": 0.9813, "step": 7075 }, { "epoch": 1.32, "grad_norm": 2.5201979245792114, "learning_rate": 2.5884596030354127e-06, "loss": 1.0001, "step": 7080 }, { "epoch": 1.32, "grad_norm": 3.100741302309318, "learning_rate": 2.5820465815284845e-06, "loss": 1.0238, "step": 7085 }, { "epoch": 1.32, "grad_norm": 2.8057319534838077, "learning_rate": 2.5756387477113154e-06, "loss": 1.0933, "step": 7090 }, { "epoch": 1.32, "grad_norm": 2.739482279660433, "learning_rate": 2.5692361153318333e-06, "loss": 1.053, "step": 7095 }, { "epoch": 1.32, "grad_norm": 3.0396582626801707, "learning_rate": 2.56283869812681e-06, "loss": 1.0126, "step": 7100 }, { "epoch": 1.33, "grad_norm": 2.8612314821867355, "learning_rate": 2.5564465098218225e-06, "loss": 1.0162, "step": 7105 }, { "epoch": 1.33, "grad_norm": 2.540230694944602, "learning_rate": 2.5500595641312344e-06, "loss": 1.0183, "step": 7110 }, { "epoch": 1.33, "grad_norm": 2.5484379608694723, "learning_rate": 2.543677874758163e-06, "loss": 1.0186, "step": 7115 }, { "epoch": 1.33, "grad_norm": 2.7804517969115725, "learning_rate": 2.5373014553944443e-06, "loss": 1.0007, "step": 7120 }, { "epoch": 1.33, "grad_norm": 2.4281339631896053, "learning_rate": 2.530930319720608e-06, "loss": 0.9656, "step": 7125 }, { "epoch": 1.33, "grad_norm": 2.506088098134065, "learning_rate": 2.5245644814058475e-06, "loss": 1.0063, "step": 7130 }, { "epoch": 1.33, "grad_norm": 2.8452273943404682, "learning_rate": 2.5182039541079963e-06, "loss": 0.9963, "step": 7135 }, { "epoch": 1.33, "grad_norm": 2.717683642251381, "learning_rate": 2.511848751473485e-06, "loss": 0.9919, "step": 7140 }, { "epoch": 1.33, "grad_norm": 2.3677700987643617, "learning_rate": 2.5054988871373225e-06, "loss": 1.0083, "step": 7145 }, { "epoch": 1.33, "grad_norm": 2.5612910826475708, "learning_rate": 2.499154374723069e-06, "loss": 0.9992, "step": 7150 }, { "epoch": 1.33, "grad_norm": 2.7925071554565806, "learning_rate": 2.492815227842795e-06, "loss": 1.0352, "step": 7155 }, { "epoch": 1.34, "grad_norm": 2.6005784247999735, "learning_rate": 2.486481460097068e-06, "loss": 0.9978, "step": 7160 }, { "epoch": 1.34, "grad_norm": 2.2973468599232247, "learning_rate": 2.4801530850749086e-06, "loss": 0.996, "step": 7165 }, { "epoch": 1.34, "grad_norm": 2.831479201002048, "learning_rate": 2.4738301163537675e-06, "loss": 1.0229, "step": 7170 }, { "epoch": 1.34, "grad_norm": 2.4063077604608827, "learning_rate": 2.4675125674994983e-06, "loss": 0.9992, "step": 7175 }, { "epoch": 1.34, "grad_norm": 2.4806704995189772, "learning_rate": 2.4612004520663246e-06, "loss": 0.968, "step": 7180 }, { "epoch": 1.34, "grad_norm": 2.6313623117831324, "learning_rate": 2.4548937835968143e-06, "loss": 0.988, "step": 7185 }, { "epoch": 1.34, "grad_norm": 2.903837181362707, "learning_rate": 2.448592575621849e-06, "loss": 0.9885, "step": 7190 }, { "epoch": 1.34, "grad_norm": 2.6711362094088305, "learning_rate": 2.4422968416605897e-06, "loss": 1.019, "step": 7195 }, { "epoch": 1.34, "grad_norm": 3.1576982437768524, "learning_rate": 2.4360065952204636e-06, "loss": 1.033, "step": 7200 }, { "epoch": 1.34, "grad_norm": 2.92877302700557, "learning_rate": 2.429721849797115e-06, "loss": 1.0043, "step": 7205 }, { "epoch": 1.34, "grad_norm": 2.4781966294522877, "learning_rate": 2.4234426188743893e-06, "loss": 1.0097, "step": 7210 }, { "epoch": 1.35, "grad_norm": 2.9640141348991427, "learning_rate": 2.4171689159243005e-06, "loss": 0.9569, "step": 7215 }, { "epoch": 1.35, "grad_norm": 2.485092344715958, "learning_rate": 2.4109007544070024e-06, "loss": 1.0391, "step": 7220 }, { "epoch": 1.35, "grad_norm": 2.700577108794825, "learning_rate": 2.4046381477707587e-06, "loss": 1.0008, "step": 7225 }, { "epoch": 1.35, "grad_norm": 2.439655699190244, "learning_rate": 2.3983811094519172e-06, "loss": 0.9949, "step": 7230 }, { "epoch": 1.35, "grad_norm": 2.2209150426109816, "learning_rate": 2.3921296528748774e-06, "loss": 1.0128, "step": 7235 }, { "epoch": 1.35, "grad_norm": 2.3755750932435036, "learning_rate": 2.385883791452061e-06, "loss": 1.0717, "step": 7240 }, { "epoch": 1.35, "grad_norm": 2.4671325785441445, "learning_rate": 2.3796435385838926e-06, "loss": 1.0212, "step": 7245 }, { "epoch": 1.35, "grad_norm": 3.012659647333001, "learning_rate": 2.3734089076587572e-06, "loss": 1.0254, "step": 7250 }, { "epoch": 1.35, "grad_norm": 2.512602473812372, "learning_rate": 2.3671799120529793e-06, "loss": 0.9908, "step": 7255 }, { "epoch": 1.35, "grad_norm": 2.4269671022760564, "learning_rate": 2.360956565130794e-06, "loss": 0.9849, "step": 7260 }, { "epoch": 1.35, "grad_norm": 2.4199392850875454, "learning_rate": 2.3547388802443166e-06, "loss": 0.9899, "step": 7265 }, { "epoch": 1.36, "grad_norm": 2.483465002460721, "learning_rate": 2.348526870733515e-06, "loss": 0.9675, "step": 7270 }, { "epoch": 1.36, "grad_norm": 2.7165707232570138, "learning_rate": 2.3423205499261796e-06, "loss": 1.0041, "step": 7275 }, { "epoch": 1.36, "grad_norm": 2.507174897930294, "learning_rate": 2.336119931137897e-06, "loss": 1.0225, "step": 7280 }, { "epoch": 1.36, "grad_norm": 2.486968006992097, "learning_rate": 2.3299250276720175e-06, "loss": 1.0073, "step": 7285 }, { "epoch": 1.36, "grad_norm": 2.4942272969865287, "learning_rate": 2.323735852819635e-06, "loss": 1.0089, "step": 7290 }, { "epoch": 1.36, "grad_norm": 2.6627456223879515, "learning_rate": 2.31755241985955e-06, "loss": 0.9922, "step": 7295 }, { "epoch": 1.36, "grad_norm": 2.682610387682074, "learning_rate": 2.3113747420582383e-06, "loss": 1.0586, "step": 7300 }, { "epoch": 1.36, "grad_norm": 2.3761170060074472, "learning_rate": 2.3052028326698333e-06, "loss": 0.9964, "step": 7305 }, { "epoch": 1.36, "grad_norm": 2.6110018966383937, "learning_rate": 2.299036704936095e-06, "loss": 1.0244, "step": 7310 }, { "epoch": 1.36, "grad_norm": 2.562086827792303, "learning_rate": 2.292876372086375e-06, "loss": 1.013, "step": 7315 }, { "epoch": 1.37, "grad_norm": 2.6929334887746026, "learning_rate": 2.286721847337591e-06, "loss": 1.0022, "step": 7320 }, { "epoch": 1.37, "grad_norm": 2.9940515986542255, "learning_rate": 2.2805731438942016e-06, "loss": 0.9905, "step": 7325 }, { "epoch": 1.37, "grad_norm": 2.849568114317893, "learning_rate": 2.274430274948179e-06, "loss": 0.9861, "step": 7330 }, { "epoch": 1.37, "grad_norm": 2.67404623335488, "learning_rate": 2.2682932536789693e-06, "loss": 0.9973, "step": 7335 }, { "epoch": 1.37, "grad_norm": 2.5576579250667812, "learning_rate": 2.26216209325348e-06, "loss": 0.9966, "step": 7340 }, { "epoch": 1.37, "grad_norm": 2.5678223637612834, "learning_rate": 2.25603680682604e-06, "loss": 1.0126, "step": 7345 }, { "epoch": 1.37, "grad_norm": 2.24656998780328, "learning_rate": 2.2499174075383762e-06, "loss": 0.9919, "step": 7350 }, { "epoch": 1.37, "grad_norm": 2.5754557845252344, "learning_rate": 2.2438039085195897e-06, "loss": 0.9964, "step": 7355 }, { "epoch": 1.37, "grad_norm": 2.7952425186487737, "learning_rate": 2.237696322886115e-06, "loss": 1.0108, "step": 7360 }, { "epoch": 1.37, "grad_norm": 2.3832345940042585, "learning_rate": 2.2315946637417046e-06, "loss": 0.9369, "step": 7365 }, { "epoch": 1.37, "grad_norm": 2.582206718494254, "learning_rate": 2.225498944177394e-06, "loss": 0.9998, "step": 7370 }, { "epoch": 1.38, "grad_norm": 3.1387040443518015, "learning_rate": 2.2194091772714754e-06, "loss": 0.9992, "step": 7375 }, { "epoch": 1.38, "grad_norm": 2.6314196217784502, "learning_rate": 2.2133253760894697e-06, "loss": 0.9607, "step": 7380 }, { "epoch": 1.38, "grad_norm": 2.8777705905638977, "learning_rate": 2.207247553684099e-06, "loss": 1.0068, "step": 7385 }, { "epoch": 1.38, "grad_norm": 2.7533052839525145, "learning_rate": 2.201175723095258e-06, "loss": 1.0119, "step": 7390 }, { "epoch": 1.38, "grad_norm": 2.6256471402176107, "learning_rate": 2.1951098973499823e-06, "loss": 0.9979, "step": 7395 }, { "epoch": 1.38, "grad_norm": 2.3971531857526824, "learning_rate": 2.189050089462433e-06, "loss": 1.0022, "step": 7400 }, { "epoch": 1.38, "grad_norm": 3.014527777899452, "learning_rate": 2.1829963124338514e-06, "loss": 1.0189, "step": 7405 }, { "epoch": 1.38, "grad_norm": 2.4836659831352024, "learning_rate": 2.1769485792525437e-06, "loss": 0.9942, "step": 7410 }, { "epoch": 1.38, "grad_norm": 2.49850328272449, "learning_rate": 2.170906902893847e-06, "loss": 0.9904, "step": 7415 }, { "epoch": 1.38, "grad_norm": 2.4045192020286676, "learning_rate": 2.164871296320106e-06, "loss": 1.0055, "step": 7420 }, { "epoch": 1.38, "grad_norm": 2.5940798839061148, "learning_rate": 2.15884177248064e-06, "loss": 0.9927, "step": 7425 }, { "epoch": 1.39, "grad_norm": 3.2043613592102087, "learning_rate": 2.152818344311721e-06, "loss": 1.0198, "step": 7430 }, { "epoch": 1.39, "grad_norm": 2.9549208216543086, "learning_rate": 2.1468010247365373e-06, "loss": 1.0524, "step": 7435 }, { "epoch": 1.39, "grad_norm": 2.431983887961489, "learning_rate": 2.1407898266651805e-06, "loss": 1.0261, "step": 7440 }, { "epoch": 1.39, "grad_norm": 2.8943587841934955, "learning_rate": 2.134784762994601e-06, "loss": 1.0147, "step": 7445 }, { "epoch": 1.39, "grad_norm": 2.6469118849255504, "learning_rate": 2.12878584660859e-06, "loss": 1.011, "step": 7450 }, { "epoch": 1.39, "grad_norm": 2.8503986008176567, "learning_rate": 2.1227930903777517e-06, "loss": 0.9992, "step": 7455 }, { "epoch": 1.39, "grad_norm": 2.7444067798255296, "learning_rate": 2.116806507159466e-06, "loss": 1.0106, "step": 7460 }, { "epoch": 1.39, "grad_norm": 2.8278801460868337, "learning_rate": 2.1108261097978812e-06, "loss": 1.007, "step": 7465 }, { "epoch": 1.39, "grad_norm": 2.9725590428126054, "learning_rate": 2.1048519111238647e-06, "loss": 0.9747, "step": 7470 }, { "epoch": 1.39, "grad_norm": 2.9983088444929225, "learning_rate": 2.0988839239549875e-06, "loss": 1.0049, "step": 7475 }, { "epoch": 1.39, "grad_norm": 2.457640569736608, "learning_rate": 2.092922161095492e-06, "loss": 1.0196, "step": 7480 }, { "epoch": 1.4, "grad_norm": 2.916737237944643, "learning_rate": 2.0869666353362724e-06, "loss": 1.0224, "step": 7485 }, { "epoch": 1.4, "grad_norm": 2.749518033440132, "learning_rate": 2.0810173594548356e-06, "loss": 1.0098, "step": 7490 }, { "epoch": 1.4, "grad_norm": 2.454945647019492, "learning_rate": 2.0750743462152782e-06, "loss": 0.9794, "step": 7495 }, { "epoch": 1.4, "grad_norm": 2.5914266247015565, "learning_rate": 2.0691376083682635e-06, "loss": 1.0079, "step": 7500 }, { "epoch": 1.4, "grad_norm": 2.6360537632556476, "learning_rate": 2.063207158650994e-06, "loss": 1.0306, "step": 7505 }, { "epoch": 1.4, "grad_norm": 2.705751594055718, "learning_rate": 2.0572830097871747e-06, "loss": 0.9879, "step": 7510 }, { "epoch": 1.4, "grad_norm": 2.417410823436373, "learning_rate": 2.0513651744869966e-06, "loss": 1.0156, "step": 7515 }, { "epoch": 1.4, "grad_norm": 2.58329594764982, "learning_rate": 2.0454536654471035e-06, "loss": 0.9679, "step": 7520 }, { "epoch": 1.4, "grad_norm": 2.5375397018150254, "learning_rate": 2.039548495350564e-06, "loss": 0.998, "step": 7525 }, { "epoch": 1.4, "grad_norm": 2.5457670422000462, "learning_rate": 2.0336496768668545e-06, "loss": 1.0162, "step": 7530 }, { "epoch": 1.41, "grad_norm": 2.5385173317223124, "learning_rate": 2.0277572226518134e-06, "loss": 0.9504, "step": 7535 }, { "epoch": 1.41, "grad_norm": 2.6122071384819647, "learning_rate": 2.021871145347632e-06, "loss": 1.0261, "step": 7540 }, { "epoch": 1.41, "grad_norm": 2.4559143987913004, "learning_rate": 2.015991457582815e-06, "loss": 1.0295, "step": 7545 }, { "epoch": 1.41, "grad_norm": 2.4131900706265688, "learning_rate": 2.010118171972165e-06, "loss": 1.0106, "step": 7550 }, { "epoch": 1.41, "grad_norm": 2.5830984989956187, "learning_rate": 2.0042513011167447e-06, "loss": 1.018, "step": 7555 }, { "epoch": 1.41, "grad_norm": 2.5141342688412736, "learning_rate": 1.998390857603853e-06, "loss": 0.9796, "step": 7560 }, { "epoch": 1.41, "grad_norm": 2.825530580323147, "learning_rate": 1.9925368540070008e-06, "loss": 1.0012, "step": 7565 }, { "epoch": 1.41, "grad_norm": 2.570499501431887, "learning_rate": 1.986689302885882e-06, "loss": 0.9962, "step": 7570 }, { "epoch": 1.41, "grad_norm": 2.5569510782263105, "learning_rate": 1.980848216786346e-06, "loss": 1.0039, "step": 7575 }, { "epoch": 1.41, "grad_norm": 2.4037111815985623, "learning_rate": 1.9750136082403735e-06, "loss": 1.0027, "step": 7580 }, { "epoch": 1.41, "grad_norm": 2.606403450673804, "learning_rate": 1.9691854897660455e-06, "loss": 1.0181, "step": 7585 }, { "epoch": 1.42, "grad_norm": 2.6149611442247553, "learning_rate": 1.9633638738675183e-06, "loss": 0.9894, "step": 7590 }, { "epoch": 1.42, "grad_norm": 2.731625908822058, "learning_rate": 1.957548773035001e-06, "loss": 0.9966, "step": 7595 }, { "epoch": 1.42, "grad_norm": 2.6720427892224676, "learning_rate": 1.9517401997447217e-06, "loss": 0.9995, "step": 7600 }, { "epoch": 1.42, "grad_norm": 2.2262976409569863, "learning_rate": 1.945938166458902e-06, "loss": 0.9857, "step": 7605 }, { "epoch": 1.42, "grad_norm": 2.5234422281434123, "learning_rate": 1.940142685625736e-06, "loss": 0.9802, "step": 7610 }, { "epoch": 1.42, "grad_norm": 2.904221387786334, "learning_rate": 1.9343537696793555e-06, "loss": 1.0409, "step": 7615 }, { "epoch": 1.42, "grad_norm": 2.4909926104739384, "learning_rate": 1.9285714310398117e-06, "loss": 0.9962, "step": 7620 }, { "epoch": 1.42, "grad_norm": 2.383606867119876, "learning_rate": 1.9227956821130406e-06, "loss": 1.0326, "step": 7625 }, { "epoch": 1.42, "grad_norm": 2.8933537644372262, "learning_rate": 1.9170265352908426e-06, "loss": 0.9889, "step": 7630 }, { "epoch": 1.42, "grad_norm": 2.5338221111488317, "learning_rate": 1.911264002950849e-06, "loss": 0.9956, "step": 7635 }, { "epoch": 1.42, "grad_norm": 3.1011258166395628, "learning_rate": 1.9055080974565076e-06, "loss": 1.0198, "step": 7640 }, { "epoch": 1.43, "grad_norm": 2.7342133503101147, "learning_rate": 1.8997588311570425e-06, "loss": 0.9802, "step": 7645 }, { "epoch": 1.43, "grad_norm": 2.8091842542893977, "learning_rate": 1.894016216387436e-06, "loss": 0.9592, "step": 7650 }, { "epoch": 1.43, "grad_norm": 3.407053889176312, "learning_rate": 1.888280265468393e-06, "loss": 1.0095, "step": 7655 }, { "epoch": 1.43, "grad_norm": 2.693200133443283, "learning_rate": 1.8825509907063328e-06, "loss": 1.0003, "step": 7660 }, { "epoch": 1.43, "grad_norm": 2.246216665996247, "learning_rate": 1.8768284043933427e-06, "loss": 0.9846, "step": 7665 }, { "epoch": 1.43, "grad_norm": 2.346758280901885, "learning_rate": 1.8711125188071621e-06, "loss": 1.0293, "step": 7670 }, { "epoch": 1.43, "grad_norm": 2.787357287294682, "learning_rate": 1.865403346211153e-06, "loss": 1.0274, "step": 7675 }, { "epoch": 1.43, "grad_norm": 3.7658119105298917, "learning_rate": 1.8597008988542791e-06, "loss": 0.9855, "step": 7680 }, { "epoch": 1.43, "grad_norm": 2.5022564517891235, "learning_rate": 1.854005188971071e-06, "loss": 1.0077, "step": 7685 }, { "epoch": 1.43, "grad_norm": 2.934534992469462, "learning_rate": 1.848316228781606e-06, "loss": 1.0042, "step": 7690 }, { "epoch": 1.44, "grad_norm": 2.8338775537793035, "learning_rate": 1.8426340304914764e-06, "loss": 0.9496, "step": 7695 }, { "epoch": 1.44, "grad_norm": 2.4873584490481133, "learning_rate": 1.8369586062917693e-06, "loss": 1.0171, "step": 7700 }, { "epoch": 1.44, "grad_norm": 3.0088916973384765, "learning_rate": 1.8312899683590424e-06, "loss": 0.9972, "step": 7705 }, { "epoch": 1.44, "grad_norm": 2.656464767123038, "learning_rate": 1.8256281288552885e-06, "loss": 1.0099, "step": 7710 }, { "epoch": 1.44, "grad_norm": 3.0511739775979114, "learning_rate": 1.8199730999279147e-06, "loss": 1.0329, "step": 7715 }, { "epoch": 1.44, "grad_norm": 2.586345592514905, "learning_rate": 1.8143248937097153e-06, "loss": 1.0266, "step": 7720 }, { "epoch": 1.44, "grad_norm": 2.5776112571536216, "learning_rate": 1.8086835223188542e-06, "loss": 1.0502, "step": 7725 }, { "epoch": 1.44, "grad_norm": 2.5687232321997833, "learning_rate": 1.8030489978588195e-06, "loss": 1.0314, "step": 7730 }, { "epoch": 1.44, "grad_norm": 2.589975182340955, "learning_rate": 1.7974213324184176e-06, "loss": 1.0744, "step": 7735 }, { "epoch": 1.44, "grad_norm": 2.4216590891248093, "learning_rate": 1.7918005380717341e-06, "loss": 1.0085, "step": 7740 }, { "epoch": 1.44, "grad_norm": 2.6874915737338756, "learning_rate": 1.7861866268781186e-06, "loss": 0.9891, "step": 7745 }, { "epoch": 1.45, "grad_norm": 2.730115388264179, "learning_rate": 1.7805796108821483e-06, "loss": 1.0821, "step": 7750 }, { "epoch": 1.45, "grad_norm": 2.58907575983276, "learning_rate": 1.7749795021136074e-06, "loss": 1.0293, "step": 7755 }, { "epoch": 1.45, "grad_norm": 2.400032501236323, "learning_rate": 1.769386312587462e-06, "loss": 0.9911, "step": 7760 }, { "epoch": 1.45, "grad_norm": 2.5616088617013593, "learning_rate": 1.7638000543038326e-06, "loss": 1.022, "step": 7765 }, { "epoch": 1.45, "grad_norm": 3.011699779238265, "learning_rate": 1.7582207392479684e-06, "loss": 0.9719, "step": 7770 }, { "epoch": 1.45, "grad_norm": 2.7120556534956206, "learning_rate": 1.7526483793902215e-06, "loss": 1.0045, "step": 7775 }, { "epoch": 1.45, "grad_norm": 2.41683174537852, "learning_rate": 1.747082986686024e-06, "loss": 1.0072, "step": 7780 }, { "epoch": 1.45, "grad_norm": 2.32504667984806, "learning_rate": 1.741524573075855e-06, "loss": 0.9902, "step": 7785 }, { "epoch": 1.45, "grad_norm": 2.6375990189427148, "learning_rate": 1.7359731504852283e-06, "loss": 0.9992, "step": 7790 }, { "epoch": 1.45, "grad_norm": 2.404224852309051, "learning_rate": 1.7304287308246514e-06, "loss": 0.9802, "step": 7795 }, { "epoch": 1.45, "grad_norm": 2.7029433370239087, "learning_rate": 1.72489132598961e-06, "loss": 1.0207, "step": 7800 }, { "epoch": 1.46, "grad_norm": 2.3249820631357827, "learning_rate": 1.7193609478605388e-06, "loss": 1.0503, "step": 7805 }, { "epoch": 1.46, "grad_norm": 2.634946032303947, "learning_rate": 1.713837608302797e-06, "loss": 1.0101, "step": 7810 }, { "epoch": 1.46, "grad_norm": 2.3929583667727585, "learning_rate": 1.7083213191666425e-06, "loss": 0.9947, "step": 7815 }, { "epoch": 1.46, "grad_norm": 2.71027976931746, "learning_rate": 1.7028120922872071e-06, "loss": 1.017, "step": 7820 }, { "epoch": 1.46, "grad_norm": 2.668532399284522, "learning_rate": 1.69730993948447e-06, "loss": 1.0405, "step": 7825 }, { "epoch": 1.46, "grad_norm": 2.6708797038580636, "learning_rate": 1.6918148725632322e-06, "loss": 0.9941, "step": 7830 }, { "epoch": 1.46, "grad_norm": 2.5676328647086586, "learning_rate": 1.6863269033130963e-06, "loss": 1.0316, "step": 7835 }, { "epoch": 1.46, "grad_norm": 2.9899945946380626, "learning_rate": 1.6808460435084316e-06, "loss": 0.9909, "step": 7840 }, { "epoch": 1.46, "grad_norm": 3.054882339496845, "learning_rate": 1.6753723049083575e-06, "loss": 1.0078, "step": 7845 }, { "epoch": 1.46, "grad_norm": 2.609112302401605, "learning_rate": 1.6699056992567132e-06, "loss": 0.989, "step": 7850 }, { "epoch": 1.46, "grad_norm": 2.430195809966236, "learning_rate": 1.6644462382820348e-06, "loss": 1.0013, "step": 7855 }, { "epoch": 1.47, "grad_norm": 3.2390129601989823, "learning_rate": 1.6589939336975298e-06, "loss": 0.9794, "step": 7860 }, { "epoch": 1.47, "grad_norm": 2.4217885170245452, "learning_rate": 1.6535487972010511e-06, "loss": 0.9815, "step": 7865 }, { "epoch": 1.47, "grad_norm": 2.929203958696014, "learning_rate": 1.648110840475074e-06, "loss": 0.9624, "step": 7870 }, { "epoch": 1.47, "grad_norm": 2.5488052903322385, "learning_rate": 1.6426800751866662e-06, "loss": 1.0104, "step": 7875 }, { "epoch": 1.47, "grad_norm": 2.8700612666358607, "learning_rate": 1.637256512987473e-06, "loss": 1.0434, "step": 7880 }, { "epoch": 1.47, "grad_norm": 2.8596393228875967, "learning_rate": 1.6318401655136807e-06, "loss": 0.9891, "step": 7885 }, { "epoch": 1.47, "grad_norm": 3.0486393042996323, "learning_rate": 1.6264310443859937e-06, "loss": 1.0076, "step": 7890 }, { "epoch": 1.47, "grad_norm": 2.3273538904602655, "learning_rate": 1.6210291612096168e-06, "loss": 0.9968, "step": 7895 }, { "epoch": 1.47, "grad_norm": 2.914791726331403, "learning_rate": 1.6156345275742274e-06, "loss": 1.0513, "step": 7900 }, { "epoch": 1.47, "grad_norm": 3.1399450509554034, "learning_rate": 1.610247155053945e-06, "loss": 1.0086, "step": 7905 }, { "epoch": 1.48, "grad_norm": 2.7207488493952297, "learning_rate": 1.6048670552073125e-06, "loss": 0.9984, "step": 7910 }, { "epoch": 1.48, "grad_norm": 2.6048326574991654, "learning_rate": 1.5994942395772668e-06, "loss": 1.0135, "step": 7915 }, { "epoch": 1.48, "grad_norm": 2.6367561560889774, "learning_rate": 1.5941287196911236e-06, "loss": 0.9907, "step": 7920 }, { "epoch": 1.48, "grad_norm": 2.4769120488306227, "learning_rate": 1.5887705070605364e-06, "loss": 1.0166, "step": 7925 }, { "epoch": 1.48, "grad_norm": 2.5189491566743256, "learning_rate": 1.5834196131814872e-06, "loss": 1.0052, "step": 7930 }, { "epoch": 1.48, "grad_norm": 2.4493928951618664, "learning_rate": 1.578076049534253e-06, "loss": 1.0553, "step": 7935 }, { "epoch": 1.48, "grad_norm": 2.5351068015113625, "learning_rate": 1.572739827583384e-06, "loss": 1.0467, "step": 7940 }, { "epoch": 1.48, "grad_norm": 2.655233690562782, "learning_rate": 1.5674109587776825e-06, "loss": 0.9715, "step": 7945 }, { "epoch": 1.48, "grad_norm": 2.5534781581480117, "learning_rate": 1.5620894545501709e-06, "loss": 0.9374, "step": 7950 }, { "epoch": 1.48, "grad_norm": 2.7528561246346124, "learning_rate": 1.5567753263180712e-06, "loss": 0.9979, "step": 7955 }, { "epoch": 1.48, "grad_norm": 3.0595711702661093, "learning_rate": 1.551468585482782e-06, "loss": 0.9997, "step": 7960 }, { "epoch": 1.49, "grad_norm": 2.6100795976750435, "learning_rate": 1.5461692434298515e-06, "loss": 1.0002, "step": 7965 }, { "epoch": 1.49, "grad_norm": 2.873387156051466, "learning_rate": 1.5408773115289527e-06, "loss": 1.0216, "step": 7970 }, { "epoch": 1.49, "grad_norm": 2.6690217461993444, "learning_rate": 1.5355928011338628e-06, "loss": 0.9939, "step": 7975 }, { "epoch": 1.49, "grad_norm": 2.506394501418365, "learning_rate": 1.5303157235824323e-06, "loss": 1.0374, "step": 7980 }, { "epoch": 1.49, "grad_norm": 2.26834419318478, "learning_rate": 1.5250460901965702e-06, "loss": 0.9837, "step": 7985 }, { "epoch": 1.49, "grad_norm": 2.626617307266259, "learning_rate": 1.51978391228221e-06, "loss": 1.0259, "step": 7990 }, { "epoch": 1.49, "grad_norm": 2.514926149191691, "learning_rate": 1.5145292011292905e-06, "loss": 0.9788, "step": 7995 }, { "epoch": 1.49, "grad_norm": 2.566211120378057, "learning_rate": 1.509281968011731e-06, "loss": 0.9221, "step": 8000 }, { "epoch": 1.49, "grad_norm": 3.090252271917728, "learning_rate": 1.5040422241874058e-06, "loss": 1.027, "step": 8005 }, { "epoch": 1.49, "grad_norm": 2.7794131086955, "learning_rate": 1.4988099808981227e-06, "loss": 0.9859, "step": 8010 }, { "epoch": 1.49, "grad_norm": 2.4233202674174334, "learning_rate": 1.4935852493695962e-06, "loss": 1.0112, "step": 8015 }, { "epoch": 1.5, "grad_norm": 2.6253735306012476, "learning_rate": 1.4883680408114238e-06, "loss": 0.9951, "step": 8020 }, { "epoch": 1.5, "grad_norm": 2.7054085641131747, "learning_rate": 1.4831583664170619e-06, "loss": 0.9924, "step": 8025 }, { "epoch": 1.5, "grad_norm": 2.66331786719659, "learning_rate": 1.4779562373638073e-06, "loss": 1.0151, "step": 8030 }, { "epoch": 1.5, "grad_norm": 2.4687092490748976, "learning_rate": 1.472761664812763e-06, "loss": 1.0439, "step": 8035 }, { "epoch": 1.5, "grad_norm": 2.2381123844793023, "learning_rate": 1.4675746599088208e-06, "loss": 0.992, "step": 8040 }, { "epoch": 1.5, "grad_norm": 2.7820004676047483, "learning_rate": 1.4623952337806397e-06, "loss": 0.9766, "step": 8045 }, { "epoch": 1.5, "grad_norm": 2.843933021496936, "learning_rate": 1.4572233975406102e-06, "loss": 0.9946, "step": 8050 }, { "epoch": 1.5, "grad_norm": 2.847159497897028, "learning_rate": 1.4520591622848495e-06, "loss": 1.01, "step": 8055 }, { "epoch": 1.5, "grad_norm": 2.5912260687609376, "learning_rate": 1.4469025390931595e-06, "loss": 1.0257, "step": 8060 }, { "epoch": 1.5, "grad_norm": 2.499004311563004, "learning_rate": 1.4417535390290139e-06, "loss": 1.0954, "step": 8065 }, { "epoch": 1.5, "grad_norm": 2.6280528719952874, "learning_rate": 1.4366121731395266e-06, "loss": 1.0619, "step": 8070 }, { "epoch": 1.51, "grad_norm": 2.5818235310207545, "learning_rate": 1.4314784524554405e-06, "loss": 1.0654, "step": 8075 }, { "epoch": 1.51, "grad_norm": 2.857141504036705, "learning_rate": 1.4263523879910902e-06, "loss": 0.9869, "step": 8080 }, { "epoch": 1.51, "grad_norm": 2.4989029878764413, "learning_rate": 1.4212339907443822e-06, "loss": 1.0482, "step": 8085 }, { "epoch": 1.51, "grad_norm": 2.6923172446783954, "learning_rate": 1.416123271696776e-06, "loss": 0.9947, "step": 8090 }, { "epoch": 1.51, "grad_norm": 2.5223697256777005, "learning_rate": 1.4110202418132596e-06, "loss": 1.0431, "step": 8095 }, { "epoch": 1.51, "grad_norm": 2.5206572271590697, "learning_rate": 1.4059249120423213e-06, "loss": 1.0203, "step": 8100 }, { "epoch": 1.51, "grad_norm": 2.274549996984812, "learning_rate": 1.4008372933159287e-06, "loss": 0.9754, "step": 8105 }, { "epoch": 1.51, "grad_norm": 2.463111888028792, "learning_rate": 1.3957573965495064e-06, "loss": 1.026, "step": 8110 }, { "epoch": 1.51, "grad_norm": 2.3299161035359477, "learning_rate": 1.3906852326419096e-06, "loss": 1.0099, "step": 8115 }, { "epoch": 1.51, "grad_norm": 2.3383819268062713, "learning_rate": 1.385620812475409e-06, "loss": 0.9251, "step": 8120 }, { "epoch": 1.52, "grad_norm": 2.780420753115608, "learning_rate": 1.3805641469156523e-06, "loss": 0.9935, "step": 8125 }, { "epoch": 1.52, "grad_norm": 2.5750186026284276, "learning_rate": 1.3755152468116562e-06, "loss": 1.0058, "step": 8130 }, { "epoch": 1.52, "grad_norm": 2.935898032597156, "learning_rate": 1.3704741229957723e-06, "loss": 0.9991, "step": 8135 }, { "epoch": 1.52, "grad_norm": 2.5933993665579242, "learning_rate": 1.365440786283674e-06, "loss": 1.0002, "step": 8140 }, { "epoch": 1.52, "grad_norm": 2.7639651828531577, "learning_rate": 1.3604152474743233e-06, "loss": 0.9922, "step": 8145 }, { "epoch": 1.52, "grad_norm": 3.2441453180376048, "learning_rate": 1.3553975173499516e-06, "loss": 0.9989, "step": 8150 }, { "epoch": 1.52, "grad_norm": 2.4970579337537373, "learning_rate": 1.350387606676039e-06, "loss": 1.0367, "step": 8155 }, { "epoch": 1.52, "grad_norm": 2.5016675644006856, "learning_rate": 1.3453855262012877e-06, "loss": 0.9925, "step": 8160 }, { "epoch": 1.52, "grad_norm": 2.712422105166325, "learning_rate": 1.3403912866576013e-06, "loss": 0.9872, "step": 8165 }, { "epoch": 1.52, "grad_norm": 2.6277080069881475, "learning_rate": 1.3354048987600597e-06, "loss": 0.9896, "step": 8170 }, { "epoch": 1.52, "grad_norm": 2.2742594923450294, "learning_rate": 1.3304263732068978e-06, "loss": 1.0459, "step": 8175 }, { "epoch": 1.53, "grad_norm": 3.198036891220997, "learning_rate": 1.3254557206794805e-06, "loss": 1.0019, "step": 8180 }, { "epoch": 1.53, "grad_norm": 2.8910090926748206, "learning_rate": 1.3204929518422854e-06, "loss": 1.0001, "step": 8185 }, { "epoch": 1.53, "grad_norm": 2.588419180976511, "learning_rate": 1.3155380773428716e-06, "loss": 1.0491, "step": 8190 }, { "epoch": 1.53, "grad_norm": 2.9831121608734863, "learning_rate": 1.3105911078118623e-06, "loss": 1.0183, "step": 8195 }, { "epoch": 1.53, "grad_norm": 2.7492226038465577, "learning_rate": 1.3056520538629202e-06, "loss": 0.9878, "step": 8200 }, { "epoch": 1.53, "grad_norm": 2.3860445073262215, "learning_rate": 1.3007209260927262e-06, "loss": 0.9998, "step": 8205 }, { "epoch": 1.53, "grad_norm": 2.7245221333282394, "learning_rate": 1.2957977350809548e-06, "loss": 1.0239, "step": 8210 }, { "epoch": 1.53, "grad_norm": 2.436510051916248, "learning_rate": 1.290882491390253e-06, "loss": 1.0715, "step": 8215 }, { "epoch": 1.53, "grad_norm": 2.278573195904812, "learning_rate": 1.2859752055662144e-06, "loss": 0.9442, "step": 8220 }, { "epoch": 1.53, "grad_norm": 2.4519680767900773, "learning_rate": 1.2810758881373653e-06, "loss": 0.9701, "step": 8225 }, { "epoch": 1.53, "grad_norm": 2.8358651175843423, "learning_rate": 1.2761845496151299e-06, "loss": 0.9333, "step": 8230 }, { "epoch": 1.54, "grad_norm": 2.5083663458407113, "learning_rate": 1.2713012004938157e-06, "loss": 1.0233, "step": 8235 }, { "epoch": 1.54, "grad_norm": 2.33214694353536, "learning_rate": 1.26642585125059e-06, "loss": 0.9518, "step": 8240 }, { "epoch": 1.54, "grad_norm": 2.833861135741682, "learning_rate": 1.2615585123454522e-06, "loss": 0.9277, "step": 8245 }, { "epoch": 1.54, "grad_norm": 2.67836542381953, "learning_rate": 1.2566991942212226e-06, "loss": 1.008, "step": 8250 }, { "epoch": 1.54, "grad_norm": 2.3412731570156002, "learning_rate": 1.2518479073035078e-06, "loss": 0.9933, "step": 8255 }, { "epoch": 1.54, "grad_norm": 2.637129686981733, "learning_rate": 1.247004662000686e-06, "loss": 0.9752, "step": 8260 }, { "epoch": 1.54, "grad_norm": 2.503883482079726, "learning_rate": 1.2421694687038782e-06, "loss": 0.9784, "step": 8265 }, { "epoch": 1.54, "grad_norm": 3.93407055926674, "learning_rate": 1.2373423377869376e-06, "loss": 1.036, "step": 8270 }, { "epoch": 1.54, "grad_norm": 2.454241790604054, "learning_rate": 1.232523279606413e-06, "loss": 1.0169, "step": 8275 }, { "epoch": 1.54, "grad_norm": 2.53575212107835, "learning_rate": 1.227712304501536e-06, "loss": 0.9813, "step": 8280 }, { "epoch": 1.55, "grad_norm": 2.3951270616987146, "learning_rate": 1.222909422794194e-06, "loss": 1.0607, "step": 8285 }, { "epoch": 1.55, "grad_norm": 2.49607239228924, "learning_rate": 1.2181146447889102e-06, "loss": 0.9752, "step": 8290 }, { "epoch": 1.55, "grad_norm": 2.942783317969311, "learning_rate": 1.213327980772826e-06, "loss": 0.9916, "step": 8295 }, { "epoch": 1.55, "grad_norm": 2.655950651887915, "learning_rate": 1.2085494410156702e-06, "loss": 0.983, "step": 8300 }, { "epoch": 1.55, "grad_norm": 2.254472695034608, "learning_rate": 1.203779035769741e-06, "loss": 1.0008, "step": 8305 }, { "epoch": 1.55, "grad_norm": 2.5598190175983193, "learning_rate": 1.1990167752698844e-06, "loss": 0.9649, "step": 8310 }, { "epoch": 1.55, "grad_norm": 2.8832350083558667, "learning_rate": 1.1942626697334765e-06, "loss": 0.978, "step": 8315 }, { "epoch": 1.55, "grad_norm": 2.2769055391490576, "learning_rate": 1.1895167293603882e-06, "loss": 1.01, "step": 8320 }, { "epoch": 1.55, "grad_norm": 2.868510340527335, "learning_rate": 1.1847789643329799e-06, "loss": 0.9954, "step": 8325 }, { "epoch": 1.55, "grad_norm": 2.975783073583449, "learning_rate": 1.1800493848160666e-06, "loss": 0.9835, "step": 8330 }, { "epoch": 1.55, "grad_norm": 2.839117035143341, "learning_rate": 1.1753280009569068e-06, "loss": 0.9374, "step": 8335 }, { "epoch": 1.56, "grad_norm": 2.9242643444387855, "learning_rate": 1.170614822885171e-06, "loss": 1.0051, "step": 8340 }, { "epoch": 1.56, "grad_norm": 2.4750110312055495, "learning_rate": 1.165909860712926e-06, "loss": 0.9987, "step": 8345 }, { "epoch": 1.56, "grad_norm": 2.8365937492878324, "learning_rate": 1.161213124534611e-06, "loss": 1.0153, "step": 8350 }, { "epoch": 1.56, "grad_norm": 2.8947472669930394, "learning_rate": 1.1565246244270162e-06, "loss": 0.974, "step": 8355 }, { "epoch": 1.56, "grad_norm": 2.7331996001322367, "learning_rate": 1.1518443704492616e-06, "loss": 0.9935, "step": 8360 }, { "epoch": 1.56, "grad_norm": 2.630249509479508, "learning_rate": 1.1471723726427764e-06, "loss": 1.0287, "step": 8365 }, { "epoch": 1.56, "grad_norm": 2.591959586184885, "learning_rate": 1.1425086410312741e-06, "loss": 1.0038, "step": 8370 }, { "epoch": 1.56, "grad_norm": 2.6408943576305535, "learning_rate": 1.1378531856207337e-06, "loss": 0.9827, "step": 8375 }, { "epoch": 1.56, "grad_norm": 2.55792605523133, "learning_rate": 1.1332060163993807e-06, "loss": 0.9853, "step": 8380 }, { "epoch": 1.56, "grad_norm": 3.0398087791861568, "learning_rate": 1.1285671433376593e-06, "loss": 0.9964, "step": 8385 }, { "epoch": 1.56, "grad_norm": 2.6670733962637074, "learning_rate": 1.1239365763882154e-06, "loss": 0.9925, "step": 8390 }, { "epoch": 1.57, "grad_norm": 3.4420907762244157, "learning_rate": 1.119314325485874e-06, "loss": 1.0508, "step": 8395 }, { "epoch": 1.57, "grad_norm": 2.4398172452504143, "learning_rate": 1.1147004005476192e-06, "loss": 0.9965, "step": 8400 }, { "epoch": 1.57, "grad_norm": 2.837429151462753, "learning_rate": 1.1100948114725695e-06, "loss": 1.022, "step": 8405 }, { "epoch": 1.57, "grad_norm": 2.50970813890442, "learning_rate": 1.1054975681419622e-06, "loss": 0.9722, "step": 8410 }, { "epoch": 1.57, "grad_norm": 2.6008952386375968, "learning_rate": 1.100908680419126e-06, "loss": 1.0042, "step": 8415 }, { "epoch": 1.57, "grad_norm": 2.397601424695823, "learning_rate": 1.0963281581494628e-06, "loss": 1.009, "step": 8420 }, { "epoch": 1.57, "grad_norm": 2.341096455070129, "learning_rate": 1.0917560111604302e-06, "loss": 0.9997, "step": 8425 }, { "epoch": 1.57, "grad_norm": 2.72165027387936, "learning_rate": 1.0871922492615133e-06, "loss": 1.0503, "step": 8430 }, { "epoch": 1.57, "grad_norm": 2.3765933075053876, "learning_rate": 1.0826368822442078e-06, "loss": 0.9567, "step": 8435 }, { "epoch": 1.57, "grad_norm": 2.7711690893945997, "learning_rate": 1.0780899198819983e-06, "loss": 1.0002, "step": 8440 }, { "epoch": 1.57, "grad_norm": 2.5883255453415086, "learning_rate": 1.073551371930337e-06, "loss": 1.0195, "step": 8445 }, { "epoch": 1.58, "grad_norm": 2.984516847385917, "learning_rate": 1.0690212481266243e-06, "loss": 0.9722, "step": 8450 }, { "epoch": 1.58, "grad_norm": 2.4493087022991276, "learning_rate": 1.0644995581901856e-06, "loss": 0.9661, "step": 8455 }, { "epoch": 1.58, "grad_norm": 2.4118000504228387, "learning_rate": 1.0599863118222503e-06, "loss": 1.0067, "step": 8460 }, { "epoch": 1.58, "grad_norm": 2.776031639098257, "learning_rate": 1.055481518705937e-06, "loss": 0.9601, "step": 8465 }, { "epoch": 1.58, "grad_norm": 2.8333438440892866, "learning_rate": 1.0509851885062227e-06, "loss": 1.0002, "step": 8470 }, { "epoch": 1.58, "grad_norm": 2.6776557473356597, "learning_rate": 1.0464973308699311e-06, "loss": 0.9865, "step": 8475 }, { "epoch": 1.58, "grad_norm": 2.784349412463351, "learning_rate": 1.0420179554257037e-06, "loss": 1.0139, "step": 8480 }, { "epoch": 1.58, "grad_norm": 2.862349613553008, "learning_rate": 1.0375470717839864e-06, "loss": 0.9772, "step": 8485 }, { "epoch": 1.58, "grad_norm": 2.6005297452393377, "learning_rate": 1.0330846895370078e-06, "loss": 0.9669, "step": 8490 }, { "epoch": 1.58, "grad_norm": 2.2580387526234844, "learning_rate": 1.0286308182587546e-06, "loss": 1.0336, "step": 8495 }, { "epoch": 1.59, "grad_norm": 2.715174129691373, "learning_rate": 1.0241854675049534e-06, "loss": 1.0184, "step": 8500 }, { "epoch": 1.59, "grad_norm": 2.2875458045237487, "learning_rate": 1.0197486468130485e-06, "loss": 0.9621, "step": 8505 }, { "epoch": 1.59, "grad_norm": 3.1661384763089004, "learning_rate": 1.0153203657021894e-06, "loss": 1.0024, "step": 8510 }, { "epoch": 1.59, "grad_norm": 2.3304188336777094, "learning_rate": 1.0109006336731947e-06, "loss": 1.0389, "step": 8515 }, { "epoch": 1.59, "grad_norm": 2.525969437894621, "learning_rate": 1.006489460208549e-06, "loss": 0.9968, "step": 8520 }, { "epoch": 1.59, "grad_norm": 2.387820313938779, "learning_rate": 1.0020868547723694e-06, "loss": 0.9955, "step": 8525 }, { "epoch": 1.59, "grad_norm": 2.9191333287504664, "learning_rate": 9.97692826810393e-07, "loss": 0.9721, "step": 8530 }, { "epoch": 1.59, "grad_norm": 2.8134674262578456, "learning_rate": 9.933073857499554e-07, "loss": 0.953, "step": 8535 }, { "epoch": 1.59, "grad_norm": 2.716709444476881, "learning_rate": 9.889305409999656e-07, "loss": 1.0351, "step": 8540 }, { "epoch": 1.59, "grad_norm": 2.5286335152563497, "learning_rate": 9.845623019508915e-07, "loss": 0.959, "step": 8545 }, { "epoch": 1.59, "grad_norm": 2.870530482870655, "learning_rate": 9.80202677974737e-07, "loss": 1.01, "step": 8550 }, { "epoch": 1.6, "grad_norm": 2.6542379957905853, "learning_rate": 9.758516784250222e-07, "loss": 0.9648, "step": 8555 }, { "epoch": 1.6, "grad_norm": 2.4225005154609076, "learning_rate": 9.715093126367643e-07, "loss": 0.9523, "step": 8560 }, { "epoch": 1.6, "grad_norm": 2.488467696082567, "learning_rate": 9.671755899264562e-07, "loss": 0.9919, "step": 8565 }, { "epoch": 1.6, "grad_norm": 2.732784552303335, "learning_rate": 9.628505195920463e-07, "loss": 0.9653, "step": 8570 }, { "epoch": 1.6, "grad_norm": 2.58854203634735, "learning_rate": 9.585341109129227e-07, "loss": 0.9737, "step": 8575 }, { "epoch": 1.6, "grad_norm": 3.0065804562154033, "learning_rate": 9.542263731498875e-07, "loss": 1.0292, "step": 8580 }, { "epoch": 1.6, "grad_norm": 3.1293104736581174, "learning_rate": 9.499273155451388e-07, "loss": 0.9769, "step": 8585 }, { "epoch": 1.6, "grad_norm": 2.871745660395818, "learning_rate": 9.456369473222538e-07, "loss": 0.9853, "step": 8590 }, { "epoch": 1.6, "grad_norm": 2.588651633798483, "learning_rate": 9.413552776861645e-07, "loss": 0.9886, "step": 8595 }, { "epoch": 1.6, "grad_norm": 2.821289962013644, "learning_rate": 9.370823158231418e-07, "loss": 1.0294, "step": 8600 }, { "epoch": 1.6, "grad_norm": 2.6198307828799154, "learning_rate": 9.328180709007734e-07, "loss": 0.9639, "step": 8605 }, { "epoch": 1.61, "grad_norm": 2.589055089856369, "learning_rate": 9.285625520679447e-07, "loss": 1.0009, "step": 8610 }, { "epoch": 1.61, "grad_norm": 2.6817091221550564, "learning_rate": 9.243157684548199e-07, "loss": 0.9768, "step": 8615 }, { "epoch": 1.61, "grad_norm": 2.446688278010068, "learning_rate": 9.200777291728224e-07, "loss": 1.0152, "step": 8620 }, { "epoch": 1.61, "grad_norm": 2.6819493855737067, "learning_rate": 9.158484433146142e-07, "loss": 0.9845, "step": 8625 }, { "epoch": 1.61, "grad_norm": 2.4531681967551897, "learning_rate": 9.116279199540768e-07, "loss": 1.0151, "step": 8630 }, { "epoch": 1.61, "grad_norm": 2.518328043463977, "learning_rate": 9.074161681462906e-07, "loss": 0.9412, "step": 8635 }, { "epoch": 1.61, "grad_norm": 2.6161600853086715, "learning_rate": 9.032131969275198e-07, "loss": 0.9948, "step": 8640 }, { "epoch": 1.61, "grad_norm": 2.622763413959486, "learning_rate": 8.990190153151874e-07, "loss": 0.9626, "step": 8645 }, { "epoch": 1.61, "grad_norm": 2.574191956837669, "learning_rate": 8.948336323078593e-07, "loss": 1.0239, "step": 8650 }, { "epoch": 1.61, "grad_norm": 2.72808187078771, "learning_rate": 8.90657056885224e-07, "loss": 0.9803, "step": 8655 }, { "epoch": 1.61, "grad_norm": 2.305170921371496, "learning_rate": 8.864892980080719e-07, "loss": 0.9985, "step": 8660 }, { "epoch": 1.62, "grad_norm": 2.5223234611611147, "learning_rate": 8.823303646182823e-07, "loss": 0.982, "step": 8665 }, { "epoch": 1.62, "grad_norm": 2.6678684960116246, "learning_rate": 8.781802656387955e-07, "loss": 0.9834, "step": 8670 }, { "epoch": 1.62, "grad_norm": 2.443915108308519, "learning_rate": 8.740390099735973e-07, "loss": 0.9941, "step": 8675 }, { "epoch": 1.62, "grad_norm": 3.0455073502536436, "learning_rate": 8.699066065077005e-07, "loss": 0.9633, "step": 8680 }, { "epoch": 1.62, "grad_norm": 2.8874510477383653, "learning_rate": 8.657830641071297e-07, "loss": 0.9885, "step": 8685 }, { "epoch": 1.62, "grad_norm": 2.501539958965497, "learning_rate": 8.616683916188939e-07, "loss": 1.027, "step": 8690 }, { "epoch": 1.62, "grad_norm": 2.739630521686995, "learning_rate": 8.575625978709723e-07, "loss": 1.0207, "step": 8695 }, { "epoch": 1.62, "grad_norm": 2.800804703467251, "learning_rate": 8.534656916722966e-07, "loss": 1.0162, "step": 8700 }, { "epoch": 1.62, "grad_norm": 2.6458421698177403, "learning_rate": 8.493776818127275e-07, "loss": 0.9937, "step": 8705 }, { "epoch": 1.62, "grad_norm": 2.4116964460512316, "learning_rate": 8.452985770630451e-07, "loss": 0.9697, "step": 8710 }, { "epoch": 1.63, "grad_norm": 2.5314410018943403, "learning_rate": 8.412283861749149e-07, "loss": 0.9986, "step": 8715 }, { "epoch": 1.63, "grad_norm": 2.5639593529419744, "learning_rate": 8.371671178808832e-07, "loss": 0.957, "step": 8720 }, { "epoch": 1.63, "grad_norm": 2.5695870932919584, "learning_rate": 8.331147808943507e-07, "loss": 1.0161, "step": 8725 }, { "epoch": 1.63, "grad_norm": 2.603768679055716, "learning_rate": 8.290713839095605e-07, "loss": 1.0269, "step": 8730 }, { "epoch": 1.63, "grad_norm": 2.447043867628572, "learning_rate": 8.250369356015698e-07, "loss": 0.9981, "step": 8735 }, { "epoch": 1.63, "grad_norm": 2.3880441714776466, "learning_rate": 8.210114446262391e-07, "loss": 1.0256, "step": 8740 }, { "epoch": 1.63, "grad_norm": 2.7658758501345826, "learning_rate": 8.169949196202099e-07, "loss": 1.0113, "step": 8745 }, { "epoch": 1.63, "grad_norm": 3.1004638384283023, "learning_rate": 8.129873692008883e-07, "loss": 0.9592, "step": 8750 }, { "epoch": 1.63, "grad_norm": 2.5833091864252467, "learning_rate": 8.089888019664255e-07, "loss": 0.964, "step": 8755 }, { "epoch": 1.63, "grad_norm": 2.6515268577779207, "learning_rate": 8.049992264956985e-07, "loss": 0.9743, "step": 8760 }, { "epoch": 1.63, "grad_norm": 2.8611413957572727, "learning_rate": 8.010186513482931e-07, "loss": 0.9987, "step": 8765 }, { "epoch": 1.64, "grad_norm": 2.6172758310161157, "learning_rate": 7.970470850644841e-07, "loss": 0.9748, "step": 8770 }, { "epoch": 1.64, "grad_norm": 3.1282851318839215, "learning_rate": 7.930845361652212e-07, "loss": 0.9794, "step": 8775 }, { "epoch": 1.64, "grad_norm": 3.0573290970246303, "learning_rate": 7.891310131521041e-07, "loss": 0.9787, "step": 8780 }, { "epoch": 1.64, "grad_norm": 2.1506066941256243, "learning_rate": 7.851865245073675e-07, "loss": 1.0085, "step": 8785 }, { "epoch": 1.64, "grad_norm": 2.5240271641211263, "learning_rate": 7.812510786938654e-07, "loss": 1.0368, "step": 8790 }, { "epoch": 1.64, "grad_norm": 2.423560041926841, "learning_rate": 7.77324684155048e-07, "loss": 0.9355, "step": 8795 }, { "epoch": 1.64, "grad_norm": 2.96046939164647, "learning_rate": 7.73407349314948e-07, "loss": 1.0029, "step": 8800 }, { "epoch": 1.64, "grad_norm": 2.8005942518059963, "learning_rate": 7.69499082578159e-07, "loss": 0.9935, "step": 8805 }, { "epoch": 1.64, "grad_norm": 3.2244592316095817, "learning_rate": 7.655998923298197e-07, "loss": 0.9929, "step": 8810 }, { "epoch": 1.64, "grad_norm": 2.70767496399375, "learning_rate": 7.617097869355972e-07, "loss": 1.0294, "step": 8815 }, { "epoch": 1.64, "grad_norm": 2.6297837313438106, "learning_rate": 7.57828774741664e-07, "loss": 0.9631, "step": 8820 }, { "epoch": 1.65, "grad_norm": 2.4742233021313593, "learning_rate": 7.539568640746859e-07, "loss": 1.019, "step": 8825 }, { "epoch": 1.65, "grad_norm": 2.776492788189207, "learning_rate": 7.500940632418003e-07, "loss": 1.012, "step": 8830 }, { "epoch": 1.65, "grad_norm": 2.4002889591019696, "learning_rate": 7.462403805305968e-07, "loss": 1.0021, "step": 8835 }, { "epoch": 1.65, "grad_norm": 2.6289778414523037, "learning_rate": 7.423958242091078e-07, "loss": 0.998, "step": 8840 }, { "epoch": 1.65, "grad_norm": 2.7338119825458262, "learning_rate": 7.38560402525782e-07, "loss": 0.9947, "step": 8845 }, { "epoch": 1.65, "grad_norm": 2.414954611873808, "learning_rate": 7.347341237094697e-07, "loss": 1.0184, "step": 8850 }, { "epoch": 1.65, "grad_norm": 3.019980775613549, "learning_rate": 7.30916995969404e-07, "loss": 1.0255, "step": 8855 }, { "epoch": 1.65, "grad_norm": 2.6991242420719725, "learning_rate": 7.271090274951898e-07, "loss": 0.9912, "step": 8860 }, { "epoch": 1.65, "grad_norm": 2.4453775704614746, "learning_rate": 7.233102264567749e-07, "loss": 0.9877, "step": 8865 }, { "epoch": 1.65, "grad_norm": 2.3079920807165593, "learning_rate": 7.195206010044425e-07, "loss": 1.0195, "step": 8870 }, { "epoch": 1.66, "grad_norm": 2.7013123383670576, "learning_rate": 7.157401592687851e-07, "loss": 1.043, "step": 8875 }, { "epoch": 1.66, "grad_norm": 2.717114035440022, "learning_rate": 7.119689093606974e-07, "loss": 0.9513, "step": 8880 }, { "epoch": 1.66, "grad_norm": 2.4488620724748977, "learning_rate": 7.082068593713498e-07, "loss": 0.9934, "step": 8885 }, { "epoch": 1.66, "grad_norm": 2.533131093439108, "learning_rate": 7.044540173721743e-07, "loss": 0.942, "step": 8890 }, { "epoch": 1.66, "grad_norm": 2.739407015115311, "learning_rate": 7.007103914148483e-07, "loss": 0.9678, "step": 8895 }, { "epoch": 1.66, "grad_norm": 2.9594049097283412, "learning_rate": 6.969759895312749e-07, "loss": 1.0177, "step": 8900 }, { "epoch": 1.66, "grad_norm": 2.5416612175743785, "learning_rate": 6.932508197335719e-07, "loss": 0.9669, "step": 8905 }, { "epoch": 1.66, "grad_norm": 2.741704966229588, "learning_rate": 6.895348900140414e-07, "loss": 0.9719, "step": 8910 }, { "epoch": 1.66, "grad_norm": 2.650765378296894, "learning_rate": 6.858282083451673e-07, "loss": 1.0236, "step": 8915 }, { "epoch": 1.66, "grad_norm": 2.6247728708420914, "learning_rate": 6.821307826795897e-07, "loss": 0.9939, "step": 8920 }, { "epoch": 1.66, "grad_norm": 2.8682972388443977, "learning_rate": 6.784426209500916e-07, "loss": 1.0111, "step": 8925 }, { "epoch": 1.67, "grad_norm": 2.532873552583512, "learning_rate": 6.747637310695792e-07, "loss": 0.9711, "step": 8930 }, { "epoch": 1.67, "grad_norm": 2.76208256828962, "learning_rate": 6.710941209310656e-07, "loss": 0.962, "step": 8935 }, { "epoch": 1.67, "grad_norm": 2.7328359763817804, "learning_rate": 6.674337984076551e-07, "loss": 1.0309, "step": 8940 }, { "epoch": 1.67, "grad_norm": 2.6713338056299203, "learning_rate": 6.637827713525252e-07, "loss": 1.0181, "step": 8945 }, { "epoch": 1.67, "grad_norm": 2.299256870197894, "learning_rate": 6.601410475989106e-07, "loss": 0.9893, "step": 8950 }, { "epoch": 1.67, "grad_norm": 2.581073111238717, "learning_rate": 6.565086349600847e-07, "loss": 1.0204, "step": 8955 }, { "epoch": 1.67, "grad_norm": 2.6116790427110574, "learning_rate": 6.52885541229345e-07, "loss": 0.9743, "step": 8960 }, { "epoch": 1.67, "grad_norm": 2.7309921989649677, "learning_rate": 6.492717741799948e-07, "loss": 0.9694, "step": 8965 }, { "epoch": 1.67, "grad_norm": 2.586747283406679, "learning_rate": 6.456673415653292e-07, "loss": 1.0078, "step": 8970 }, { "epoch": 1.67, "grad_norm": 2.479290657154231, "learning_rate": 6.420722511186134e-07, "loss": 0.9972, "step": 8975 }, { "epoch": 1.67, "grad_norm": 2.4648199168537412, "learning_rate": 6.384865105530708e-07, "loss": 1.0074, "step": 8980 }, { "epoch": 1.68, "grad_norm": 2.2138902601386574, "learning_rate": 6.349101275618641e-07, "loss": 0.9987, "step": 8985 }, { "epoch": 1.68, "grad_norm": 2.2720662118151544, "learning_rate": 6.3134310981808e-07, "loss": 1.0317, "step": 8990 }, { "epoch": 1.68, "grad_norm": 2.3287005891438817, "learning_rate": 6.277854649747111e-07, "loss": 1.0005, "step": 8995 }, { "epoch": 1.68, "grad_norm": 2.4627011404225, "learning_rate": 6.242372006646419e-07, "loss": 1.0005, "step": 9000 }, { "epoch": 1.68, "grad_norm": 2.511510221864267, "learning_rate": 6.206983245006304e-07, "loss": 1.0113, "step": 9005 }, { "epoch": 1.68, "grad_norm": 2.5333977257602363, "learning_rate": 6.171688440752915e-07, "loss": 0.9587, "step": 9010 }, { "epoch": 1.68, "grad_norm": 2.2279459443378635, "learning_rate": 6.136487669610841e-07, "loss": 1.0043, "step": 9015 }, { "epoch": 1.68, "grad_norm": 3.028251278959122, "learning_rate": 6.101381007102914e-07, "loss": 0.9891, "step": 9020 }, { "epoch": 1.68, "grad_norm": 2.7570318740615942, "learning_rate": 6.066368528550037e-07, "loss": 1.0377, "step": 9025 }, { "epoch": 1.68, "grad_norm": 2.6810202304359287, "learning_rate": 6.031450309071068e-07, "loss": 1.0084, "step": 9030 }, { "epoch": 1.68, "grad_norm": 2.4733864750094394, "learning_rate": 5.996626423582624e-07, "loss": 1.0223, "step": 9035 }, { "epoch": 1.69, "grad_norm": 2.399472156625489, "learning_rate": 5.961896946798923e-07, "loss": 1.0306, "step": 9040 }, { "epoch": 1.69, "grad_norm": 3.010499926572972, "learning_rate": 5.927261953231639e-07, "loss": 0.9157, "step": 9045 }, { "epoch": 1.69, "grad_norm": 3.138764769840231, "learning_rate": 5.892721517189726e-07, "loss": 0.9091, "step": 9050 }, { "epoch": 1.69, "grad_norm": 3.052701911605609, "learning_rate": 5.858275712779288e-07, "loss": 0.9896, "step": 9055 }, { "epoch": 1.69, "grad_norm": 2.6461404601065834, "learning_rate": 5.823924613903365e-07, "loss": 0.9955, "step": 9060 }, { "epoch": 1.69, "grad_norm": 2.6462011414933206, "learning_rate": 5.789668294261841e-07, "loss": 0.9824, "step": 9065 }, { "epoch": 1.69, "grad_norm": 2.4145981121391333, "learning_rate": 5.755506827351214e-07, "loss": 1.024, "step": 9070 }, { "epoch": 1.69, "grad_norm": 2.726076929054993, "learning_rate": 5.721440286464491e-07, "loss": 1.0127, "step": 9075 }, { "epoch": 1.69, "grad_norm": 2.832791332944865, "learning_rate": 5.687468744691049e-07, "loss": 1.0005, "step": 9080 }, { "epoch": 1.69, "grad_norm": 2.873484163008957, "learning_rate": 5.653592274916408e-07, "loss": 1.0268, "step": 9085 }, { "epoch": 1.7, "grad_norm": 2.4948123943547125, "learning_rate": 5.619810949822124e-07, "loss": 1.017, "step": 9090 }, { "epoch": 1.7, "grad_norm": 2.500406538069392, "learning_rate": 5.586124841885604e-07, "loss": 1.0411, "step": 9095 }, { "epoch": 1.7, "grad_norm": 2.850558812351205, "learning_rate": 5.552534023380024e-07, "loss": 0.9598, "step": 9100 }, { "epoch": 1.7, "grad_norm": 2.9195318309343645, "learning_rate": 5.519038566374041e-07, "loss": 0.986, "step": 9105 }, { "epoch": 1.7, "grad_norm": 2.8803650261654665, "learning_rate": 5.485638542731764e-07, "loss": 1.0123, "step": 9110 }, { "epoch": 1.7, "grad_norm": 2.5711087382307873, "learning_rate": 5.452334024112533e-07, "loss": 0.9667, "step": 9115 }, { "epoch": 1.7, "grad_norm": 2.600124755629203, "learning_rate": 5.419125081970805e-07, "loss": 1.0223, "step": 9120 }, { "epoch": 1.7, "grad_norm": 2.692133520812537, "learning_rate": 5.386011787555951e-07, "loss": 1.0128, "step": 9125 }, { "epoch": 1.7, "grad_norm": 2.841992509801276, "learning_rate": 5.352994211912144e-07, "loss": 0.977, "step": 9130 }, { "epoch": 1.7, "grad_norm": 2.4308516211470703, "learning_rate": 5.320072425878198e-07, "loss": 0.9886, "step": 9135 }, { "epoch": 1.7, "grad_norm": 2.528221958734492, "learning_rate": 5.287246500087401e-07, "loss": 0.9715, "step": 9140 }, { "epoch": 1.71, "grad_norm": 2.6059596924040513, "learning_rate": 5.254516504967377e-07, "loss": 1.0196, "step": 9145 }, { "epoch": 1.71, "grad_norm": 3.0973405688579327, "learning_rate": 5.221882510739939e-07, "loss": 0.9772, "step": 9150 }, { "epoch": 1.71, "grad_norm": 2.334645224986343, "learning_rate": 5.189344587420924e-07, "loss": 1.0351, "step": 9155 }, { "epoch": 1.71, "grad_norm": 2.7613541361820593, "learning_rate": 5.156902804820041e-07, "loss": 0.9751, "step": 9160 }, { "epoch": 1.71, "grad_norm": 2.536194202059706, "learning_rate": 5.124557232540761e-07, "loss": 1.0051, "step": 9165 }, { "epoch": 1.71, "grad_norm": 2.583988794361422, "learning_rate": 5.092307939980112e-07, "loss": 0.9559, "step": 9170 }, { "epoch": 1.71, "grad_norm": 2.6904789600878045, "learning_rate": 5.060154996328554e-07, "loss": 0.9627, "step": 9175 }, { "epoch": 1.71, "grad_norm": 2.9369318056690186, "learning_rate": 5.028098470569843e-07, "loss": 1.0039, "step": 9180 }, { "epoch": 1.71, "grad_norm": 2.842613702289357, "learning_rate": 4.996138431480862e-07, "loss": 1.0297, "step": 9185 }, { "epoch": 1.71, "grad_norm": 2.538587614782097, "learning_rate": 4.964274947631492e-07, "loss": 1.049, "step": 9190 }, { "epoch": 1.71, "grad_norm": 2.3897911615036356, "learning_rate": 4.932508087384452e-07, "loss": 1.0171, "step": 9195 }, { "epoch": 1.72, "grad_norm": 2.608616242680837, "learning_rate": 4.900837918895152e-07, "loss": 1.0567, "step": 9200 }, { "epoch": 1.72, "grad_norm": 2.7563694990221403, "learning_rate": 4.869264510111543e-07, "loss": 1.0163, "step": 9205 }, { "epoch": 1.72, "grad_norm": 2.8931596725585047, "learning_rate": 4.837787928774013e-07, "loss": 1.0264, "step": 9210 }, { "epoch": 1.72, "grad_norm": 2.56930829905871, "learning_rate": 4.806408242415184e-07, "loss": 1.0019, "step": 9215 }, { "epoch": 1.72, "grad_norm": 2.5389017003142, "learning_rate": 4.77512551835978e-07, "loss": 0.9896, "step": 9220 }, { "epoch": 1.72, "grad_norm": 2.4572664571372367, "learning_rate": 4.743939823724508e-07, "loss": 0.9452, "step": 9225 }, { "epoch": 1.72, "grad_norm": 2.597573031717897, "learning_rate": 4.7128512254179083e-07, "loss": 0.9848, "step": 9230 }, { "epoch": 1.72, "grad_norm": 2.9459449902717694, "learning_rate": 4.6818597901401864e-07, "loss": 0.9703, "step": 9235 }, { "epoch": 1.72, "grad_norm": 2.672050813178242, "learning_rate": 4.6509655843830827e-07, "loss": 0.9637, "step": 9240 }, { "epoch": 1.72, "grad_norm": 2.5874000556032435, "learning_rate": 4.620168674429754e-07, "loss": 1.0507, "step": 9245 }, { "epoch": 1.73, "grad_norm": 2.65520217601556, "learning_rate": 4.5894691263545845e-07, "loss": 1.0023, "step": 9250 }, { "epoch": 1.73, "grad_norm": 2.969014031244479, "learning_rate": 4.5588670060230933e-07, "loss": 1.0131, "step": 9255 }, { "epoch": 1.73, "grad_norm": 2.7090713448074206, "learning_rate": 4.528362379091766e-07, "loss": 0.9715, "step": 9260 }, { "epoch": 1.73, "grad_norm": 2.6279641402257043, "learning_rate": 4.4979553110078957e-07, "loss": 0.9686, "step": 9265 }, { "epoch": 1.73, "grad_norm": 2.5834971940560294, "learning_rate": 4.467645867009468e-07, "loss": 0.9521, "step": 9270 }, { "epoch": 1.73, "grad_norm": 3.0675999534568468, "learning_rate": 4.4374341121250496e-07, "loss": 0.9839, "step": 9275 }, { "epoch": 1.73, "grad_norm": 3.2037422202426504, "learning_rate": 4.407320111173591e-07, "loss": 0.967, "step": 9280 }, { "epoch": 1.73, "grad_norm": 3.020695568265947, "learning_rate": 4.3773039287643095e-07, "loss": 0.9436, "step": 9285 }, { "epoch": 1.73, "grad_norm": 2.7269030107954655, "learning_rate": 4.347385629296552e-07, "loss": 1.0461, "step": 9290 }, { "epoch": 1.73, "grad_norm": 2.5327864316035207, "learning_rate": 4.3175652769597007e-07, "loss": 0.9584, "step": 9295 }, { "epoch": 1.73, "grad_norm": 2.519235011867918, "learning_rate": 4.2878429357329277e-07, "loss": 0.9824, "step": 9300 }, { "epoch": 1.74, "grad_norm": 2.310799267177094, "learning_rate": 4.2582186693851666e-07, "loss": 1.0077, "step": 9305 }, { "epoch": 1.74, "grad_norm": 2.4863765476136823, "learning_rate": 4.228692541474921e-07, "loss": 1.0093, "step": 9310 }, { "epoch": 1.74, "grad_norm": 2.6063805598016225, "learning_rate": 4.199264615350135e-07, "loss": 1.0481, "step": 9315 }, { "epoch": 1.74, "grad_norm": 2.985739971909663, "learning_rate": 4.16993495414808e-07, "loss": 0.9816, "step": 9320 }, { "epoch": 1.74, "grad_norm": 2.466317302363233, "learning_rate": 4.1407036207951833e-07, "loss": 0.9676, "step": 9325 }, { "epoch": 1.74, "grad_norm": 2.7455631489194876, "learning_rate": 4.1115706780069155e-07, "loss": 0.9819, "step": 9330 }, { "epoch": 1.74, "grad_norm": 2.908316238615218, "learning_rate": 4.08253618828765e-07, "loss": 1.0758, "step": 9335 }, { "epoch": 1.74, "grad_norm": 2.8169316671572573, "learning_rate": 4.053600213930536e-07, "loss": 0.9638, "step": 9340 }, { "epoch": 1.74, "grad_norm": 2.381506348033514, "learning_rate": 4.0247628170173524e-07, "loss": 1.0118, "step": 9345 }, { "epoch": 1.74, "grad_norm": 2.755468426326752, "learning_rate": 3.9960240594183906e-07, "loss": 0.9953, "step": 9350 }, { "epoch": 1.74, "grad_norm": 2.605028560077565, "learning_rate": 3.9673840027923014e-07, "loss": 1.0409, "step": 9355 }, { "epoch": 1.75, "grad_norm": 3.3235218437167777, "learning_rate": 3.938842708585977e-07, "loss": 0.969, "step": 9360 }, { "epoch": 1.75, "grad_norm": 2.5416660450574007, "learning_rate": 3.910400238034423e-07, "loss": 1.0235, "step": 9365 }, { "epoch": 1.75, "grad_norm": 2.990917512675436, "learning_rate": 3.8820566521606207e-07, "loss": 0.9659, "step": 9370 }, { "epoch": 1.75, "grad_norm": 2.605011528586985, "learning_rate": 3.853812011775382e-07, "loss": 1.0126, "step": 9375 }, { "epoch": 1.75, "grad_norm": 2.7008321338964403, "learning_rate": 3.8256663774772383e-07, "loss": 0.9629, "step": 9380 }, { "epoch": 1.75, "grad_norm": 2.528423641712258, "learning_rate": 3.797619809652314e-07, "loss": 0.9808, "step": 9385 }, { "epoch": 1.75, "grad_norm": 2.31165739813913, "learning_rate": 3.769672368474181e-07, "loss": 0.9899, "step": 9390 }, { "epoch": 1.75, "grad_norm": 2.7287545120118013, "learning_rate": 3.741824113903725e-07, "loss": 0.9959, "step": 9395 }, { "epoch": 1.75, "grad_norm": 2.5020104494413253, "learning_rate": 3.714075105689041e-07, "loss": 1.0553, "step": 9400 }, { "epoch": 1.75, "grad_norm": 2.604353933763939, "learning_rate": 3.686425403365307e-07, "loss": 0.9977, "step": 9405 }, { "epoch": 1.75, "grad_norm": 3.1903595327659082, "learning_rate": 3.658875066254608e-07, "loss": 1.0267, "step": 9410 }, { "epoch": 1.76, "grad_norm": 2.5140954157235638, "learning_rate": 3.631424153465862e-07, "loss": 1.0241, "step": 9415 }, { "epoch": 1.76, "grad_norm": 3.1645831576006813, "learning_rate": 3.6040727238946807e-07, "loss": 1.0294, "step": 9420 }, { "epoch": 1.76, "grad_norm": 2.1964802552754197, "learning_rate": 3.576820836223194e-07, "loss": 0.9772, "step": 9425 }, { "epoch": 1.76, "grad_norm": 2.668666804034786, "learning_rate": 3.5496685489200265e-07, "loss": 1.0015, "step": 9430 }, { "epoch": 1.76, "grad_norm": 2.733206900757186, "learning_rate": 3.5226159202400747e-07, "loss": 0.9567, "step": 9435 }, { "epoch": 1.76, "grad_norm": 3.2939125262903644, "learning_rate": 3.4956630082244204e-07, "loss": 0.9883, "step": 9440 }, { "epoch": 1.76, "grad_norm": 2.8479567257890976, "learning_rate": 3.468809870700207e-07, "loss": 0.9406, "step": 9445 }, { "epoch": 1.76, "grad_norm": 2.731564303566274, "learning_rate": 3.4420565652805215e-07, "loss": 0.986, "step": 9450 }, { "epoch": 1.76, "grad_norm": 2.8052153951482466, "learning_rate": 3.4154031493642716e-07, "loss": 0.9648, "step": 9455 }, { "epoch": 1.76, "grad_norm": 2.3411545722993226, "learning_rate": 3.388849680136008e-07, "loss": 1.0088, "step": 9460 }, { "epoch": 1.77, "grad_norm": 2.5499289340441837, "learning_rate": 3.362396214565888e-07, "loss": 1.0102, "step": 9465 }, { "epoch": 1.77, "grad_norm": 2.30165208282686, "learning_rate": 3.336042809409512e-07, "loss": 0.9971, "step": 9470 }, { "epoch": 1.77, "grad_norm": 2.6602655807503988, "learning_rate": 3.3097895212077845e-07, "loss": 0.996, "step": 9475 }, { "epoch": 1.77, "grad_norm": 2.578917739046354, "learning_rate": 3.2836364062868165e-07, "loss": 0.9575, "step": 9480 }, { "epoch": 1.77, "grad_norm": 2.7011601669086116, "learning_rate": 3.2575835207577957e-07, "loss": 1.0345, "step": 9485 }, { "epoch": 1.77, "grad_norm": 3.187787275788192, "learning_rate": 3.2316309205168705e-07, "loss": 0.9859, "step": 9490 }, { "epoch": 1.77, "grad_norm": 2.6122898621423216, "learning_rate": 3.205778661245051e-07, "loss": 1.0141, "step": 9495 }, { "epoch": 1.77, "grad_norm": 2.443420355234737, "learning_rate": 3.180026798408026e-07, "loss": 1.0211, "step": 9500 }, { "epoch": 1.77, "grad_norm": 2.540209666631077, "learning_rate": 3.1543753872561087e-07, "loss": 1.0208, "step": 9505 }, { "epoch": 1.77, "grad_norm": 2.6964911580456654, "learning_rate": 3.128824482824083e-07, "loss": 0.9891, "step": 9510 }, { "epoch": 1.77, "grad_norm": 2.4469653887447746, "learning_rate": 3.1033741399311204e-07, "loss": 1.0305, "step": 9515 }, { "epoch": 1.78, "grad_norm": 2.291299962290693, "learning_rate": 3.0780244131806193e-07, "loss": 1.0168, "step": 9520 }, { "epoch": 1.78, "grad_norm": 2.522210460018589, "learning_rate": 3.0527753569601083e-07, "loss": 0.961, "step": 9525 }, { "epoch": 1.78, "grad_norm": 2.5659714010554526, "learning_rate": 3.027627025441132e-07, "loss": 0.9547, "step": 9530 }, { "epoch": 1.78, "grad_norm": 2.7052846580426855, "learning_rate": 3.0025794725791315e-07, "loss": 0.9943, "step": 9535 }, { "epoch": 1.78, "grad_norm": 2.484865366051049, "learning_rate": 2.977632752113324e-07, "loss": 0.9765, "step": 9540 }, { "epoch": 1.78, "grad_norm": 2.352793754428888, "learning_rate": 2.9527869175666025e-07, "loss": 0.9883, "step": 9545 }, { "epoch": 1.78, "grad_norm": 2.830152940630546, "learning_rate": 2.928042022245398e-07, "loss": 0.9651, "step": 9550 }, { "epoch": 1.78, "grad_norm": 2.5427539631035554, "learning_rate": 2.903398119239581e-07, "loss": 0.9826, "step": 9555 }, { "epoch": 1.78, "grad_norm": 2.9400021212833787, "learning_rate": 2.878855261422359e-07, "loss": 1.0063, "step": 9560 }, { "epoch": 1.78, "grad_norm": 2.3427477540958725, "learning_rate": 2.8544135014501264e-07, "loss": 1.0179, "step": 9565 }, { "epoch": 1.78, "grad_norm": 2.7757453717128824, "learning_rate": 2.8300728917623866e-07, "loss": 0.9788, "step": 9570 }, { "epoch": 1.79, "grad_norm": 2.4936847575193193, "learning_rate": 2.8058334845816214e-07, "loss": 1.0008, "step": 9575 }, { "epoch": 1.79, "grad_norm": 2.579956273305064, "learning_rate": 2.7816953319131865e-07, "loss": 1.0594, "step": 9580 }, { "epoch": 1.79, "grad_norm": 2.6638295637588993, "learning_rate": 2.757658485545195e-07, "loss": 0.9649, "step": 9585 }, { "epoch": 1.79, "grad_norm": 2.9718905778082707, "learning_rate": 2.7337229970484104e-07, "loss": 1.0001, "step": 9590 }, { "epoch": 1.79, "grad_norm": 2.470369084422218, "learning_rate": 2.709888917776132e-07, "loss": 1.0225, "step": 9595 }, { "epoch": 1.79, "grad_norm": 2.9125743810651463, "learning_rate": 2.6861562988640786e-07, "loss": 1.0356, "step": 9600 }, { "epoch": 1.79, "grad_norm": 2.282995672450367, "learning_rate": 2.662525191230314e-07, "loss": 0.9894, "step": 9605 }, { "epoch": 1.79, "grad_norm": 2.45891203140545, "learning_rate": 2.638995645575082e-07, "loss": 1.0093, "step": 9610 }, { "epoch": 1.79, "grad_norm": 3.1540839609729803, "learning_rate": 2.615567712380751e-07, "loss": 1.0343, "step": 9615 }, { "epoch": 1.79, "grad_norm": 2.4897966159719283, "learning_rate": 2.592241441911641e-07, "loss": 1.0053, "step": 9620 }, { "epoch": 1.79, "grad_norm": 2.6750189260740074, "learning_rate": 2.5690168842140097e-07, "loss": 0.9827, "step": 9625 }, { "epoch": 1.8, "grad_norm": 2.4749333331718457, "learning_rate": 2.5458940891158525e-07, "loss": 1.028, "step": 9630 }, { "epoch": 1.8, "grad_norm": 2.690092812167263, "learning_rate": 2.5228731062268476e-07, "loss": 0.9687, "step": 9635 }, { "epoch": 1.8, "grad_norm": 2.521697310830935, "learning_rate": 2.4999539849382304e-07, "loss": 0.9968, "step": 9640 }, { "epoch": 1.8, "grad_norm": 2.598292223752983, "learning_rate": 2.477136774422706e-07, "loss": 1.0522, "step": 9645 }, { "epoch": 1.8, "grad_norm": 2.449309049508482, "learning_rate": 2.4544215236343206e-07, "loss": 1.0047, "step": 9650 }, { "epoch": 1.8, "grad_norm": 2.6542058491455385, "learning_rate": 2.4318082813083734e-07, "loss": 1.006, "step": 9655 }, { "epoch": 1.8, "grad_norm": 2.5090872392658103, "learning_rate": 2.4092970959612885e-07, "loss": 0.9827, "step": 9660 }, { "epoch": 1.8, "grad_norm": 2.5844303394951806, "learning_rate": 2.3868880158905426e-07, "loss": 1.0087, "step": 9665 }, { "epoch": 1.8, "grad_norm": 2.3442031349160604, "learning_rate": 2.3645810891745556e-07, "loss": 0.997, "step": 9670 }, { "epoch": 1.8, "grad_norm": 2.654874365932498, "learning_rate": 2.3423763636725717e-07, "loss": 0.9667, "step": 9675 }, { "epoch": 1.81, "grad_norm": 2.5095988006665046, "learning_rate": 2.320273887024549e-07, "loss": 0.9777, "step": 9680 }, { "epoch": 1.81, "grad_norm": 2.715845469741196, "learning_rate": 2.298273706651083e-07, "loss": 0.9743, "step": 9685 }, { "epoch": 1.81, "grad_norm": 2.675360652874929, "learning_rate": 2.2763758697533112e-07, "loss": 0.9735, "step": 9690 }, { "epoch": 1.81, "grad_norm": 2.9739364378003312, "learning_rate": 2.2545804233127578e-07, "loss": 1.0247, "step": 9695 }, { "epoch": 1.81, "grad_norm": 3.1442927146127273, "learning_rate": 2.2328874140912948e-07, "loss": 1.0427, "step": 9700 }, { "epoch": 1.81, "grad_norm": 2.586761975300717, "learning_rate": 2.2112968886310037e-07, "loss": 1.0241, "step": 9705 }, { "epoch": 1.81, "grad_norm": 2.521670484167634, "learning_rate": 2.1898088932541139e-07, "loss": 1.0188, "step": 9710 }, { "epoch": 1.81, "grad_norm": 2.803812342944837, "learning_rate": 2.1684234740628474e-07, "loss": 1.03, "step": 9715 }, { "epoch": 1.81, "grad_norm": 2.748460050126293, "learning_rate": 2.1471406769393633e-07, "loss": 0.9911, "step": 9720 }, { "epoch": 1.81, "grad_norm": 3.1096781734009644, "learning_rate": 2.1259605475456468e-07, "loss": 1.0387, "step": 9725 }, { "epoch": 1.81, "grad_norm": 2.730958677130239, "learning_rate": 2.1048831313234042e-07, "loss": 0.9288, "step": 9730 }, { "epoch": 1.82, "grad_norm": 2.5620713671460504, "learning_rate": 2.0839084734939784e-07, "loss": 0.9943, "step": 9735 }, { "epoch": 1.82, "grad_norm": 3.296656696817953, "learning_rate": 2.063036619058245e-07, "loss": 1.0116, "step": 9740 }, { "epoch": 1.82, "grad_norm": 2.460667225354087, "learning_rate": 2.0422676127965169e-07, "loss": 1.0158, "step": 9745 }, { "epoch": 1.82, "grad_norm": 2.4612694765541314, "learning_rate": 2.021601499268433e-07, "loss": 1.0556, "step": 9750 }, { "epoch": 1.82, "grad_norm": 2.3591424193825685, "learning_rate": 2.0010383228129038e-07, "loss": 1.0146, "step": 9755 }, { "epoch": 1.82, "grad_norm": 2.8113416401044202, "learning_rate": 1.980578127547972e-07, "loss": 0.9576, "step": 9760 }, { "epoch": 1.82, "grad_norm": 2.706080390163917, "learning_rate": 1.9602209573707397e-07, "loss": 0.9713, "step": 9765 }, { "epoch": 1.82, "grad_norm": 2.845030926145939, "learning_rate": 1.9399668559572593e-07, "loss": 1.0307, "step": 9770 }, { "epoch": 1.82, "grad_norm": 2.674120579688675, "learning_rate": 1.9198158667624756e-07, "loss": 1.022, "step": 9775 }, { "epoch": 1.82, "grad_norm": 2.1385275862773003, "learning_rate": 1.8997680330200775e-07, "loss": 0.978, "step": 9780 }, { "epoch": 1.82, "grad_norm": 2.843210914091474, "learning_rate": 1.879823397742453e-07, "loss": 1.0131, "step": 9785 }, { "epoch": 1.83, "grad_norm": 3.0167556280462704, "learning_rate": 1.8599820037205785e-07, "loss": 0.9705, "step": 9790 }, { "epoch": 1.83, "grad_norm": 2.7679496204235217, "learning_rate": 1.8402438935239187e-07, "loss": 0.9938, "step": 9795 }, { "epoch": 1.83, "grad_norm": 2.8044277031578955, "learning_rate": 1.8206091095003543e-07, "loss": 1.034, "step": 9800 }, { "epoch": 1.83, "grad_norm": 2.6766193581300706, "learning_rate": 1.801077693776082e-07, "loss": 0.9989, "step": 9805 }, { "epoch": 1.83, "grad_norm": 2.4536021788135027, "learning_rate": 1.78164968825551e-07, "loss": 0.9419, "step": 9810 }, { "epoch": 1.83, "grad_norm": 2.7894618880044963, "learning_rate": 1.7623251346211902e-07, "loss": 0.9509, "step": 9815 }, { "epoch": 1.83, "grad_norm": 2.682428541717693, "learning_rate": 1.743104074333718e-07, "loss": 1.0204, "step": 9820 }, { "epoch": 1.83, "grad_norm": 2.8164877468179195, "learning_rate": 1.7239865486316508e-07, "loss": 0.9888, "step": 9825 }, { "epoch": 1.83, "grad_norm": 2.7847626207224816, "learning_rate": 1.7049725985313959e-07, "loss": 0.9771, "step": 9830 }, { "epoch": 1.83, "grad_norm": 3.3563234753041744, "learning_rate": 1.6860622648271653e-07, "loss": 1.0058, "step": 9835 }, { "epoch": 1.84, "grad_norm": 3.3102502358004706, "learning_rate": 1.667255588090838e-07, "loss": 1.0054, "step": 9840 }, { "epoch": 1.84, "grad_norm": 2.7169721452197635, "learning_rate": 1.6485526086719217e-07, "loss": 0.9396, "step": 9845 }, { "epoch": 1.84, "grad_norm": 2.6443166952288104, "learning_rate": 1.62995336669744e-07, "loss": 1.0504, "step": 9850 }, { "epoch": 1.84, "grad_norm": 2.6784711600031637, "learning_rate": 1.6114579020718223e-07, "loss": 0.9967, "step": 9855 }, { "epoch": 1.84, "grad_norm": 2.7021265214230525, "learning_rate": 1.593066254476866e-07, "loss": 1.0112, "step": 9860 }, { "epoch": 1.84, "grad_norm": 2.4415845222324783, "learning_rate": 1.57477846337164e-07, "loss": 0.9537, "step": 9865 }, { "epoch": 1.84, "grad_norm": 2.71926218122397, "learning_rate": 1.5565945679923699e-07, "loss": 1.0178, "step": 9870 }, { "epoch": 1.84, "grad_norm": 2.704907322234658, "learning_rate": 1.5385146073523926e-07, "loss": 1.0128, "step": 9875 }, { "epoch": 1.84, "grad_norm": 2.4769145265711363, "learning_rate": 1.5205386202420235e-07, "loss": 1.0083, "step": 9880 }, { "epoch": 1.84, "grad_norm": 2.6895279389899067, "learning_rate": 1.5026666452285565e-07, "loss": 1.0007, "step": 9885 }, { "epoch": 1.84, "grad_norm": 2.627092854331598, "learning_rate": 1.4848987206560695e-07, "loss": 1.0009, "step": 9890 }, { "epoch": 1.85, "grad_norm": 2.60277939316083, "learning_rate": 1.4672348846454408e-07, "loss": 1.0241, "step": 9895 }, { "epoch": 1.85, "grad_norm": 2.7442165962309515, "learning_rate": 1.4496751750942173e-07, "loss": 1.0583, "step": 9900 }, { "epoch": 1.85, "grad_norm": 2.7088131815525895, "learning_rate": 1.4322196296765346e-07, "loss": 1.0226, "step": 9905 }, { "epoch": 1.85, "grad_norm": 2.4976027705254693, "learning_rate": 1.414868285843063e-07, "loss": 1.0096, "step": 9910 }, { "epoch": 1.85, "grad_norm": 3.6650855699844134, "learning_rate": 1.3976211808209018e-07, "loss": 0.9702, "step": 9915 }, { "epoch": 1.85, "grad_norm": 2.8355048109869494, "learning_rate": 1.3804783516135012e-07, "loss": 1.0196, "step": 9920 }, { "epoch": 1.85, "grad_norm": 2.805735805208835, "learning_rate": 1.3634398350005907e-07, "loss": 1.0183, "step": 9925 }, { "epoch": 1.85, "grad_norm": 2.2774993379472583, "learning_rate": 1.3465056675381117e-07, "loss": 0.9935, "step": 9930 }, { "epoch": 1.85, "grad_norm": 2.3630526568932333, "learning_rate": 1.3296758855581015e-07, "loss": 1.0068, "step": 9935 }, { "epoch": 1.85, "grad_norm": 2.8721281507955574, "learning_rate": 1.3129505251686603e-07, "loss": 1.0101, "step": 9940 }, { "epoch": 1.85, "grad_norm": 2.6647193040662813, "learning_rate": 1.2963296222538446e-07, "loss": 1.0117, "step": 9945 }, { "epoch": 1.86, "grad_norm": 2.3667246578642827, "learning_rate": 1.2798132124736074e-07, "loss": 0.991, "step": 9950 }, { "epoch": 1.86, "grad_norm": 2.8919200983344076, "learning_rate": 1.2634013312636973e-07, "loss": 0.9733, "step": 9955 }, { "epoch": 1.86, "grad_norm": 2.36008020599268, "learning_rate": 1.247094013835609e-07, "loss": 0.994, "step": 9960 }, { "epoch": 1.86, "grad_norm": 2.467275486733601, "learning_rate": 1.230891295176495e-07, "loss": 0.9721, "step": 9965 }, { "epoch": 1.86, "grad_norm": 2.684510172486263, "learning_rate": 1.2147932100490866e-07, "loss": 1.0237, "step": 9970 }, { "epoch": 1.86, "grad_norm": 2.970515110780605, "learning_rate": 1.1987997929916395e-07, "loss": 0.9639, "step": 9975 }, { "epoch": 1.86, "grad_norm": 2.517212726647846, "learning_rate": 1.1829110783178332e-07, "loss": 0.9784, "step": 9980 }, { "epoch": 1.86, "grad_norm": 2.6269061771187623, "learning_rate": 1.1671271001167106e-07, "loss": 0.993, "step": 9985 }, { "epoch": 1.86, "grad_norm": 2.4276654463627274, "learning_rate": 1.1514478922525996e-07, "loss": 1.0237, "step": 9990 }, { "epoch": 1.86, "grad_norm": 2.628761439871769, "learning_rate": 1.1358734883650635e-07, "loss": 0.9542, "step": 9995 }, { "epoch": 1.86, "grad_norm": 2.8787281265293165, "learning_rate": 1.1204039218687901e-07, "loss": 1.0165, "step": 10000 }, { "epoch": 1.87, "grad_norm": 2.9024745064799125, "learning_rate": 1.105039225953547e-07, "loss": 0.9447, "step": 10005 }, { "epoch": 1.87, "grad_norm": 2.781988132946109, "learning_rate": 1.0897794335841094e-07, "loss": 0.9774, "step": 10010 }, { "epoch": 1.87, "grad_norm": 2.732110290110213, "learning_rate": 1.0746245775001607e-07, "loss": 0.9785, "step": 10015 }, { "epoch": 1.87, "grad_norm": 2.475660794796942, "learning_rate": 1.0595746902162751e-07, "loss": 1.0015, "step": 10020 }, { "epoch": 1.87, "grad_norm": 2.549244214480255, "learning_rate": 1.0446298040218017e-07, "loss": 0.9978, "step": 10025 }, { "epoch": 1.87, "grad_norm": 2.7669451232914866, "learning_rate": 1.0297899509808085e-07, "loss": 0.9641, "step": 10030 }, { "epoch": 1.87, "grad_norm": 2.7884785856697345, "learning_rate": 1.0150551629320215e-07, "loss": 1.0045, "step": 10035 }, { "epoch": 1.87, "grad_norm": 2.822600979742313, "learning_rate": 1.000425471488753e-07, "loss": 0.9792, "step": 10040 }, { "epoch": 1.87, "grad_norm": 2.34346287354799, "learning_rate": 9.859009080388338e-08, "loss": 1.0256, "step": 10045 }, { "epoch": 1.87, "grad_norm": 3.0091617199806744, "learning_rate": 9.714815037445202e-08, "loss": 0.9503, "step": 10050 }, { "epoch": 1.88, "grad_norm": 2.454168024542919, "learning_rate": 9.571672895424822e-08, "loss": 0.9699, "step": 10055 }, { "epoch": 1.88, "grad_norm": 2.577866899775397, "learning_rate": 9.429582961436978e-08, "loss": 1.0006, "step": 10060 }, { "epoch": 1.88, "grad_norm": 2.4283231702938752, "learning_rate": 9.28854554033376e-08, "loss": 1.0203, "step": 10065 }, { "epoch": 1.88, "grad_norm": 2.4012436890293922, "learning_rate": 9.148560934709395e-08, "loss": 0.9573, "step": 10070 }, { "epoch": 1.88, "grad_norm": 2.6430208059626805, "learning_rate": 9.009629444899081e-08, "loss": 1.0115, "step": 10075 }, { "epoch": 1.88, "grad_norm": 2.3785722525752853, "learning_rate": 8.871751368978554e-08, "loss": 1.0351, "step": 10080 }, { "epoch": 1.88, "grad_norm": 2.5618218348538813, "learning_rate": 8.734927002763793e-08, "loss": 1.0202, "step": 10085 }, { "epoch": 1.88, "grad_norm": 2.3559694341229704, "learning_rate": 8.599156639809591e-08, "loss": 0.9665, "step": 10090 }, { "epoch": 1.88, "grad_norm": 2.498377208318561, "learning_rate": 8.46444057140977e-08, "loss": 1.0156, "step": 10095 }, { "epoch": 1.88, "grad_norm": 2.8502573752110387, "learning_rate": 8.330779086595852e-08, "loss": 0.9877, "step": 10100 }, { "epoch": 1.88, "grad_norm": 2.730502176083712, "learning_rate": 8.198172472137111e-08, "loss": 0.9944, "step": 10105 }, { "epoch": 1.89, "grad_norm": 2.5665957223980884, "learning_rate": 8.0666210125393e-08, "loss": 0.9516, "step": 10110 }, { "epoch": 1.89, "grad_norm": 2.599168705119983, "learning_rate": 7.936124990044591e-08, "loss": 1.0021, "step": 10115 }, { "epoch": 1.89, "grad_norm": 2.509541356997699, "learning_rate": 7.806684684630528e-08, "loss": 0.9792, "step": 10120 }, { "epoch": 1.89, "grad_norm": 2.5079499486537, "learning_rate": 7.678300374009795e-08, "loss": 1.0283, "step": 10125 }, { "epoch": 1.89, "grad_norm": 2.7116455936430652, "learning_rate": 7.550972333629336e-08, "loss": 1.0421, "step": 10130 }, { "epoch": 1.89, "grad_norm": 2.4374078572896356, "learning_rate": 7.424700836669851e-08, "loss": 1.0299, "step": 10135 }, { "epoch": 1.89, "grad_norm": 2.6644682159026503, "learning_rate": 7.299486154045465e-08, "loss": 0.9972, "step": 10140 }, { "epoch": 1.89, "grad_norm": 2.533516269766712, "learning_rate": 7.175328554402561e-08, "loss": 0.9605, "step": 10145 }, { "epoch": 1.89, "grad_norm": 2.422491510120683, "learning_rate": 7.052228304119945e-08, "loss": 1.0066, "step": 10150 }, { "epoch": 1.89, "grad_norm": 2.701319935117471, "learning_rate": 6.930185667307632e-08, "loss": 0.9592, "step": 10155 }, { "epoch": 1.89, "grad_norm": 2.4325140710805977, "learning_rate": 6.809200905806612e-08, "loss": 0.9774, "step": 10160 }, { "epoch": 1.9, "grad_norm": 3.3040592558385087, "learning_rate": 6.689274279188252e-08, "loss": 1.0472, "step": 10165 }, { "epoch": 1.9, "grad_norm": 2.9349662595476644, "learning_rate": 6.570406044753675e-08, "loss": 0.9517, "step": 10170 }, { "epoch": 1.9, "grad_norm": 2.9432283064145315, "learning_rate": 6.452596457533211e-08, "loss": 0.9954, "step": 10175 }, { "epoch": 1.9, "grad_norm": 2.446315385822753, "learning_rate": 6.335845770285897e-08, "loss": 0.9763, "step": 10180 }, { "epoch": 1.9, "grad_norm": 2.4541488493898207, "learning_rate": 6.220154233498976e-08, "loss": 0.9846, "step": 10185 }, { "epoch": 1.9, "grad_norm": 2.740747449768843, "learning_rate": 6.105522095387173e-08, "loss": 1.0473, "step": 10190 }, { "epoch": 1.9, "grad_norm": 2.412788490791504, "learning_rate": 5.991949601892366e-08, "loss": 1.0206, "step": 10195 }, { "epoch": 1.9, "grad_norm": 2.2280455313309404, "learning_rate": 5.87943699668303e-08, "loss": 1.0077, "step": 10200 }, { "epoch": 1.9, "grad_norm": 2.5793911483638077, "learning_rate": 5.767984521153569e-08, "loss": 0.995, "step": 10205 }, { "epoch": 1.9, "grad_norm": 2.729204350644033, "learning_rate": 5.657592414423818e-08, "loss": 0.9881, "step": 10210 }, { "epoch": 1.9, "grad_norm": 2.6928681239961385, "learning_rate": 5.548260913338766e-08, "loss": 0.9753, "step": 10215 }, { "epoch": 1.91, "grad_norm": 2.7934332324069606, "learning_rate": 5.439990252467886e-08, "loss": 1.0208, "step": 10220 }, { "epoch": 1.91, "grad_norm": 2.4601360704368305, "learning_rate": 5.332780664104531e-08, "loss": 1.0288, "step": 10225 }, { "epoch": 1.91, "grad_norm": 2.6515972842534627, "learning_rate": 5.226632378265484e-08, "loss": 1.0382, "step": 10230 }, { "epoch": 1.91, "grad_norm": 2.6197630708573176, "learning_rate": 5.121545622690738e-08, "loss": 1.0731, "step": 10235 }, { "epoch": 1.91, "grad_norm": 2.844455313015032, "learning_rate": 5.017520622842609e-08, "loss": 0.9925, "step": 10240 }, { "epoch": 1.91, "grad_norm": 2.3475689287172785, "learning_rate": 4.914557601905456e-08, "loss": 0.9502, "step": 10245 }, { "epoch": 1.91, "grad_norm": 2.429152996391864, "learning_rate": 4.812656780785185e-08, "loss": 0.961, "step": 10250 }, { "epoch": 1.91, "grad_norm": 2.530617111574067, "learning_rate": 4.711818378108801e-08, "loss": 0.9803, "step": 10255 }, { "epoch": 1.91, "grad_norm": 2.3730172494335977, "learning_rate": 4.6120426102238547e-08, "loss": 0.9925, "step": 10260 }, { "epoch": 1.91, "grad_norm": 3.0046001348839546, "learning_rate": 4.513329691198054e-08, "loss": 1.0298, "step": 10265 }, { "epoch": 1.92, "grad_norm": 2.5230634308534228, "learning_rate": 4.4156798328187645e-08, "loss": 0.9695, "step": 10270 }, { "epoch": 1.92, "grad_norm": 2.6072373901168846, "learning_rate": 4.319093244592565e-08, "loss": 0.9855, "step": 10275 }, { "epoch": 1.92, "grad_norm": 2.7290151915323144, "learning_rate": 4.2235701337448585e-08, "loss": 1.0341, "step": 10280 }, { "epoch": 1.92, "grad_norm": 2.5839402508390577, "learning_rate": 4.129110705219319e-08, "loss": 1.0088, "step": 10285 }, { "epoch": 1.92, "grad_norm": 2.734863021668995, "learning_rate": 4.035715161677389e-08, "loss": 1.0405, "step": 10290 }, { "epoch": 1.92, "grad_norm": 2.537803008246443, "learning_rate": 3.943383703498171e-08, "loss": 0.9874, "step": 10295 }, { "epoch": 1.92, "grad_norm": 2.604401291284094, "learning_rate": 3.8521165287777605e-08, "loss": 0.9927, "step": 10300 }, { "epoch": 1.92, "grad_norm": 2.5104863117844776, "learning_rate": 3.761913833328634e-08, "loss": 0.9748, "step": 10305 }, { "epoch": 1.92, "grad_norm": 2.9857002776001, "learning_rate": 3.672775810679652e-08, "loss": 0.9856, "step": 10310 }, { "epoch": 1.92, "grad_norm": 3.487877028129344, "learning_rate": 3.584702652075333e-08, "loss": 0.9325, "step": 10315 }, { "epoch": 1.92, "grad_norm": 2.462719813368756, "learning_rate": 3.4976945464756385e-08, "loss": 0.9738, "step": 10320 }, { "epoch": 1.93, "grad_norm": 2.6647821276664305, "learning_rate": 3.4117516805552976e-08, "loss": 0.9519, "step": 10325 }, { "epoch": 1.93, "grad_norm": 2.7581010174320104, "learning_rate": 3.326874238703814e-08, "loss": 0.9946, "step": 10330 }, { "epoch": 1.93, "grad_norm": 2.8054155893587867, "learning_rate": 3.2430624030245773e-08, "loss": 1.0044, "step": 10335 }, { "epoch": 1.93, "grad_norm": 2.887227101606384, "learning_rate": 3.1603163533349136e-08, "loss": 1.0235, "step": 10340 }, { "epoch": 1.93, "grad_norm": 2.557612920941727, "learning_rate": 3.0786362671655356e-08, "loss": 1.0399, "step": 10345 }, { "epoch": 1.93, "grad_norm": 2.317041647221803, "learning_rate": 2.998022319760041e-08, "loss": 1.0367, "step": 10350 }, { "epoch": 1.93, "grad_norm": 3.3628041239154904, "learning_rate": 2.9184746840746346e-08, "loss": 1.0432, "step": 10355 }, { "epoch": 1.93, "grad_norm": 2.45529949397882, "learning_rate": 2.8399935307778516e-08, "loss": 0.9676, "step": 10360 }, { "epoch": 1.93, "grad_norm": 2.601408268617963, "learning_rate": 2.7625790282500564e-08, "loss": 1.0007, "step": 10365 }, { "epoch": 1.93, "grad_norm": 2.6735204157162165, "learning_rate": 2.6862313425830567e-08, "loss": 1.0625, "step": 10370 }, { "epoch": 1.93, "grad_norm": 2.359922200975906, "learning_rate": 2.6109506375799896e-08, "loss": 1.0015, "step": 10375 }, { "epoch": 1.94, "grad_norm": 2.3056241737305734, "learning_rate": 2.5367370747546026e-08, "loss": 0.9744, "step": 10380 }, { "epoch": 1.94, "grad_norm": 2.637133489900889, "learning_rate": 2.4635908133312514e-08, "loss": 0.9974, "step": 10385 }, { "epoch": 1.94, "grad_norm": 2.4683171286123557, "learning_rate": 2.3915120102443458e-08, "loss": 0.9822, "step": 10390 }, { "epoch": 1.94, "grad_norm": 2.503440993401632, "learning_rate": 2.3205008201380718e-08, "loss": 1.0011, "step": 10395 }, { "epoch": 1.94, "grad_norm": 2.8108246729696376, "learning_rate": 2.2505573953661153e-08, "loss": 1.0272, "step": 10400 }, { "epoch": 1.94, "grad_norm": 2.664659762392884, "learning_rate": 2.181681885991216e-08, "loss": 0.9898, "step": 10405 }, { "epoch": 1.94, "grad_norm": 2.585990693063831, "learning_rate": 2.113874439784891e-08, "loss": 1.0153, "step": 10410 }, { "epoch": 1.94, "grad_norm": 2.8947291462365046, "learning_rate": 2.0471352022272683e-08, "loss": 0.9818, "step": 10415 }, { "epoch": 1.94, "grad_norm": 2.3630315438689227, "learning_rate": 1.981464316506587e-08, "loss": 0.9976, "step": 10420 }, { "epoch": 1.94, "grad_norm": 2.9333155999303084, "learning_rate": 1.9168619235188646e-08, "loss": 0.9797, "step": 10425 }, { "epoch": 1.95, "grad_norm": 2.7761353274100236, "learning_rate": 1.8533281618677846e-08, "loss": 0.9809, "step": 10430 }, { "epoch": 1.95, "grad_norm": 2.926917528544316, "learning_rate": 1.790863167864254e-08, "loss": 0.9823, "step": 10435 }, { "epoch": 1.95, "grad_norm": 2.4755499922436677, "learning_rate": 1.7294670755262365e-08, "loss": 0.9795, "step": 10440 }, { "epoch": 1.95, "grad_norm": 2.9766426008228724, "learning_rate": 1.669140016578197e-08, "loss": 0.9984, "step": 10445 }, { "epoch": 1.95, "grad_norm": 2.7200469233636877, "learning_rate": 1.6098821204511562e-08, "loss": 0.9475, "step": 10450 }, { "epoch": 1.95, "grad_norm": 2.762204423209276, "learning_rate": 1.5516935142821933e-08, "loss": 0.9846, "step": 10455 }, { "epoch": 1.95, "grad_norm": 2.5807977939478377, "learning_rate": 1.4945743229142772e-08, "loss": 1.0107, "step": 10460 }, { "epoch": 1.95, "grad_norm": 2.7922748699778244, "learning_rate": 1.4385246688959909e-08, "loss": 0.9823, "step": 10465 }, { "epoch": 1.95, "grad_norm": 2.4732334084056387, "learning_rate": 1.3835446724810852e-08, "loss": 1.0322, "step": 10470 }, { "epoch": 1.95, "grad_norm": 2.918632979618341, "learning_rate": 1.3296344516285364e-08, "loss": 1.0167, "step": 10475 }, { "epoch": 1.95, "grad_norm": 2.5363575883520846, "learning_rate": 1.2767941220020453e-08, "loss": 0.9996, "step": 10480 }, { "epoch": 1.96, "grad_norm": 2.7942753065364663, "learning_rate": 1.2250237969699263e-08, "loss": 1.0262, "step": 10485 }, { "epoch": 1.96, "grad_norm": 2.6713751330423516, "learning_rate": 1.1743235876047753e-08, "loss": 1.0308, "step": 10490 }, { "epoch": 1.96, "grad_norm": 2.6195251533724946, "learning_rate": 1.1246936026832466e-08, "loss": 0.9306, "step": 10495 }, { "epoch": 1.96, "grad_norm": 2.673566984697063, "learning_rate": 1.0761339486859424e-08, "loss": 1.0086, "step": 10500 }, { "epoch": 1.96, "grad_norm": 2.6680706352937285, "learning_rate": 1.0286447297969682e-08, "loss": 1.0331, "step": 10505 }, { "epoch": 1.96, "grad_norm": 2.7859246917263087, "learning_rate": 9.822260479038226e-09, "loss": 0.992, "step": 10510 }, { "epoch": 1.96, "grad_norm": 2.3574649028330765, "learning_rate": 9.368780025972856e-09, "loss": 1.0498, "step": 10515 }, { "epoch": 1.96, "grad_norm": 2.2777885112050225, "learning_rate": 8.926006911710305e-09, "loss": 0.9413, "step": 10520 }, { "epoch": 1.96, "grad_norm": 2.5295332699440527, "learning_rate": 8.493942086214568e-09, "loss": 1.0083, "step": 10525 }, { "epoch": 1.96, "grad_norm": 2.7853874196558737, "learning_rate": 8.072586476475797e-09, "loss": 1.0256, "step": 10530 }, { "epoch": 1.96, "grad_norm": 2.6333818412429935, "learning_rate": 7.661940986507522e-09, "loss": 1.0239, "step": 10535 }, { "epoch": 1.97, "grad_norm": 2.549575059800705, "learning_rate": 7.262006497344431e-09, "loss": 1.0001, "step": 10540 }, { "epoch": 1.97, "grad_norm": 2.2772256153873895, "learning_rate": 6.872783867041266e-09, "loss": 0.958, "step": 10545 }, { "epoch": 1.97, "grad_norm": 2.6683332998055147, "learning_rate": 6.494273930670036e-09, "loss": 1.0027, "step": 10550 }, { "epoch": 1.97, "grad_norm": 2.5321362849934115, "learning_rate": 6.126477500319472e-09, "loss": 1.025, "step": 10555 }, { "epoch": 1.97, "grad_norm": 2.602531237642126, "learning_rate": 5.7693953650928e-09, "loss": 1.063, "step": 10560 }, { "epoch": 1.97, "grad_norm": 2.305373620166385, "learning_rate": 5.42302829110497e-09, "loss": 1.0504, "step": 10565 }, { "epoch": 1.97, "grad_norm": 2.6405599677279197, "learning_rate": 5.087377021482099e-09, "loss": 1.0107, "step": 10570 }, { "epoch": 1.97, "grad_norm": 2.4886321109441547, "learning_rate": 4.762442276360357e-09, "loss": 0.9846, "step": 10575 }, { "epoch": 1.97, "grad_norm": 2.404716196110838, "learning_rate": 4.448224752883201e-09, "loss": 1.0061, "step": 10580 }, { "epoch": 1.97, "grad_norm": 2.6094386667769336, "learning_rate": 4.144725125200255e-09, "loss": 0.9906, "step": 10585 }, { "epoch": 1.97, "grad_norm": 3.0107792922490466, "learning_rate": 3.85194404446676e-09, "loss": 1.0194, "step": 10590 }, { "epoch": 1.98, "grad_norm": 2.849093851045047, "learning_rate": 3.5698821388407966e-09, "loss": 0.9846, "step": 10595 }, { "epoch": 1.98, "grad_norm": 2.625766042793939, "learning_rate": 3.2985400134827318e-09, "loss": 0.9563, "step": 10600 }, { "epoch": 1.98, "grad_norm": 2.6570589229153803, "learning_rate": 3.037918250553551e-09, "loss": 1.0326, "step": 10605 }, { "epoch": 1.98, "grad_norm": 2.3590551907855484, "learning_rate": 2.7880174092148603e-09, "loss": 0.9736, "step": 10610 }, { "epoch": 1.98, "grad_norm": 2.3928499657909925, "learning_rate": 2.548838025625e-09, "loss": 0.9729, "step": 10615 }, { "epoch": 1.98, "grad_norm": 2.6017921989296315, "learning_rate": 2.3203806129407092e-09, "loss": 0.9762, "step": 10620 }, { "epoch": 1.98, "grad_norm": 2.5268978088365888, "learning_rate": 2.1026456613143508e-09, "loss": 1.0171, "step": 10625 }, { "epoch": 1.98, "grad_norm": 2.647145075223155, "learning_rate": 1.8956336378944673e-09, "loss": 0.9712, "step": 10630 }, { "epoch": 1.98, "grad_norm": 3.2187105524163186, "learning_rate": 1.6993449868218937e-09, "loss": 0.9911, "step": 10635 }, { "epoch": 1.98, "grad_norm": 2.6908790885372715, "learning_rate": 1.5137801292325338e-09, "loss": 0.9815, "step": 10640 }, { "epoch": 1.99, "grad_norm": 2.5243341858010324, "learning_rate": 1.3389394632523644e-09, "loss": 1.0148, "step": 10645 }, { "epoch": 1.99, "grad_norm": 3.157707192953073, "learning_rate": 1.1748233640007655e-09, "loss": 1.0329, "step": 10650 }, { "epoch": 1.99, "grad_norm": 2.789228515124869, "learning_rate": 1.02143218358719e-09, "loss": 1.0507, "step": 10655 }, { "epoch": 1.99, "grad_norm": 2.494956425208091, "learning_rate": 8.787662511094975e-10, "loss": 1.009, "step": 10660 }, { "epoch": 1.99, "grad_norm": 2.3012094212140335, "learning_rate": 7.468258726561761e-10, "loss": 1.0183, "step": 10665 }, { "epoch": 1.99, "grad_norm": 3.027904709311952, "learning_rate": 6.256113313035661e-10, "loss": 0.9953, "step": 10670 }, { "epoch": 1.99, "grad_norm": 2.6485213093531996, "learning_rate": 5.151228871164149e-10, "loss": 1.0034, "step": 10675 }, { "epoch": 1.99, "grad_norm": 2.341845531967239, "learning_rate": 4.1536077714621206e-10, "loss": 0.9533, "step": 10680 }, { "epoch": 1.99, "grad_norm": 2.866112027395985, "learning_rate": 3.2632521543118913e-10, "loss": 0.9769, "step": 10685 }, { "epoch": 1.99, "grad_norm": 2.3197040560602704, "learning_rate": 2.4801639299576465e-10, "loss": 0.9946, "step": 10690 }, { "epoch": 1.99, "grad_norm": 2.961026582811728, "learning_rate": 1.8043447785109914e-10, "loss": 0.9354, "step": 10695 }, { "epoch": 2.0, "grad_norm": 2.7427152602676603, "learning_rate": 1.2357961499342985e-10, "loss": 0.9992, "step": 10700 }, { "epoch": 2.0, "grad_norm": 2.674107444859358, "learning_rate": 7.745192640351562e-11, "loss": 1.0313, "step": 10705 }, { "epoch": 2.0, "grad_norm": 2.591490853219579, "learning_rate": 4.2051511048302184e-11, "loss": 0.9762, "step": 10710 }, { "epoch": 2.0, "grad_norm": 2.318869364141943, "learning_rate": 1.7378444879256885e-11, "loss": 0.9667, "step": 10715 }, { "epoch": 2.0, "grad_norm": 2.6035429249427002, "learning_rate": 3.4327808312584197e-12, "loss": 1.0605, "step": 10720 }, { "epoch": 2.0, "step": 10724, "total_flos": 1.675657501170729e+17, "train_loss": 1.1225166902173949, "train_runtime": 37990.1661, "train_samples_per_second": 36.134, "train_steps_per_second": 0.282 } ], "logging_steps": 5, "max_steps": 10724, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 1.675657501170729e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }