{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9988694177501414, "eval_steps": 500, "global_step": 1326, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015074429998115696, "grad_norm": 2.758618933635527, "learning_rate": 1.5037593984962406e-07, "loss": 0.7902, "step": 1 }, { "epoch": 0.003014885999623139, "grad_norm": 2.6991611264758757, "learning_rate": 3.007518796992481e-07, "loss": 0.7844, "step": 2 }, { "epoch": 0.0045223289994347085, "grad_norm": 2.7272565236365143, "learning_rate": 4.511278195488722e-07, "loss": 0.7626, "step": 3 }, { "epoch": 0.006029771999246278, "grad_norm": 2.7528117182790965, "learning_rate": 6.015037593984962e-07, "loss": 0.7858, "step": 4 }, { "epoch": 0.007537214999057848, "grad_norm": 2.775699578303916, "learning_rate": 7.518796992481203e-07, "loss": 0.8026, "step": 5 }, { "epoch": 0.009044657998869417, "grad_norm": 2.6767851045683204, "learning_rate": 9.022556390977444e-07, "loss": 0.769, "step": 6 }, { "epoch": 0.010552100998680987, "grad_norm": 2.5261120500748224, "learning_rate": 1.0526315789473685e-06, "loss": 0.7706, "step": 7 }, { "epoch": 0.012059543998492557, "grad_norm": 2.514977460751076, "learning_rate": 1.2030075187969925e-06, "loss": 0.7785, "step": 8 }, { "epoch": 0.013566986998304126, "grad_norm": 2.200241814288396, "learning_rate": 1.3533834586466167e-06, "loss": 0.7499, "step": 9 }, { "epoch": 0.015074429998115696, "grad_norm": 2.212492267506947, "learning_rate": 1.5037593984962406e-06, "loss": 0.784, "step": 10 }, { "epoch": 0.016581872997927266, "grad_norm": 1.9319703389138259, "learning_rate": 1.6541353383458648e-06, "loss": 0.7302, "step": 11 }, { "epoch": 0.018089315997738834, "grad_norm": 1.2550044831716305, "learning_rate": 1.8045112781954887e-06, "loss": 0.7056, "step": 12 }, { "epoch": 0.019596758997550406, "grad_norm": 1.2195135372913062, "learning_rate": 1.9548872180451127e-06, "loss": 0.7014, "step": 13 }, { "epoch": 0.021104201997361974, "grad_norm": 1.2455313883450765, "learning_rate": 2.105263157894737e-06, "loss": 0.7374, "step": 14 }, { "epoch": 0.022611644997173545, "grad_norm": 1.1872107473955416, "learning_rate": 2.255639097744361e-06, "loss": 0.7186, "step": 15 }, { "epoch": 0.024119087996985113, "grad_norm": 1.1982009143546264, "learning_rate": 2.406015037593985e-06, "loss": 0.6828, "step": 16 }, { "epoch": 0.025626530996796685, "grad_norm": 2.3539969715580384, "learning_rate": 2.556390977443609e-06, "loss": 0.6636, "step": 17 }, { "epoch": 0.027133973996608253, "grad_norm": 2.6228143835706765, "learning_rate": 2.7067669172932333e-06, "loss": 0.6967, "step": 18 }, { "epoch": 0.028641416996419825, "grad_norm": 2.224580122320562, "learning_rate": 2.8571428571428573e-06, "loss": 0.6761, "step": 19 }, { "epoch": 0.030148859996231393, "grad_norm": 1.6908938829069853, "learning_rate": 3.007518796992481e-06, "loss": 0.6711, "step": 20 }, { "epoch": 0.03165630299604296, "grad_norm": 1.2313675848377437, "learning_rate": 3.157894736842105e-06, "loss": 0.6731, "step": 21 }, { "epoch": 0.03316374599585453, "grad_norm": 1.0688239960942414, "learning_rate": 3.3082706766917295e-06, "loss": 0.6766, "step": 22 }, { "epoch": 0.034671188995666104, "grad_norm": 1.0400364411240592, "learning_rate": 3.4586466165413535e-06, "loss": 0.6388, "step": 23 }, { "epoch": 0.03617863199547767, "grad_norm": 1.1273406799386165, "learning_rate": 3.6090225563909775e-06, "loss": 0.6406, "step": 24 }, { "epoch": 0.03768607499528924, "grad_norm": 1.1097242153227487, "learning_rate": 3.7593984962406014e-06, "loss": 0.6316, "step": 25 }, { "epoch": 0.03919351799510081, "grad_norm": 0.9446010057713108, "learning_rate": 3.909774436090225e-06, "loss": 0.6023, "step": 26 }, { "epoch": 0.04070096099491238, "grad_norm": 0.8323088497080903, "learning_rate": 4.06015037593985e-06, "loss": 0.6183, "step": 27 }, { "epoch": 0.04220840399472395, "grad_norm": 0.7872962129475931, "learning_rate": 4.210526315789474e-06, "loss": 0.603, "step": 28 }, { "epoch": 0.04371584699453552, "grad_norm": 0.7767297100748087, "learning_rate": 4.360902255639098e-06, "loss": 0.6217, "step": 29 }, { "epoch": 0.04522328999434709, "grad_norm": 0.8604923635307716, "learning_rate": 4.511278195488722e-06, "loss": 0.5855, "step": 30 }, { "epoch": 0.046730732994158655, "grad_norm": 0.8294236539215625, "learning_rate": 4.661654135338346e-06, "loss": 0.5933, "step": 31 }, { "epoch": 0.04823817599397023, "grad_norm": 0.7967164005183986, "learning_rate": 4.81203007518797e-06, "loss": 0.6159, "step": 32 }, { "epoch": 0.0497456189937818, "grad_norm": 0.6830740716112117, "learning_rate": 4.962406015037594e-06, "loss": 0.5752, "step": 33 }, { "epoch": 0.05125306199359337, "grad_norm": 0.6829489822638658, "learning_rate": 5.112781954887218e-06, "loss": 0.5792, "step": 34 }, { "epoch": 0.052760504993404934, "grad_norm": 0.7505911666509206, "learning_rate": 5.263157894736842e-06, "loss": 0.602, "step": 35 }, { "epoch": 0.054267947993216506, "grad_norm": 0.7281087618287696, "learning_rate": 5.413533834586467e-06, "loss": 0.5994, "step": 36 }, { "epoch": 0.05577539099302808, "grad_norm": 0.7061965815038841, "learning_rate": 5.56390977443609e-06, "loss": 0.5734, "step": 37 }, { "epoch": 0.05728283399283965, "grad_norm": 0.6836377512068608, "learning_rate": 5.7142857142857145e-06, "loss": 0.5543, "step": 38 }, { "epoch": 0.058790276992651214, "grad_norm": 0.6751595521927064, "learning_rate": 5.864661654135339e-06, "loss": 0.5983, "step": 39 }, { "epoch": 0.060297719992462785, "grad_norm": 0.6517307660683558, "learning_rate": 6.015037593984962e-06, "loss": 0.5482, "step": 40 }, { "epoch": 0.06180516299227436, "grad_norm": 0.670740477832527, "learning_rate": 6.165413533834587e-06, "loss": 0.5771, "step": 41 }, { "epoch": 0.06331260599208592, "grad_norm": 0.6386183426057812, "learning_rate": 6.31578947368421e-06, "loss": 0.5338, "step": 42 }, { "epoch": 0.0648200489918975, "grad_norm": 0.6325251598452951, "learning_rate": 6.466165413533835e-06, "loss": 0.5409, "step": 43 }, { "epoch": 0.06632749199170906, "grad_norm": 0.6190880971896819, "learning_rate": 6.616541353383459e-06, "loss": 0.5386, "step": 44 }, { "epoch": 0.06783493499152063, "grad_norm": 0.6390822260018926, "learning_rate": 6.766917293233083e-06, "loss": 0.5824, "step": 45 }, { "epoch": 0.06934237799133221, "grad_norm": 0.6866065901609671, "learning_rate": 6.917293233082707e-06, "loss": 0.5661, "step": 46 }, { "epoch": 0.07084982099114377, "grad_norm": 0.6253742924174672, "learning_rate": 7.067669172932331e-06, "loss": 0.5371, "step": 47 }, { "epoch": 0.07235726399095534, "grad_norm": 0.6086976797344416, "learning_rate": 7.218045112781955e-06, "loss": 0.5394, "step": 48 }, { "epoch": 0.07386470699076692, "grad_norm": 0.6617551336853821, "learning_rate": 7.368421052631579e-06, "loss": 0.5481, "step": 49 }, { "epoch": 0.07537214999057848, "grad_norm": 0.6511571841438215, "learning_rate": 7.518796992481203e-06, "loss": 0.557, "step": 50 }, { "epoch": 0.07687959299039004, "grad_norm": 0.6424690635636273, "learning_rate": 7.669172932330828e-06, "loss": 0.554, "step": 51 }, { "epoch": 0.07838703599020162, "grad_norm": 0.6208252498748196, "learning_rate": 7.81954887218045e-06, "loss": 0.5316, "step": 52 }, { "epoch": 0.07989447899001319, "grad_norm": 0.6178927978791646, "learning_rate": 7.969924812030075e-06, "loss": 0.5303, "step": 53 }, { "epoch": 0.08140192198982477, "grad_norm": 0.6246216171964205, "learning_rate": 8.1203007518797e-06, "loss": 0.5551, "step": 54 }, { "epoch": 0.08290936498963633, "grad_norm": 0.6071119189590479, "learning_rate": 8.270676691729324e-06, "loss": 0.5324, "step": 55 }, { "epoch": 0.0844168079894479, "grad_norm": 0.6123261613238393, "learning_rate": 8.421052631578948e-06, "loss": 0.5503, "step": 56 }, { "epoch": 0.08592425098925947, "grad_norm": 0.620387110972641, "learning_rate": 8.571428571428571e-06, "loss": 0.5353, "step": 57 }, { "epoch": 0.08743169398907104, "grad_norm": 0.6185030072259556, "learning_rate": 8.721804511278195e-06, "loss": 0.5544, "step": 58 }, { "epoch": 0.0889391369888826, "grad_norm": 0.6244700607026835, "learning_rate": 8.87218045112782e-06, "loss": 0.5768, "step": 59 }, { "epoch": 0.09044657998869418, "grad_norm": 0.655865564676625, "learning_rate": 9.022556390977444e-06, "loss": 0.5541, "step": 60 }, { "epoch": 0.09195402298850575, "grad_norm": 0.6355623162806917, "learning_rate": 9.172932330827068e-06, "loss": 0.5317, "step": 61 }, { "epoch": 0.09346146598831731, "grad_norm": 0.6620650186277286, "learning_rate": 9.323308270676693e-06, "loss": 0.5825, "step": 62 }, { "epoch": 0.09496890898812889, "grad_norm": 0.6544984607034259, "learning_rate": 9.473684210526315e-06, "loss": 0.5367, "step": 63 }, { "epoch": 0.09647635198794045, "grad_norm": 0.6566999876216955, "learning_rate": 9.62406015037594e-06, "loss": 0.5334, "step": 64 }, { "epoch": 0.09798379498775203, "grad_norm": 0.6538652733227992, "learning_rate": 9.774436090225564e-06, "loss": 0.5088, "step": 65 }, { "epoch": 0.0994912379875636, "grad_norm": 0.7184816645886852, "learning_rate": 9.924812030075189e-06, "loss": 0.5015, "step": 66 }, { "epoch": 0.10099868098737516, "grad_norm": 0.6287887378220718, "learning_rate": 1.0075187969924813e-05, "loss": 0.5171, "step": 67 }, { "epoch": 0.10250612398718674, "grad_norm": 0.7045986205120561, "learning_rate": 1.0225563909774436e-05, "loss": 0.5499, "step": 68 }, { "epoch": 0.1040135669869983, "grad_norm": 0.6263524660452249, "learning_rate": 1.0375939849624062e-05, "loss": 0.5319, "step": 69 }, { "epoch": 0.10552100998680987, "grad_norm": 0.6340009161866458, "learning_rate": 1.0526315789473684e-05, "loss": 0.5295, "step": 70 }, { "epoch": 0.10702845298662145, "grad_norm": 0.6930114141626272, "learning_rate": 1.0676691729323309e-05, "loss": 0.543, "step": 71 }, { "epoch": 0.10853589598643301, "grad_norm": 0.6582132112309497, "learning_rate": 1.0827067669172933e-05, "loss": 0.5164, "step": 72 }, { "epoch": 0.11004333898624458, "grad_norm": 0.6774079053656817, "learning_rate": 1.0977443609022558e-05, "loss": 0.5202, "step": 73 }, { "epoch": 0.11155078198605616, "grad_norm": 0.6435562490480392, "learning_rate": 1.112781954887218e-05, "loss": 0.5095, "step": 74 }, { "epoch": 0.11305822498586772, "grad_norm": 0.7112297517037395, "learning_rate": 1.1278195488721806e-05, "loss": 0.5316, "step": 75 }, { "epoch": 0.1145656679856793, "grad_norm": 0.709494451956929, "learning_rate": 1.1428571428571429e-05, "loss": 0.4935, "step": 76 }, { "epoch": 0.11607311098549086, "grad_norm": 0.6777802836075782, "learning_rate": 1.1578947368421053e-05, "loss": 0.5043, "step": 77 }, { "epoch": 0.11758055398530243, "grad_norm": 0.6296151489375509, "learning_rate": 1.1729323308270678e-05, "loss": 0.4874, "step": 78 }, { "epoch": 0.119087996985114, "grad_norm": 0.6808431409244452, "learning_rate": 1.1879699248120302e-05, "loss": 0.4788, "step": 79 }, { "epoch": 0.12059543998492557, "grad_norm": 0.6704429377361576, "learning_rate": 1.2030075187969925e-05, "loss": 0.5011, "step": 80 }, { "epoch": 0.12210288298473713, "grad_norm": 0.6926069766970787, "learning_rate": 1.2180451127819551e-05, "loss": 0.496, "step": 81 }, { "epoch": 0.12361032598454871, "grad_norm": 0.639818862010909, "learning_rate": 1.2330827067669174e-05, "loss": 0.5308, "step": 82 }, { "epoch": 0.12511776898436028, "grad_norm": 0.6204899572762589, "learning_rate": 1.2481203007518798e-05, "loss": 0.5063, "step": 83 }, { "epoch": 0.12662521198417184, "grad_norm": 0.6865925022658576, "learning_rate": 1.263157894736842e-05, "loss": 0.507, "step": 84 }, { "epoch": 0.1281326549839834, "grad_norm": 0.7029706975479946, "learning_rate": 1.2781954887218047e-05, "loss": 0.5209, "step": 85 }, { "epoch": 0.129640097983795, "grad_norm": 0.6524424672188123, "learning_rate": 1.293233082706767e-05, "loss": 0.5527, "step": 86 }, { "epoch": 0.13114754098360656, "grad_norm": 0.6074805023838824, "learning_rate": 1.3082706766917295e-05, "loss": 0.4873, "step": 87 }, { "epoch": 0.13265498398341813, "grad_norm": 0.6891851394415897, "learning_rate": 1.3233082706766918e-05, "loss": 0.5335, "step": 88 }, { "epoch": 0.1341624269832297, "grad_norm": 0.6322157680641546, "learning_rate": 1.3383458646616543e-05, "loss": 0.5163, "step": 89 }, { "epoch": 0.13566986998304126, "grad_norm": 0.6652911326311045, "learning_rate": 1.3533834586466165e-05, "loss": 0.5227, "step": 90 }, { "epoch": 0.13717731298285282, "grad_norm": 0.7241927650908743, "learning_rate": 1.3684210526315791e-05, "loss": 0.5271, "step": 91 }, { "epoch": 0.13868475598266441, "grad_norm": 0.654474241851782, "learning_rate": 1.3834586466165414e-05, "loss": 0.5154, "step": 92 }, { "epoch": 0.14019219898247598, "grad_norm": 0.7184414157305912, "learning_rate": 1.3984962406015038e-05, "loss": 0.5077, "step": 93 }, { "epoch": 0.14169964198228754, "grad_norm": 0.6449190071052974, "learning_rate": 1.4135338345864663e-05, "loss": 0.5038, "step": 94 }, { "epoch": 0.1432070849820991, "grad_norm": 0.691580302982374, "learning_rate": 1.4285714285714287e-05, "loss": 0.4667, "step": 95 }, { "epoch": 0.14471452798191067, "grad_norm": 0.6288139344926038, "learning_rate": 1.443609022556391e-05, "loss": 0.49, "step": 96 }, { "epoch": 0.14622197098172227, "grad_norm": 0.643873596183986, "learning_rate": 1.4586466165413536e-05, "loss": 0.4749, "step": 97 }, { "epoch": 0.14772941398153383, "grad_norm": 0.6755660192421138, "learning_rate": 1.4736842105263159e-05, "loss": 0.5075, "step": 98 }, { "epoch": 0.1492368569813454, "grad_norm": 0.646796309870399, "learning_rate": 1.4887218045112783e-05, "loss": 0.5296, "step": 99 }, { "epoch": 0.15074429998115696, "grad_norm": 0.7044467423953411, "learning_rate": 1.5037593984962406e-05, "loss": 0.5303, "step": 100 }, { "epoch": 0.15225174298096852, "grad_norm": 0.7730989605725896, "learning_rate": 1.5187969924812032e-05, "loss": 0.5099, "step": 101 }, { "epoch": 0.1537591859807801, "grad_norm": 0.6620556405595589, "learning_rate": 1.5338345864661656e-05, "loss": 0.5189, "step": 102 }, { "epoch": 0.15526662898059168, "grad_norm": 0.8038364131821449, "learning_rate": 1.548872180451128e-05, "loss": 0.5152, "step": 103 }, { "epoch": 0.15677407198040325, "grad_norm": 0.6537353177538859, "learning_rate": 1.56390977443609e-05, "loss": 0.5012, "step": 104 }, { "epoch": 0.1582815149802148, "grad_norm": 0.7802019309424624, "learning_rate": 1.578947368421053e-05, "loss": 0.4851, "step": 105 }, { "epoch": 0.15978895798002637, "grad_norm": 0.7590456113216669, "learning_rate": 1.593984962406015e-05, "loss": 0.5077, "step": 106 }, { "epoch": 0.16129640097983794, "grad_norm": 0.6740526095538228, "learning_rate": 1.6090225563909775e-05, "loss": 0.4794, "step": 107 }, { "epoch": 0.16280384397964953, "grad_norm": 0.7266285917065574, "learning_rate": 1.62406015037594e-05, "loss": 0.5368, "step": 108 }, { "epoch": 0.1643112869794611, "grad_norm": 0.7202106895600753, "learning_rate": 1.6390977443609023e-05, "loss": 0.5077, "step": 109 }, { "epoch": 0.16581872997927266, "grad_norm": 0.7646664609937389, "learning_rate": 1.6541353383458648e-05, "loss": 0.517, "step": 110 }, { "epoch": 0.16732617297908423, "grad_norm": 0.7090240598112959, "learning_rate": 1.6691729323308272e-05, "loss": 0.5217, "step": 111 }, { "epoch": 0.1688336159788958, "grad_norm": 0.7260255784190195, "learning_rate": 1.6842105263157896e-05, "loss": 0.4977, "step": 112 }, { "epoch": 0.17034105897870735, "grad_norm": 0.6392143364785348, "learning_rate": 1.699248120300752e-05, "loss": 0.4844, "step": 113 }, { "epoch": 0.17184850197851895, "grad_norm": 0.7380625519153193, "learning_rate": 1.7142857142857142e-05, "loss": 0.5063, "step": 114 }, { "epoch": 0.1733559449783305, "grad_norm": 0.6999588028799495, "learning_rate": 1.729323308270677e-05, "loss": 0.5124, "step": 115 }, { "epoch": 0.17486338797814208, "grad_norm": 0.7189938723035283, "learning_rate": 1.744360902255639e-05, "loss": 0.4723, "step": 116 }, { "epoch": 0.17637083097795364, "grad_norm": 0.755925477740424, "learning_rate": 1.7593984962406015e-05, "loss": 0.5071, "step": 117 }, { "epoch": 0.1778782739777652, "grad_norm": 0.6932179016486248, "learning_rate": 1.774436090225564e-05, "loss": 0.4813, "step": 118 }, { "epoch": 0.1793857169775768, "grad_norm": 0.6803984999939205, "learning_rate": 1.7894736842105264e-05, "loss": 0.4537, "step": 119 }, { "epoch": 0.18089315997738836, "grad_norm": 0.6943064601614549, "learning_rate": 1.8045112781954888e-05, "loss": 0.5302, "step": 120 }, { "epoch": 0.18240060297719993, "grad_norm": 0.7024561418818766, "learning_rate": 1.8195488721804512e-05, "loss": 0.5292, "step": 121 }, { "epoch": 0.1839080459770115, "grad_norm": 0.6516091971357849, "learning_rate": 1.8345864661654137e-05, "loss": 0.5192, "step": 122 }, { "epoch": 0.18541548897682306, "grad_norm": 0.733121234769519, "learning_rate": 1.849624060150376e-05, "loss": 0.5137, "step": 123 }, { "epoch": 0.18692293197663462, "grad_norm": 0.6379606914374305, "learning_rate": 1.8646616541353386e-05, "loss": 0.5004, "step": 124 }, { "epoch": 0.1884303749764462, "grad_norm": 0.682116565034204, "learning_rate": 1.879699248120301e-05, "loss": 0.4932, "step": 125 }, { "epoch": 0.18993781797625778, "grad_norm": 0.6586357032334851, "learning_rate": 1.894736842105263e-05, "loss": 0.4949, "step": 126 }, { "epoch": 0.19144526097606934, "grad_norm": 0.6996866499647233, "learning_rate": 1.909774436090226e-05, "loss": 0.4926, "step": 127 }, { "epoch": 0.1929527039758809, "grad_norm": 0.7344954131354208, "learning_rate": 1.924812030075188e-05, "loss": 0.477, "step": 128 }, { "epoch": 0.19446014697569247, "grad_norm": 0.6945294612726404, "learning_rate": 1.9398496240601504e-05, "loss": 0.4872, "step": 129 }, { "epoch": 0.19596758997550406, "grad_norm": 0.7624604146665339, "learning_rate": 1.954887218045113e-05, "loss": 0.5224, "step": 130 }, { "epoch": 0.19747503297531563, "grad_norm": 0.685229042181731, "learning_rate": 1.9699248120300753e-05, "loss": 0.4964, "step": 131 }, { "epoch": 0.1989824759751272, "grad_norm": 0.8306163709490333, "learning_rate": 1.9849624060150377e-05, "loss": 0.5021, "step": 132 }, { "epoch": 0.20048991897493876, "grad_norm": 0.6752867411242717, "learning_rate": 2e-05, "loss": 0.4946, "step": 133 }, { "epoch": 0.20199736197475032, "grad_norm": 0.9473906923308808, "learning_rate": 1.99999653272242e-05, "loss": 0.5112, "step": 134 }, { "epoch": 0.2035048049745619, "grad_norm": 0.6355233169612663, "learning_rate": 1.9999861309137232e-05, "loss": 0.5318, "step": 135 }, { "epoch": 0.20501224797437348, "grad_norm": 0.8423903087733013, "learning_rate": 1.999968794646042e-05, "loss": 0.5148, "step": 136 }, { "epoch": 0.20651969097418504, "grad_norm": 0.6660475408627802, "learning_rate": 1.9999445240395953e-05, "loss": 0.5178, "step": 137 }, { "epoch": 0.2080271339739966, "grad_norm": 0.72967331295993, "learning_rate": 1.9999133192626893e-05, "loss": 0.5262, "step": 138 }, { "epoch": 0.20953457697380817, "grad_norm": 0.7393548066200798, "learning_rate": 1.9998751805317152e-05, "loss": 0.5057, "step": 139 }, { "epoch": 0.21104201997361974, "grad_norm": 0.687138877245702, "learning_rate": 1.999830108111148e-05, "loss": 0.4958, "step": 140 }, { "epoch": 0.21254946297343133, "grad_norm": 0.7007673502124087, "learning_rate": 1.999778102313545e-05, "loss": 0.4948, "step": 141 }, { "epoch": 0.2140569059732429, "grad_norm": 0.7183800488623966, "learning_rate": 1.999719163499543e-05, "loss": 0.5104, "step": 142 }, { "epoch": 0.21556434897305446, "grad_norm": 0.6535866563135689, "learning_rate": 1.999653292077857e-05, "loss": 0.5145, "step": 143 }, { "epoch": 0.21707179197286602, "grad_norm": 0.64107430044815, "learning_rate": 1.999580488505276e-05, "loss": 0.4659, "step": 144 }, { "epoch": 0.2185792349726776, "grad_norm": 0.6779403955139097, "learning_rate": 1.9995007532866594e-05, "loss": 0.4964, "step": 145 }, { "epoch": 0.22008667797248915, "grad_norm": 0.6539110005752458, "learning_rate": 1.9994140869749366e-05, "loss": 0.5092, "step": 146 }, { "epoch": 0.22159412097230075, "grad_norm": 0.6565635872751927, "learning_rate": 1.9993204901710995e-05, "loss": 0.5185, "step": 147 }, { "epoch": 0.2231015639721123, "grad_norm": 0.6372834025735034, "learning_rate": 1.9992199635241997e-05, "loss": 0.5152, "step": 148 }, { "epoch": 0.22460900697192387, "grad_norm": 0.622264657968412, "learning_rate": 1.999112507731346e-05, "loss": 0.5, "step": 149 }, { "epoch": 0.22611644997173544, "grad_norm": 0.6267246999704467, "learning_rate": 1.9989981235376956e-05, "loss": 0.4897, "step": 150 }, { "epoch": 0.227623892971547, "grad_norm": 0.6551262788067906, "learning_rate": 1.9988768117364526e-05, "loss": 0.5165, "step": 151 }, { "epoch": 0.2291313359713586, "grad_norm": 0.6664514520384526, "learning_rate": 1.9987485731688595e-05, "loss": 0.5002, "step": 152 }, { "epoch": 0.23063877897117016, "grad_norm": 0.6076256276502832, "learning_rate": 1.998613408724195e-05, "loss": 0.5084, "step": 153 }, { "epoch": 0.23214622197098173, "grad_norm": 0.6373492462291207, "learning_rate": 1.998471319339763e-05, "loss": 0.5026, "step": 154 }, { "epoch": 0.2336536649707933, "grad_norm": 1.745130262060046, "learning_rate": 1.9983223060008908e-05, "loss": 0.5034, "step": 155 }, { "epoch": 0.23516110797060485, "grad_norm": 5.046536940100192, "learning_rate": 1.9981663697409203e-05, "loss": 0.5424, "step": 156 }, { "epoch": 0.23666855097041642, "grad_norm": 1.5507957638980387, "learning_rate": 1.998003511641199e-05, "loss": 0.5301, "step": 157 }, { "epoch": 0.238175993970228, "grad_norm": 0.9453551152302114, "learning_rate": 1.997833732831076e-05, "loss": 0.4793, "step": 158 }, { "epoch": 0.23968343697003958, "grad_norm": 1.1864330727309345, "learning_rate": 1.9976570344878916e-05, "loss": 0.5125, "step": 159 }, { "epoch": 0.24119087996985114, "grad_norm": 0.8095198214822489, "learning_rate": 1.9974734178369702e-05, "loss": 0.4904, "step": 160 }, { "epoch": 0.2426983229696627, "grad_norm": 0.620347427984293, "learning_rate": 1.997282884151612e-05, "loss": 0.4611, "step": 161 }, { "epoch": 0.24420576596947427, "grad_norm": 0.7590913383659819, "learning_rate": 1.9970854347530828e-05, "loss": 0.5085, "step": 162 }, { "epoch": 0.24571320896928586, "grad_norm": 0.5932187358928716, "learning_rate": 1.9968810710106065e-05, "loss": 0.49, "step": 163 }, { "epoch": 0.24722065196909743, "grad_norm": 0.7421744580230403, "learning_rate": 1.9966697943413548e-05, "loss": 0.4789, "step": 164 }, { "epoch": 0.248728094968909, "grad_norm": 0.634606964098851, "learning_rate": 1.9964516062104377e-05, "loss": 0.5008, "step": 165 }, { "epoch": 0.25023553796872056, "grad_norm": 0.721486256547781, "learning_rate": 1.996226508130892e-05, "loss": 0.4546, "step": 166 }, { "epoch": 0.25174298096853215, "grad_norm": 0.6785391250628956, "learning_rate": 1.995994501663674e-05, "loss": 0.4892, "step": 167 }, { "epoch": 0.2532504239683437, "grad_norm": 0.5862796281463728, "learning_rate": 1.995755588417644e-05, "loss": 0.4736, "step": 168 }, { "epoch": 0.2547578669681553, "grad_norm": 0.6656043784418029, "learning_rate": 1.99550977004956e-05, "loss": 0.4749, "step": 169 }, { "epoch": 0.2562653099679668, "grad_norm": 0.6091440270236256, "learning_rate": 1.9952570482640628e-05, "loss": 0.4997, "step": 170 }, { "epoch": 0.2577727529677784, "grad_norm": 0.7671131220608588, "learning_rate": 1.9949974248136655e-05, "loss": 0.4741, "step": 171 }, { "epoch": 0.25928019596759, "grad_norm": 0.6246356814602296, "learning_rate": 1.9947309014987414e-05, "loss": 0.4727, "step": 172 }, { "epoch": 0.26078763896740154, "grad_norm": 0.7874820318511245, "learning_rate": 1.9944574801675106e-05, "loss": 0.4965, "step": 173 }, { "epoch": 0.26229508196721313, "grad_norm": 0.6727323732082747, "learning_rate": 1.9941771627160287e-05, "loss": 0.5361, "step": 174 }, { "epoch": 0.26380252496702467, "grad_norm": 0.6896058530733802, "learning_rate": 1.9938899510881732e-05, "loss": 0.4574, "step": 175 }, { "epoch": 0.26530996796683626, "grad_norm": 0.6396080754362474, "learning_rate": 1.9935958472756283e-05, "loss": 0.4791, "step": 176 }, { "epoch": 0.26681741096664785, "grad_norm": 0.6978820918898457, "learning_rate": 1.993294853317873e-05, "loss": 0.4936, "step": 177 }, { "epoch": 0.2683248539664594, "grad_norm": 0.6200726182474722, "learning_rate": 1.9929869713021668e-05, "loss": 0.4809, "step": 178 }, { "epoch": 0.269832296966271, "grad_norm": 0.6621164817055001, "learning_rate": 1.9926722033635343e-05, "loss": 0.4833, "step": 179 }, { "epoch": 0.2713397399660825, "grad_norm": 0.6443066814567524, "learning_rate": 1.9923505516847514e-05, "loss": 0.452, "step": 180 }, { "epoch": 0.2728471829658941, "grad_norm": 0.6324303832157692, "learning_rate": 1.9920220184963296e-05, "loss": 0.4942, "step": 181 }, { "epoch": 0.27435462596570565, "grad_norm": 0.7093590972609833, "learning_rate": 1.9916866060764994e-05, "loss": 0.4666, "step": 182 }, { "epoch": 0.27586206896551724, "grad_norm": 0.6056554558199854, "learning_rate": 1.991344316751198e-05, "loss": 0.48, "step": 183 }, { "epoch": 0.27736951196532883, "grad_norm": 0.6668247301054864, "learning_rate": 1.9909951528940485e-05, "loss": 0.4892, "step": 184 }, { "epoch": 0.27887695496514037, "grad_norm": 0.6248269362388523, "learning_rate": 1.990639116926348e-05, "loss": 0.4601, "step": 185 }, { "epoch": 0.28038439796495196, "grad_norm": 0.5988280290089756, "learning_rate": 1.9902762113170467e-05, "loss": 0.4671, "step": 186 }, { "epoch": 0.2818918409647635, "grad_norm": 0.6183022850194487, "learning_rate": 1.989906438582734e-05, "loss": 0.5023, "step": 187 }, { "epoch": 0.2833992839645751, "grad_norm": 0.6345731772578389, "learning_rate": 1.9895298012876192e-05, "loss": 0.4749, "step": 188 }, { "epoch": 0.2849067269643867, "grad_norm": 0.6026738883514794, "learning_rate": 1.9891463020435144e-05, "loss": 0.4884, "step": 189 }, { "epoch": 0.2864141699641982, "grad_norm": 0.6275566066201014, "learning_rate": 1.9887559435098162e-05, "loss": 0.4868, "step": 190 }, { "epoch": 0.2879216129640098, "grad_norm": 0.6830623512458401, "learning_rate": 1.9883587283934875e-05, "loss": 0.4797, "step": 191 }, { "epoch": 0.28942905596382135, "grad_norm": 0.621100203862078, "learning_rate": 1.9879546594490383e-05, "loss": 0.4781, "step": 192 }, { "epoch": 0.29093649896363294, "grad_norm": 0.7266845450092815, "learning_rate": 1.987543739478507e-05, "loss": 0.4838, "step": 193 }, { "epoch": 0.29244394196344453, "grad_norm": 0.5998498411317879, "learning_rate": 1.987125971331441e-05, "loss": 0.4809, "step": 194 }, { "epoch": 0.29395138496325607, "grad_norm": 0.7629414665635117, "learning_rate": 1.9867013579048765e-05, "loss": 0.4891, "step": 195 }, { "epoch": 0.29545882796306766, "grad_norm": 0.6340989750127195, "learning_rate": 1.9862699021433186e-05, "loss": 0.4696, "step": 196 }, { "epoch": 0.2969662709628792, "grad_norm": 0.6600966786500729, "learning_rate": 1.9858316070387208e-05, "loss": 0.4568, "step": 197 }, { "epoch": 0.2984737139626908, "grad_norm": 0.6836572646612057, "learning_rate": 1.9853864756304654e-05, "loss": 0.4849, "step": 198 }, { "epoch": 0.2999811569625024, "grad_norm": 0.5912116643865833, "learning_rate": 1.9849345110053405e-05, "loss": 0.4752, "step": 199 }, { "epoch": 0.3014885999623139, "grad_norm": 0.6202584603281575, "learning_rate": 1.984475716297519e-05, "loss": 0.478, "step": 200 }, { "epoch": 0.3029960429621255, "grad_norm": 0.6021031729150327, "learning_rate": 1.984010094688539e-05, "loss": 0.4818, "step": 201 }, { "epoch": 0.30450348596193705, "grad_norm": 0.6013263404823498, "learning_rate": 1.9835376494072788e-05, "loss": 0.4798, "step": 202 }, { "epoch": 0.30601092896174864, "grad_norm": 0.6391395935977097, "learning_rate": 1.9830583837299363e-05, "loss": 0.5079, "step": 203 }, { "epoch": 0.3075183719615602, "grad_norm": 0.6403126078695583, "learning_rate": 1.9825723009800058e-05, "loss": 0.4994, "step": 204 }, { "epoch": 0.30902581496137177, "grad_norm": 0.6996650791864127, "learning_rate": 1.9820794045282553e-05, "loss": 0.458, "step": 205 }, { "epoch": 0.31053325796118336, "grad_norm": 0.6120915229627125, "learning_rate": 1.9815796977927015e-05, "loss": 0.4837, "step": 206 }, { "epoch": 0.3120407009609949, "grad_norm": 0.698625059347094, "learning_rate": 1.9810731842385892e-05, "loss": 0.4762, "step": 207 }, { "epoch": 0.3135481439608065, "grad_norm": 0.6458152328354264, "learning_rate": 1.9805598673783644e-05, "loss": 0.4877, "step": 208 }, { "epoch": 0.31505558696061803, "grad_norm": 0.6183456339468536, "learning_rate": 1.980039750771651e-05, "loss": 0.4555, "step": 209 }, { "epoch": 0.3165630299604296, "grad_norm": 0.6625119162294268, "learning_rate": 1.9795128380252263e-05, "loss": 0.467, "step": 210 }, { "epoch": 0.3180704729602412, "grad_norm": 0.5634839413053515, "learning_rate": 1.978979132792996e-05, "loss": 0.4664, "step": 211 }, { "epoch": 0.31957791596005275, "grad_norm": 0.6026219616185302, "learning_rate": 1.9784386387759684e-05, "loss": 0.4774, "step": 212 }, { "epoch": 0.32108535895986434, "grad_norm": 0.6269218843440012, "learning_rate": 1.977891359722229e-05, "loss": 0.4432, "step": 213 }, { "epoch": 0.3225928019596759, "grad_norm": 0.5910572611931473, "learning_rate": 1.9773372994269147e-05, "loss": 0.4699, "step": 214 }, { "epoch": 0.32410024495948747, "grad_norm": 0.6536939644754692, "learning_rate": 1.976776461732187e-05, "loss": 0.4807, "step": 215 }, { "epoch": 0.32560768795929906, "grad_norm": 0.6191726708771672, "learning_rate": 1.976208850527206e-05, "loss": 0.4944, "step": 216 }, { "epoch": 0.3271151309591106, "grad_norm": 0.6298298802683915, "learning_rate": 1.9756344697481027e-05, "loss": 0.4862, "step": 217 }, { "epoch": 0.3286225739589222, "grad_norm": 0.6539080215758202, "learning_rate": 1.975053323377952e-05, "loss": 0.4817, "step": 218 }, { "epoch": 0.33013001695873373, "grad_norm": 0.6146092404035427, "learning_rate": 1.9744654154467468e-05, "loss": 0.5422, "step": 219 }, { "epoch": 0.3316374599585453, "grad_norm": 0.6490399293285286, "learning_rate": 1.9738707500313655e-05, "loss": 0.4703, "step": 220 }, { "epoch": 0.3331449029583569, "grad_norm": 0.615019483124787, "learning_rate": 1.9732693312555492e-05, "loss": 0.4801, "step": 221 }, { "epoch": 0.33465234595816845, "grad_norm": 0.5547372740595196, "learning_rate": 1.9726611632898693e-05, "loss": 0.4719, "step": 222 }, { "epoch": 0.33615978895798004, "grad_norm": 0.6076625752065381, "learning_rate": 1.9720462503517e-05, "loss": 0.498, "step": 223 }, { "epoch": 0.3376672319577916, "grad_norm": 0.6047398581844834, "learning_rate": 1.971424596705189e-05, "loss": 0.4643, "step": 224 }, { "epoch": 0.3391746749576032, "grad_norm": 0.5727445152315086, "learning_rate": 1.9707962066612278e-05, "loss": 0.4515, "step": 225 }, { "epoch": 0.3406821179574147, "grad_norm": 0.5573272058322264, "learning_rate": 1.970161084577422e-05, "loss": 0.4524, "step": 226 }, { "epoch": 0.3421895609572263, "grad_norm": 0.6257676922974255, "learning_rate": 1.9695192348580606e-05, "loss": 0.4815, "step": 227 }, { "epoch": 0.3436970039570379, "grad_norm": 0.5745183403896584, "learning_rate": 1.9688706619540863e-05, "loss": 0.4717, "step": 228 }, { "epoch": 0.34520444695684943, "grad_norm": 0.5964564340890054, "learning_rate": 1.968215370363063e-05, "loss": 0.4839, "step": 229 }, { "epoch": 0.346711889956661, "grad_norm": 0.5672877352491237, "learning_rate": 1.9675533646291463e-05, "loss": 0.4914, "step": 230 }, { "epoch": 0.34821933295647256, "grad_norm": 0.6672213227292868, "learning_rate": 1.9668846493430522e-05, "loss": 0.4718, "step": 231 }, { "epoch": 0.34972677595628415, "grad_norm": 0.5546791014566226, "learning_rate": 1.9662092291420233e-05, "loss": 0.4392, "step": 232 }, { "epoch": 0.35123421895609575, "grad_norm": 0.6546100852352986, "learning_rate": 1.965527108709798e-05, "loss": 0.4836, "step": 233 }, { "epoch": 0.3527416619559073, "grad_norm": 0.5992362903479123, "learning_rate": 1.964838292776579e-05, "loss": 0.4464, "step": 234 }, { "epoch": 0.3542491049557189, "grad_norm": 0.63523888294575, "learning_rate": 1.9641427861189973e-05, "loss": 0.4856, "step": 235 }, { "epoch": 0.3557565479555304, "grad_norm": 0.6032723041133213, "learning_rate": 1.963440593560083e-05, "loss": 0.4966, "step": 236 }, { "epoch": 0.357263990955342, "grad_norm": 0.6306498314236755, "learning_rate": 1.9627317199692287e-05, "loss": 0.4771, "step": 237 }, { "epoch": 0.3587714339551536, "grad_norm": 0.5865071462782886, "learning_rate": 1.962016170262157e-05, "loss": 0.4573, "step": 238 }, { "epoch": 0.36027887695496513, "grad_norm": 0.5665927327271444, "learning_rate": 1.961293949400888e-05, "loss": 0.4485, "step": 239 }, { "epoch": 0.3617863199547767, "grad_norm": 0.554220781330076, "learning_rate": 1.960565062393701e-05, "loss": 0.4686, "step": 240 }, { "epoch": 0.36329376295458826, "grad_norm": 0.6455923732389204, "learning_rate": 1.9598295142951035e-05, "loss": 0.4592, "step": 241 }, { "epoch": 0.36480120595439985, "grad_norm": 0.596721778819204, "learning_rate": 1.9590873102057948e-05, "loss": 0.4907, "step": 242 }, { "epoch": 0.36630864895421145, "grad_norm": 0.6716627212373145, "learning_rate": 1.9583384552726294e-05, "loss": 0.4799, "step": 243 }, { "epoch": 0.367816091954023, "grad_norm": 0.6229194933798746, "learning_rate": 1.957582954688584e-05, "loss": 0.4652, "step": 244 }, { "epoch": 0.3693235349538346, "grad_norm": 0.6298037085236075, "learning_rate": 1.9568208136927177e-05, "loss": 0.4717, "step": 245 }, { "epoch": 0.3708309779536461, "grad_norm": 0.5787887978421966, "learning_rate": 1.9560520375701408e-05, "loss": 0.4845, "step": 246 }, { "epoch": 0.3723384209534577, "grad_norm": 0.6451526421523999, "learning_rate": 1.9552766316519726e-05, "loss": 0.4516, "step": 247 }, { "epoch": 0.37384586395326924, "grad_norm": 0.538692705705553, "learning_rate": 1.9544946013153093e-05, "loss": 0.4649, "step": 248 }, { "epoch": 0.37535330695308083, "grad_norm": 0.6399780775437526, "learning_rate": 1.9537059519831822e-05, "loss": 0.4594, "step": 249 }, { "epoch": 0.3768607499528924, "grad_norm": 0.6082935211607333, "learning_rate": 1.9529106891245244e-05, "loss": 0.4709, "step": 250 }, { "epoch": 0.37836819295270396, "grad_norm": 0.6106738888512755, "learning_rate": 1.9521088182541298e-05, "loss": 0.492, "step": 251 }, { "epoch": 0.37987563595251556, "grad_norm": 0.5803041737823633, "learning_rate": 1.951300344932616e-05, "loss": 0.4646, "step": 252 }, { "epoch": 0.3813830789523271, "grad_norm": 0.5647638332240319, "learning_rate": 1.9504852747663862e-05, "loss": 0.4725, "step": 253 }, { "epoch": 0.3828905219521387, "grad_norm": 0.664315669006426, "learning_rate": 1.9496636134075894e-05, "loss": 0.4689, "step": 254 }, { "epoch": 0.3843979649519503, "grad_norm": 0.6019633789641826, "learning_rate": 1.9488353665540813e-05, "loss": 0.4613, "step": 255 }, { "epoch": 0.3859054079517618, "grad_norm": 0.5805016640621002, "learning_rate": 1.9480005399493857e-05, "loss": 0.4613, "step": 256 }, { "epoch": 0.3874128509515734, "grad_norm": 0.6053466035481387, "learning_rate": 1.9471591393826536e-05, "loss": 0.4877, "step": 257 }, { "epoch": 0.38892029395138494, "grad_norm": 0.5443749204002357, "learning_rate": 1.9463111706886234e-05, "loss": 0.481, "step": 258 }, { "epoch": 0.39042773695119654, "grad_norm": 0.6422687053592201, "learning_rate": 1.9454566397475813e-05, "loss": 0.464, "step": 259 }, { "epoch": 0.39193517995100813, "grad_norm": 0.5911574213296809, "learning_rate": 1.944595552485319e-05, "loss": 0.4451, "step": 260 }, { "epoch": 0.39344262295081966, "grad_norm": 0.6244696365384524, "learning_rate": 1.943727914873094e-05, "loss": 0.465, "step": 261 }, { "epoch": 0.39495006595063126, "grad_norm": 0.6787496907794774, "learning_rate": 1.9428537329275862e-05, "loss": 0.4591, "step": 262 }, { "epoch": 0.3964575089504428, "grad_norm": 0.584284155721975, "learning_rate": 1.941973012710859e-05, "loss": 0.4835, "step": 263 }, { "epoch": 0.3979649519502544, "grad_norm": 0.6636147745329853, "learning_rate": 1.941085760330316e-05, "loss": 0.4558, "step": 264 }, { "epoch": 0.3994723949500659, "grad_norm": 0.580002453326873, "learning_rate": 1.940191981938657e-05, "loss": 0.4848, "step": 265 }, { "epoch": 0.4009798379498775, "grad_norm": 0.6067452479296194, "learning_rate": 1.9392916837338376e-05, "loss": 0.4783, "step": 266 }, { "epoch": 0.4024872809496891, "grad_norm": 0.6517612748843483, "learning_rate": 1.9383848719590257e-05, "loss": 0.4849, "step": 267 }, { "epoch": 0.40399472394950064, "grad_norm": 0.6355304966389256, "learning_rate": 1.9374715529025575e-05, "loss": 0.4312, "step": 268 }, { "epoch": 0.40550216694931224, "grad_norm": 0.627744747765263, "learning_rate": 1.9365517328978943e-05, "loss": 0.4762, "step": 269 }, { "epoch": 0.4070096099491238, "grad_norm": 0.6640367945419465, "learning_rate": 1.9356254183235785e-05, "loss": 0.432, "step": 270 }, { "epoch": 0.40851705294893537, "grad_norm": 0.647008694411896, "learning_rate": 1.93469261560319e-05, "loss": 0.4795, "step": 271 }, { "epoch": 0.41002449594874696, "grad_norm": 0.6742117075938286, "learning_rate": 1.9337533312053002e-05, "loss": 0.4573, "step": 272 }, { "epoch": 0.4115319389485585, "grad_norm": 0.6000668524451142, "learning_rate": 1.9328075716434287e-05, "loss": 0.4474, "step": 273 }, { "epoch": 0.4130393819483701, "grad_norm": 0.6027061587937567, "learning_rate": 1.931855343475998e-05, "loss": 0.4283, "step": 274 }, { "epoch": 0.4145468249481816, "grad_norm": 0.56875377174764, "learning_rate": 1.930896653306286e-05, "loss": 0.4446, "step": 275 }, { "epoch": 0.4160542679479932, "grad_norm": 0.6494800822344575, "learning_rate": 1.929931507782383e-05, "loss": 0.4504, "step": 276 }, { "epoch": 0.4175617109478048, "grad_norm": 0.5925306999643124, "learning_rate": 1.9289599135971437e-05, "loss": 0.4993, "step": 277 }, { "epoch": 0.41906915394761635, "grad_norm": 0.5812846521774916, "learning_rate": 1.9279818774881418e-05, "loss": 0.4574, "step": 278 }, { "epoch": 0.42057659694742794, "grad_norm": 0.5625417674563119, "learning_rate": 1.9269974062376224e-05, "loss": 0.4325, "step": 279 }, { "epoch": 0.4220840399472395, "grad_norm": 0.5839055838922522, "learning_rate": 1.926006506672456e-05, "loss": 0.4669, "step": 280 }, { "epoch": 0.42359148294705107, "grad_norm": 0.6042605173402862, "learning_rate": 1.9250091856640895e-05, "loss": 0.4224, "step": 281 }, { "epoch": 0.42509892594686266, "grad_norm": 0.5856982708883072, "learning_rate": 1.9240054501285015e-05, "loss": 0.4709, "step": 282 }, { "epoch": 0.4266063689466742, "grad_norm": 0.5631263514578662, "learning_rate": 1.922995307026151e-05, "loss": 0.4614, "step": 283 }, { "epoch": 0.4281138119464858, "grad_norm": 0.5583569731432177, "learning_rate": 1.921978763361931e-05, "loss": 0.4589, "step": 284 }, { "epoch": 0.4296212549462973, "grad_norm": 0.6050421963625475, "learning_rate": 1.9209558261851194e-05, "loss": 0.4382, "step": 285 }, { "epoch": 0.4311286979461089, "grad_norm": 0.533785762634786, "learning_rate": 1.919926502589331e-05, "loss": 0.4862, "step": 286 }, { "epoch": 0.43263614094592046, "grad_norm": 0.5693448486944194, "learning_rate": 1.9188907997124666e-05, "loss": 0.4562, "step": 287 }, { "epoch": 0.43414358394573205, "grad_norm": 0.5654990613672617, "learning_rate": 1.9178487247366652e-05, "loss": 0.4492, "step": 288 }, { "epoch": 0.43565102694554364, "grad_norm": 0.5771432152665512, "learning_rate": 1.916800284888253e-05, "loss": 0.4478, "step": 289 }, { "epoch": 0.4371584699453552, "grad_norm": 0.5734596310020046, "learning_rate": 1.915745487437694e-05, "loss": 0.4801, "step": 290 }, { "epoch": 0.43866591294516677, "grad_norm": 0.5832753483996317, "learning_rate": 1.9146843396995396e-05, "loss": 0.4563, "step": 291 }, { "epoch": 0.4401733559449783, "grad_norm": 0.5879841082366902, "learning_rate": 1.9136168490323772e-05, "loss": 0.4689, "step": 292 }, { "epoch": 0.4416807989447899, "grad_norm": 0.5521570450782258, "learning_rate": 1.9125430228387794e-05, "loss": 0.4581, "step": 293 }, { "epoch": 0.4431882419446015, "grad_norm": 0.5673604877581071, "learning_rate": 1.9114628685652535e-05, "loss": 0.4668, "step": 294 }, { "epoch": 0.444695684944413, "grad_norm": 0.5866077006525799, "learning_rate": 1.9103763937021887e-05, "loss": 0.4588, "step": 295 }, { "epoch": 0.4462031279442246, "grad_norm": 0.5731048741878798, "learning_rate": 1.909283605783805e-05, "loss": 0.4774, "step": 296 }, { "epoch": 0.44771057094403616, "grad_norm": 0.6251177027508026, "learning_rate": 1.9081845123881002e-05, "loss": 0.4813, "step": 297 }, { "epoch": 0.44921801394384775, "grad_norm": 0.5256954818277138, "learning_rate": 1.9070791211367984e-05, "loss": 0.4473, "step": 298 }, { "epoch": 0.45072545694365934, "grad_norm": 0.6199874516009303, "learning_rate": 1.9059674396952963e-05, "loss": 0.4629, "step": 299 }, { "epoch": 0.4522328999434709, "grad_norm": 0.5917017492987557, "learning_rate": 1.90484947577261e-05, "loss": 0.4979, "step": 300 }, { "epoch": 0.45374034294328247, "grad_norm": 0.6120361922704654, "learning_rate": 1.903725237121322e-05, "loss": 0.4831, "step": 301 }, { "epoch": 0.455247785943094, "grad_norm": 0.5514120347682593, "learning_rate": 1.902594731537527e-05, "loss": 0.4452, "step": 302 }, { "epoch": 0.4567552289429056, "grad_norm": 0.5767336190747095, "learning_rate": 1.901457966860779e-05, "loss": 0.4435, "step": 303 }, { "epoch": 0.4582626719427172, "grad_norm": 0.5868519118956824, "learning_rate": 1.9003149509740347e-05, "loss": 0.492, "step": 304 }, { "epoch": 0.45977011494252873, "grad_norm": 0.6168191655686016, "learning_rate": 1.899165691803601e-05, "loss": 0.4512, "step": 305 }, { "epoch": 0.4612775579423403, "grad_norm": 0.6053359821845329, "learning_rate": 1.8980101973190787e-05, "loss": 0.4749, "step": 306 }, { "epoch": 0.46278500094215186, "grad_norm": 0.60634572707715, "learning_rate": 1.896848475533309e-05, "loss": 0.4682, "step": 307 }, { "epoch": 0.46429244394196345, "grad_norm": 0.6339199718330278, "learning_rate": 1.8956805345023145e-05, "loss": 0.4643, "step": 308 }, { "epoch": 0.465799886941775, "grad_norm": 0.6011561135843241, "learning_rate": 1.894506382325248e-05, "loss": 0.435, "step": 309 }, { "epoch": 0.4673073299415866, "grad_norm": 0.6067579490346751, "learning_rate": 1.8933260271443313e-05, "loss": 0.4162, "step": 310 }, { "epoch": 0.4688147729413982, "grad_norm": 0.5747986536629459, "learning_rate": 1.8921394771448032e-05, "loss": 0.448, "step": 311 }, { "epoch": 0.4703222159412097, "grad_norm": 0.605434367981348, "learning_rate": 1.89094674055486e-05, "loss": 0.4264, "step": 312 }, { "epoch": 0.4718296589410213, "grad_norm": 0.6028982875539595, "learning_rate": 1.889747825645599e-05, "loss": 0.447, "step": 313 }, { "epoch": 0.47333710194083284, "grad_norm": 0.6024460995063091, "learning_rate": 1.8885427407309627e-05, "loss": 0.4689, "step": 314 }, { "epoch": 0.47484454494064443, "grad_norm": 0.6726949468749703, "learning_rate": 1.887331494167678e-05, "loss": 0.4562, "step": 315 }, { "epoch": 0.476351987940456, "grad_norm": 0.6108367421924343, "learning_rate": 1.8861140943552014e-05, "loss": 0.4574, "step": 316 }, { "epoch": 0.47785943094026756, "grad_norm": 0.6095993211515124, "learning_rate": 1.884890549735659e-05, "loss": 0.429, "step": 317 }, { "epoch": 0.47936687394007915, "grad_norm": 0.5708366516060817, "learning_rate": 1.8836608687937883e-05, "loss": 0.4494, "step": 318 }, { "epoch": 0.4808743169398907, "grad_norm": 0.6319148329863508, "learning_rate": 1.8824250600568798e-05, "loss": 0.4457, "step": 319 }, { "epoch": 0.4823817599397023, "grad_norm": 0.5817901717334689, "learning_rate": 1.8811831320947177e-05, "loss": 0.4444, "step": 320 }, { "epoch": 0.4838892029395139, "grad_norm": 0.6167253992638152, "learning_rate": 1.879935093519519e-05, "loss": 0.4758, "step": 321 }, { "epoch": 0.4853966459393254, "grad_norm": 0.5918299912550289, "learning_rate": 1.878680952985877e-05, "loss": 0.4586, "step": 322 }, { "epoch": 0.486904088939137, "grad_norm": 0.5897988696893806, "learning_rate": 1.8774207191906976e-05, "loss": 0.4548, "step": 323 }, { "epoch": 0.48841153193894854, "grad_norm": 0.5336492924439385, "learning_rate": 1.8761544008731426e-05, "loss": 0.4477, "step": 324 }, { "epoch": 0.48991897493876013, "grad_norm": 0.5969332291879268, "learning_rate": 1.874882006814565e-05, "loss": 0.4423, "step": 325 }, { "epoch": 0.4914264179385717, "grad_norm": 0.5894559630672119, "learning_rate": 1.8736035458384528e-05, "loss": 0.4681, "step": 326 }, { "epoch": 0.49293386093838326, "grad_norm": 0.583381204713255, "learning_rate": 1.8723190268103634e-05, "loss": 0.431, "step": 327 }, { "epoch": 0.49444130393819485, "grad_norm": 0.5501857874739489, "learning_rate": 1.8710284586378645e-05, "loss": 0.4501, "step": 328 }, { "epoch": 0.4959487469380064, "grad_norm": 0.5807568427837185, "learning_rate": 1.8697318502704734e-05, "loss": 0.446, "step": 329 }, { "epoch": 0.497456189937818, "grad_norm": 0.5344952874232914, "learning_rate": 1.8684292106995916e-05, "loss": 0.464, "step": 330 }, { "epoch": 0.4989636329376295, "grad_norm": 0.5875400091192824, "learning_rate": 1.8671205489584453e-05, "loss": 0.462, "step": 331 }, { "epoch": 0.5004710759374411, "grad_norm": 0.5898142606962845, "learning_rate": 1.865805874122021e-05, "loss": 0.4495, "step": 332 }, { "epoch": 0.5019785189372526, "grad_norm": 0.5383180946864506, "learning_rate": 1.8644851953070045e-05, "loss": 0.474, "step": 333 }, { "epoch": 0.5034859619370643, "grad_norm": 0.5701159430118912, "learning_rate": 1.863158521671716e-05, "loss": 0.4644, "step": 334 }, { "epoch": 0.5049934049368758, "grad_norm": 0.5456550772582448, "learning_rate": 1.8618258624160465e-05, "loss": 0.4426, "step": 335 }, { "epoch": 0.5065008479366874, "grad_norm": 0.5806062450133762, "learning_rate": 1.8604872267813954e-05, "loss": 0.4428, "step": 336 }, { "epoch": 0.508008290936499, "grad_norm": 0.5723184224994758, "learning_rate": 1.859142624050605e-05, "loss": 0.427, "step": 337 }, { "epoch": 0.5095157339363106, "grad_norm": 0.5503430826330011, "learning_rate": 1.8577920635478976e-05, "loss": 0.4863, "step": 338 }, { "epoch": 0.5110231769361221, "grad_norm": 0.5922429005891785, "learning_rate": 1.8564355546388094e-05, "loss": 0.472, "step": 339 }, { "epoch": 0.5125306199359336, "grad_norm": 0.5243816217609505, "learning_rate": 1.855073106730126e-05, "loss": 0.4563, "step": 340 }, { "epoch": 0.5140380629357453, "grad_norm": 0.571898057341335, "learning_rate": 1.8537047292698175e-05, "loss": 0.4686, "step": 341 }, { "epoch": 0.5155455059355568, "grad_norm": 0.5389787797747003, "learning_rate": 1.852330431746973e-05, "loss": 0.4044, "step": 342 }, { "epoch": 0.5170529489353684, "grad_norm": 0.5755069679771695, "learning_rate": 1.8509502236917353e-05, "loss": 0.4536, "step": 343 }, { "epoch": 0.51856039193518, "grad_norm": 0.5386650306089089, "learning_rate": 1.8495641146752322e-05, "loss": 0.4285, "step": 344 }, { "epoch": 0.5200678349349915, "grad_norm": 0.5775045065740545, "learning_rate": 1.848172114309513e-05, "loss": 0.4579, "step": 345 }, { "epoch": 0.5215752779348031, "grad_norm": 0.6222104655446267, "learning_rate": 1.8467742322474822e-05, "loss": 0.4733, "step": 346 }, { "epoch": 0.5230827209346146, "grad_norm": 0.5869893846228816, "learning_rate": 1.845370478182829e-05, "loss": 0.5073, "step": 347 }, { "epoch": 0.5245901639344263, "grad_norm": 0.6007295355780623, "learning_rate": 1.8439608618499637e-05, "loss": 0.4859, "step": 348 }, { "epoch": 0.5260976069342378, "grad_norm": 0.5715093886190423, "learning_rate": 1.842545393023949e-05, "loss": 0.436, "step": 349 }, { "epoch": 0.5276050499340493, "grad_norm": 0.5370655215760771, "learning_rate": 1.841124081520431e-05, "loss": 0.4545, "step": 350 }, { "epoch": 0.529112492933861, "grad_norm": 0.5468526752808022, "learning_rate": 1.8396969371955724e-05, "loss": 0.4412, "step": 351 }, { "epoch": 0.5306199359336725, "grad_norm": 0.5386055180491347, "learning_rate": 1.838263969945985e-05, "loss": 0.455, "step": 352 }, { "epoch": 0.532127378933484, "grad_norm": 0.5273830292324821, "learning_rate": 1.836825189708659e-05, "loss": 0.4208, "step": 353 }, { "epoch": 0.5336348219332957, "grad_norm": 0.5324858057392972, "learning_rate": 1.8353806064608953e-05, "loss": 0.4259, "step": 354 }, { "epoch": 0.5351422649331072, "grad_norm": 0.5185086851614243, "learning_rate": 1.833930230220236e-05, "loss": 0.4506, "step": 355 }, { "epoch": 0.5366497079329188, "grad_norm": 0.5553133756097826, "learning_rate": 1.8324740710443955e-05, "loss": 0.4629, "step": 356 }, { "epoch": 0.5381571509327303, "grad_norm": 0.5742120676044152, "learning_rate": 1.831012139031189e-05, "loss": 0.4357, "step": 357 }, { "epoch": 0.539664593932542, "grad_norm": 0.5605121444976939, "learning_rate": 1.829544444318466e-05, "loss": 0.4606, "step": 358 }, { "epoch": 0.5411720369323535, "grad_norm": 0.6092704764024721, "learning_rate": 1.8280709970840352e-05, "loss": 0.4589, "step": 359 }, { "epoch": 0.542679479932165, "grad_norm": 0.5515104498699946, "learning_rate": 1.8265918075455985e-05, "loss": 0.4554, "step": 360 }, { "epoch": 0.5441869229319767, "grad_norm": 0.5517752011641777, "learning_rate": 1.8251068859606777e-05, "loss": 0.4446, "step": 361 }, { "epoch": 0.5456943659317882, "grad_norm": 0.523313087940014, "learning_rate": 1.823616242626542e-05, "loss": 0.4453, "step": 362 }, { "epoch": 0.5472018089315998, "grad_norm": 0.5555090795115328, "learning_rate": 1.8221198878801415e-05, "loss": 0.431, "step": 363 }, { "epoch": 0.5487092519314113, "grad_norm": 0.5254077832278897, "learning_rate": 1.8206178320980295e-05, "loss": 0.4512, "step": 364 }, { "epoch": 0.5502166949312229, "grad_norm": 0.5382752275452225, "learning_rate": 1.819110085696295e-05, "loss": 0.4489, "step": 365 }, { "epoch": 0.5517241379310345, "grad_norm": 0.5752845306460045, "learning_rate": 1.817596659130489e-05, "loss": 0.4659, "step": 366 }, { "epoch": 0.553231580930846, "grad_norm": 0.534082668899082, "learning_rate": 1.816077562895551e-05, "loss": 0.443, "step": 367 }, { "epoch": 0.5547390239306577, "grad_norm": 0.4935673245960411, "learning_rate": 1.814552807525738e-05, "loss": 0.4265, "step": 368 }, { "epoch": 0.5562464669304692, "grad_norm": 0.5587086828843211, "learning_rate": 1.81302240359455e-05, "loss": 0.4171, "step": 369 }, { "epoch": 0.5577539099302807, "grad_norm": 0.5706799552715889, "learning_rate": 1.8114863617146576e-05, "loss": 0.4419, "step": 370 }, { "epoch": 0.5592613529300924, "grad_norm": 0.5559814423377313, "learning_rate": 1.8099446925378278e-05, "loss": 0.4646, "step": 371 }, { "epoch": 0.5607687959299039, "grad_norm": 0.6399807563842037, "learning_rate": 1.8083974067548506e-05, "loss": 0.4662, "step": 372 }, { "epoch": 0.5622762389297155, "grad_norm": 0.5499667823126643, "learning_rate": 1.806844515095465e-05, "loss": 0.4705, "step": 373 }, { "epoch": 0.563783681929527, "grad_norm": 0.5802308318791667, "learning_rate": 1.8052860283282832e-05, "loss": 0.4285, "step": 374 }, { "epoch": 0.5652911249293386, "grad_norm": 0.616061675009139, "learning_rate": 1.8037219572607177e-05, "loss": 0.4661, "step": 375 }, { "epoch": 0.5667985679291502, "grad_norm": 0.5381388831653736, "learning_rate": 1.8021523127389066e-05, "loss": 0.442, "step": 376 }, { "epoch": 0.5683060109289617, "grad_norm": 0.5427863037336617, "learning_rate": 1.800577105647635e-05, "loss": 0.4737, "step": 377 }, { "epoch": 0.5698134539287734, "grad_norm": 0.647319829296571, "learning_rate": 1.7989963469102643e-05, "loss": 0.4597, "step": 378 }, { "epoch": 0.5713208969285849, "grad_norm": 0.5361993689265471, "learning_rate": 1.797410047488653e-05, "loss": 0.4515, "step": 379 }, { "epoch": 0.5728283399283964, "grad_norm": 0.5928443348297506, "learning_rate": 1.7958182183830816e-05, "loss": 0.4383, "step": 380 }, { "epoch": 0.574335782928208, "grad_norm": 0.5525429424873411, "learning_rate": 1.794220870632177e-05, "loss": 0.4676, "step": 381 }, { "epoch": 0.5758432259280196, "grad_norm": 0.6048913598018805, "learning_rate": 1.7926180153128358e-05, "loss": 0.4803, "step": 382 }, { "epoch": 0.5773506689278312, "grad_norm": 0.6159208841600681, "learning_rate": 1.791009663540146e-05, "loss": 0.4446, "step": 383 }, { "epoch": 0.5788581119276427, "grad_norm": 0.604058916697408, "learning_rate": 1.789395826467312e-05, "loss": 0.4406, "step": 384 }, { "epoch": 0.5803655549274543, "grad_norm": 0.6189321454832999, "learning_rate": 1.7877765152855757e-05, "loss": 0.4757, "step": 385 }, { "epoch": 0.5818729979272659, "grad_norm": 0.5252310621840579, "learning_rate": 1.78615174122414e-05, "loss": 0.4226, "step": 386 }, { "epoch": 0.5833804409270774, "grad_norm": 0.6058698433864601, "learning_rate": 1.78452151555009e-05, "loss": 0.4242, "step": 387 }, { "epoch": 0.5848878839268891, "grad_norm": 0.5784597918661724, "learning_rate": 1.7828858495683162e-05, "loss": 0.4546, "step": 388 }, { "epoch": 0.5863953269267006, "grad_norm": 0.5778733445604559, "learning_rate": 1.781244754621434e-05, "loss": 0.4474, "step": 389 }, { "epoch": 0.5879027699265121, "grad_norm": 0.5574362195371769, "learning_rate": 1.779598242089707e-05, "loss": 0.4461, "step": 390 }, { "epoch": 0.5894102129263237, "grad_norm": 0.6035018906117913, "learning_rate": 1.7779463233909677e-05, "loss": 0.4647, "step": 391 }, { "epoch": 0.5909176559261353, "grad_norm": 0.5783320653215531, "learning_rate": 1.7762890099805362e-05, "loss": 0.4509, "step": 392 }, { "epoch": 0.5924250989259469, "grad_norm": 0.608063697903211, "learning_rate": 1.774626313351145e-05, "loss": 0.4496, "step": 393 }, { "epoch": 0.5939325419257584, "grad_norm": 0.5637493289630973, "learning_rate": 1.7729582450328547e-05, "loss": 0.4548, "step": 394 }, { "epoch": 0.59543998492557, "grad_norm": 0.5878505952019026, "learning_rate": 1.771284816592978e-05, "loss": 0.4025, "step": 395 }, { "epoch": 0.5969474279253816, "grad_norm": 0.5732228081169485, "learning_rate": 1.7696060396359956e-05, "loss": 0.4155, "step": 396 }, { "epoch": 0.5984548709251931, "grad_norm": 0.5275574748856542, "learning_rate": 1.7679219258034798e-05, "loss": 0.4668, "step": 397 }, { "epoch": 0.5999623139250048, "grad_norm": 0.565193432089848, "learning_rate": 1.7662324867740102e-05, "loss": 0.464, "step": 398 }, { "epoch": 0.6014697569248163, "grad_norm": 0.5276065053060457, "learning_rate": 1.7645377342630956e-05, "loss": 0.4641, "step": 399 }, { "epoch": 0.6029771999246278, "grad_norm": 0.5504334109425478, "learning_rate": 1.76283768002309e-05, "loss": 0.4288, "step": 400 }, { "epoch": 0.6044846429244394, "grad_norm": 0.6059296820868759, "learning_rate": 1.7611323358431145e-05, "loss": 0.4961, "step": 401 }, { "epoch": 0.605992085924251, "grad_norm": 0.5077017761738585, "learning_rate": 1.759421713548971e-05, "loss": 0.4706, "step": 402 }, { "epoch": 0.6074995289240626, "grad_norm": 0.5590656170710925, "learning_rate": 1.757705825003065e-05, "loss": 0.4034, "step": 403 }, { "epoch": 0.6090069719238741, "grad_norm": 0.525709220345065, "learning_rate": 1.7559846821043205e-05, "loss": 0.4379, "step": 404 }, { "epoch": 0.6105144149236857, "grad_norm": 0.5538945207929713, "learning_rate": 1.754258296788097e-05, "loss": 0.445, "step": 405 }, { "epoch": 0.6120218579234973, "grad_norm": 0.5517645766831191, "learning_rate": 1.7525266810261096e-05, "loss": 0.4469, "step": 406 }, { "epoch": 0.6135293009233088, "grad_norm": 0.5594555749715797, "learning_rate": 1.7507898468263422e-05, "loss": 0.4343, "step": 407 }, { "epoch": 0.6150367439231204, "grad_norm": 0.5530618540850076, "learning_rate": 1.7490478062329686e-05, "loss": 0.4625, "step": 408 }, { "epoch": 0.616544186922932, "grad_norm": 0.5948076942836006, "learning_rate": 1.7473005713262644e-05, "loss": 0.4497, "step": 409 }, { "epoch": 0.6180516299227435, "grad_norm": 0.5776155556563956, "learning_rate": 1.7455481542225272e-05, "loss": 0.3959, "step": 410 }, { "epoch": 0.6195590729225551, "grad_norm": 0.5391682645939875, "learning_rate": 1.7437905670739893e-05, "loss": 0.4337, "step": 411 }, { "epoch": 0.6210665159223667, "grad_norm": 0.5918312025262793, "learning_rate": 1.7420278220687366e-05, "loss": 0.4749, "step": 412 }, { "epoch": 0.6225739589221783, "grad_norm": 0.5169533403943937, "learning_rate": 1.7402599314306207e-05, "loss": 0.4361, "step": 413 }, { "epoch": 0.6240814019219898, "grad_norm": 0.5757476582664114, "learning_rate": 1.7384869074191777e-05, "loss": 0.4423, "step": 414 }, { "epoch": 0.6255888449218014, "grad_norm": 0.5789420594237762, "learning_rate": 1.7367087623295394e-05, "loss": 0.4493, "step": 415 }, { "epoch": 0.627096287921613, "grad_norm": 0.5146689624027024, "learning_rate": 1.7349255084923517e-05, "loss": 0.4128, "step": 416 }, { "epoch": 0.6286037309214245, "grad_norm": 0.5556214483108315, "learning_rate": 1.7331371582736864e-05, "loss": 0.4097, "step": 417 }, { "epoch": 0.6301111739212361, "grad_norm": 0.5781033815860408, "learning_rate": 1.731343724074957e-05, "loss": 0.4755, "step": 418 }, { "epoch": 0.6316186169210477, "grad_norm": 0.505299705771376, "learning_rate": 1.7295452183328317e-05, "loss": 0.423, "step": 419 }, { "epoch": 0.6331260599208592, "grad_norm": 0.6019529322565086, "learning_rate": 1.7277416535191478e-05, "loss": 0.4467, "step": 420 }, { "epoch": 0.6346335029206708, "grad_norm": 0.5423258091864472, "learning_rate": 1.7259330421408247e-05, "loss": 0.4297, "step": 421 }, { "epoch": 0.6361409459204824, "grad_norm": 0.550859799446333, "learning_rate": 1.7241193967397784e-05, "loss": 0.4334, "step": 422 }, { "epoch": 0.637648388920294, "grad_norm": 0.5436505610454662, "learning_rate": 1.7223007298928322e-05, "loss": 0.4227, "step": 423 }, { "epoch": 0.6391558319201055, "grad_norm": 0.5265015330498195, "learning_rate": 1.7204770542116326e-05, "loss": 0.4407, "step": 424 }, { "epoch": 0.640663274919917, "grad_norm": 0.577557633955233, "learning_rate": 1.7186483823425582e-05, "loss": 0.4794, "step": 425 }, { "epoch": 0.6421707179197287, "grad_norm": 0.5304780945155085, "learning_rate": 1.7168147269666357e-05, "loss": 0.4306, "step": 426 }, { "epoch": 0.6436781609195402, "grad_norm": 0.5436263482054755, "learning_rate": 1.714976100799449e-05, "loss": 0.4505, "step": 427 }, { "epoch": 0.6451856039193518, "grad_norm": 0.5239803005942689, "learning_rate": 1.713132516591053e-05, "loss": 0.4204, "step": 428 }, { "epoch": 0.6466930469191634, "grad_norm": 0.5640485363783228, "learning_rate": 1.7112839871258838e-05, "loss": 0.4709, "step": 429 }, { "epoch": 0.6482004899189749, "grad_norm": 0.5112413611963181, "learning_rate": 1.7094305252226713e-05, "loss": 0.4352, "step": 430 }, { "epoch": 0.6497079329187865, "grad_norm": 0.5839208365283748, "learning_rate": 1.7075721437343488e-05, "loss": 0.467, "step": 431 }, { "epoch": 0.6512153759185981, "grad_norm": 0.5264144807133015, "learning_rate": 1.705708855547966e-05, "loss": 0.4427, "step": 432 }, { "epoch": 0.6527228189184097, "grad_norm": 0.503285177882026, "learning_rate": 1.7038406735845967e-05, "loss": 0.4206, "step": 433 }, { "epoch": 0.6542302619182212, "grad_norm": 0.523921175908132, "learning_rate": 1.7019676107992523e-05, "loss": 0.4636, "step": 434 }, { "epoch": 0.6557377049180327, "grad_norm": 0.5213012549969936, "learning_rate": 1.70008968018079e-05, "loss": 0.4385, "step": 435 }, { "epoch": 0.6572451479178444, "grad_norm": 0.5616975925596913, "learning_rate": 1.6982068947518235e-05, "loss": 0.4495, "step": 436 }, { "epoch": 0.6587525909176559, "grad_norm": 0.5094741288290618, "learning_rate": 1.6963192675686312e-05, "loss": 0.4354, "step": 437 }, { "epoch": 0.6602600339174675, "grad_norm": 0.5691859599654164, "learning_rate": 1.694426811721069e-05, "loss": 0.4121, "step": 438 }, { "epoch": 0.6617674769172791, "grad_norm": 0.565755177059836, "learning_rate": 1.6925295403324758e-05, "loss": 0.4291, "step": 439 }, { "epoch": 0.6632749199170906, "grad_norm": 0.5182694692522232, "learning_rate": 1.6906274665595854e-05, "loss": 0.4187, "step": 440 }, { "epoch": 0.6647823629169022, "grad_norm": 0.5442306033345655, "learning_rate": 1.688720603592432e-05, "loss": 0.4596, "step": 441 }, { "epoch": 0.6662898059167138, "grad_norm": 0.508987211991653, "learning_rate": 1.6868089646542632e-05, "loss": 0.4218, "step": 442 }, { "epoch": 0.6677972489165254, "grad_norm": 0.5409018441358341, "learning_rate": 1.6848925630014445e-05, "loss": 0.4422, "step": 443 }, { "epoch": 0.6693046919163369, "grad_norm": 0.5332135170482968, "learning_rate": 1.6829714119233688e-05, "loss": 0.4742, "step": 444 }, { "epoch": 0.6708121349161484, "grad_norm": 0.510365685539909, "learning_rate": 1.6810455247423634e-05, "loss": 0.4308, "step": 445 }, { "epoch": 0.6723195779159601, "grad_norm": 0.5088383566851198, "learning_rate": 1.6791149148136003e-05, "loss": 0.4491, "step": 446 }, { "epoch": 0.6738270209157716, "grad_norm": 0.5398522018308489, "learning_rate": 1.677179595525e-05, "loss": 0.465, "step": 447 }, { "epoch": 0.6753344639155832, "grad_norm": 0.5312851766133058, "learning_rate": 1.675239580297141e-05, "loss": 0.4574, "step": 448 }, { "epoch": 0.6768419069153948, "grad_norm": 0.5377924163432233, "learning_rate": 1.6732948825831657e-05, "loss": 0.4282, "step": 449 }, { "epoch": 0.6783493499152063, "grad_norm": 0.5411515105207517, "learning_rate": 1.671345515868688e-05, "loss": 0.437, "step": 450 }, { "epoch": 0.6798567929150179, "grad_norm": 0.5061423487479686, "learning_rate": 1.6693914936716983e-05, "loss": 0.4244, "step": 451 }, { "epoch": 0.6813642359148294, "grad_norm": 0.5390647508447596, "learning_rate": 1.6674328295424723e-05, "loss": 0.4395, "step": 452 }, { "epoch": 0.6828716789146411, "grad_norm": 0.5706362763533134, "learning_rate": 1.6654695370634738e-05, "loss": 0.4421, "step": 453 }, { "epoch": 0.6843791219144526, "grad_norm": 0.5330284685793139, "learning_rate": 1.6635016298492628e-05, "loss": 0.4303, "step": 454 }, { "epoch": 0.6858865649142641, "grad_norm": 0.5267067326608682, "learning_rate": 1.6615291215464005e-05, "loss": 0.4245, "step": 455 }, { "epoch": 0.6873940079140758, "grad_norm": 0.5726680200512305, "learning_rate": 1.6595520258333545e-05, "loss": 0.4752, "step": 456 }, { "epoch": 0.6889014509138873, "grad_norm": 0.5183865668680759, "learning_rate": 1.657570356420404e-05, "loss": 0.4542, "step": 457 }, { "epoch": 0.6904088939136989, "grad_norm": 0.553551099478117, "learning_rate": 1.6555841270495456e-05, "loss": 0.445, "step": 458 }, { "epoch": 0.6919163369135105, "grad_norm": 0.5929224658029257, "learning_rate": 1.6535933514943955e-05, "loss": 0.4183, "step": 459 }, { "epoch": 0.693423779913322, "grad_norm": 0.5010271872134405, "learning_rate": 1.6515980435600965e-05, "loss": 0.4169, "step": 460 }, { "epoch": 0.6949312229131336, "grad_norm": 0.49068598527278895, "learning_rate": 1.6495982170832224e-05, "loss": 0.4122, "step": 461 }, { "epoch": 0.6964386659129451, "grad_norm": 0.5288472547252633, "learning_rate": 1.6475938859316795e-05, "loss": 0.4154, "step": 462 }, { "epoch": 0.6979461089127568, "grad_norm": 0.5364001246117184, "learning_rate": 1.6455850640046134e-05, "loss": 0.4247, "step": 463 }, { "epoch": 0.6994535519125683, "grad_norm": 0.5248089160285507, "learning_rate": 1.6435717652323097e-05, "loss": 0.4522, "step": 464 }, { "epoch": 0.7009609949123798, "grad_norm": 0.5871578611838155, "learning_rate": 1.6415540035761008e-05, "loss": 0.4477, "step": 465 }, { "epoch": 0.7024684379121915, "grad_norm": 0.531098674787926, "learning_rate": 1.639531793028265e-05, "loss": 0.43, "step": 466 }, { "epoch": 0.703975880912003, "grad_norm": 0.6050322359617515, "learning_rate": 1.637505147611934e-05, "loss": 0.4533, "step": 467 }, { "epoch": 0.7054833239118146, "grad_norm": 0.5045703819799817, "learning_rate": 1.6354740813809917e-05, "loss": 0.4021, "step": 468 }, { "epoch": 0.7069907669116261, "grad_norm": 0.5129545738188582, "learning_rate": 1.6334386084199787e-05, "loss": 0.4517, "step": 469 }, { "epoch": 0.7084982099114377, "grad_norm": 0.5736577274561188, "learning_rate": 1.631398742843995e-05, "loss": 0.418, "step": 470 }, { "epoch": 0.7100056529112493, "grad_norm": 0.5323460252829038, "learning_rate": 1.629354498798601e-05, "loss": 0.4251, "step": 471 }, { "epoch": 0.7115130959110608, "grad_norm": 0.5747199097534378, "learning_rate": 1.627305890459719e-05, "loss": 0.4394, "step": 472 }, { "epoch": 0.7130205389108725, "grad_norm": 0.5646262513047455, "learning_rate": 1.625252932033538e-05, "loss": 0.4297, "step": 473 }, { "epoch": 0.714527981910684, "grad_norm": 0.49304427786239235, "learning_rate": 1.6231956377564095e-05, "loss": 0.4224, "step": 474 }, { "epoch": 0.7160354249104955, "grad_norm": 0.5791416730858486, "learning_rate": 1.621134021894756e-05, "loss": 0.4388, "step": 475 }, { "epoch": 0.7175428679103072, "grad_norm": 0.5186150019034591, "learning_rate": 1.619068098744965e-05, "loss": 0.4422, "step": 476 }, { "epoch": 0.7190503109101187, "grad_norm": 0.5839335428128258, "learning_rate": 1.6169978826332955e-05, "loss": 0.458, "step": 477 }, { "epoch": 0.7205577539099303, "grad_norm": 0.5613046419371709, "learning_rate": 1.6149233879157747e-05, "loss": 0.4669, "step": 478 }, { "epoch": 0.7220651969097418, "grad_norm": 0.5154157204007299, "learning_rate": 1.6128446289781012e-05, "loss": 0.4372, "step": 479 }, { "epoch": 0.7235726399095535, "grad_norm": 0.5677977726488427, "learning_rate": 1.610761620235543e-05, "loss": 0.4731, "step": 480 }, { "epoch": 0.725080082909365, "grad_norm": 0.5375971717165063, "learning_rate": 1.60867437613284e-05, "loss": 0.4566, "step": 481 }, { "epoch": 0.7265875259091765, "grad_norm": 0.49724342603457516, "learning_rate": 1.6065829111441e-05, "loss": 0.4507, "step": 482 }, { "epoch": 0.7280949689089882, "grad_norm": 0.5827089081742053, "learning_rate": 1.6044872397727037e-05, "loss": 0.4564, "step": 483 }, { "epoch": 0.7296024119087997, "grad_norm": 0.5474489228753104, "learning_rate": 1.6023873765511993e-05, "loss": 0.4309, "step": 484 }, { "epoch": 0.7311098549086112, "grad_norm": 0.5319969584661621, "learning_rate": 1.6002833360412044e-05, "loss": 0.4394, "step": 485 }, { "epoch": 0.7326172979084229, "grad_norm": 0.5521662619957021, "learning_rate": 1.5981751328333036e-05, "loss": 0.4568, "step": 486 }, { "epoch": 0.7341247409082344, "grad_norm": 0.4814653766664411, "learning_rate": 1.5960627815469486e-05, "loss": 0.4066, "step": 487 }, { "epoch": 0.735632183908046, "grad_norm": 0.5109256400558994, "learning_rate": 1.5939462968303554e-05, "loss": 0.4272, "step": 488 }, { "epoch": 0.7371396269078575, "grad_norm": 0.5357957318401174, "learning_rate": 1.5918256933604047e-05, "loss": 0.4237, "step": 489 }, { "epoch": 0.7386470699076692, "grad_norm": 0.5396229844011063, "learning_rate": 1.589700985842538e-05, "loss": 0.4205, "step": 490 }, { "epoch": 0.7401545129074807, "grad_norm": 0.5056971418930007, "learning_rate": 1.5875721890106574e-05, "loss": 0.4558, "step": 491 }, { "epoch": 0.7416619559072922, "grad_norm": 0.5466763607345122, "learning_rate": 1.5854393176270205e-05, "loss": 0.4262, "step": 492 }, { "epoch": 0.7431693989071039, "grad_norm": 0.5318696480713733, "learning_rate": 1.5833023864821427e-05, "loss": 0.4222, "step": 493 }, { "epoch": 0.7446768419069154, "grad_norm": 0.5577732122364522, "learning_rate": 1.5811614103946905e-05, "loss": 0.4643, "step": 494 }, { "epoch": 0.746184284906727, "grad_norm": 0.5396811070945262, "learning_rate": 1.5790164042113805e-05, "loss": 0.4619, "step": 495 }, { "epoch": 0.7476917279065385, "grad_norm": 0.5116348501037207, "learning_rate": 1.576867382806877e-05, "loss": 0.4257, "step": 496 }, { "epoch": 0.7491991709063501, "grad_norm": 0.5376269628887883, "learning_rate": 1.5747143610836873e-05, "loss": 0.4431, "step": 497 }, { "epoch": 0.7507066139061617, "grad_norm": 0.5552456121649234, "learning_rate": 1.5725573539720592e-05, "loss": 0.4345, "step": 498 }, { "epoch": 0.7522140569059732, "grad_norm": 0.5525594597252514, "learning_rate": 1.570396376429877e-05, "loss": 0.4288, "step": 499 }, { "epoch": 0.7537214999057849, "grad_norm": 0.5130914024917077, "learning_rate": 1.5682314434425593e-05, "loss": 0.4506, "step": 500 }, { "epoch": 0.7552289429055964, "grad_norm": 0.5438445066019086, "learning_rate": 1.5660625700229526e-05, "loss": 0.451, "step": 501 }, { "epoch": 0.7567363859054079, "grad_norm": 0.5393532424898553, "learning_rate": 1.5638897712112303e-05, "loss": 0.4339, "step": 502 }, { "epoch": 0.7582438289052196, "grad_norm": 0.5067131473915181, "learning_rate": 1.561713062074785e-05, "loss": 0.4452, "step": 503 }, { "epoch": 0.7597512719050311, "grad_norm": 0.511705817056659, "learning_rate": 1.5595324577081265e-05, "loss": 0.4227, "step": 504 }, { "epoch": 0.7612587149048426, "grad_norm": 0.5105016396695756, "learning_rate": 1.5573479732327758e-05, "loss": 0.4223, "step": 505 }, { "epoch": 0.7627661579046542, "grad_norm": 0.508814615305124, "learning_rate": 1.555159623797161e-05, "loss": 0.4649, "step": 506 }, { "epoch": 0.7642736009044658, "grad_norm": 0.5115538447430213, "learning_rate": 1.552967424576512e-05, "loss": 0.4257, "step": 507 }, { "epoch": 0.7657810439042774, "grad_norm": 0.519809456344861, "learning_rate": 1.5507713907727557e-05, "loss": 0.4393, "step": 508 }, { "epoch": 0.7672884869040889, "grad_norm": 0.5220982867467517, "learning_rate": 1.5485715376144087e-05, "loss": 0.4296, "step": 509 }, { "epoch": 0.7687959299039006, "grad_norm": 0.4819994486336346, "learning_rate": 1.5463678803564753e-05, "loss": 0.4227, "step": 510 }, { "epoch": 0.7703033729037121, "grad_norm": 0.5721785385849657, "learning_rate": 1.5441604342803374e-05, "loss": 0.4446, "step": 511 }, { "epoch": 0.7718108159035236, "grad_norm": 0.5203314012229143, "learning_rate": 1.5419492146936518e-05, "loss": 0.4205, "step": 512 }, { "epoch": 0.7733182589033352, "grad_norm": 0.5359755271436466, "learning_rate": 1.5397342369302425e-05, "loss": 0.4402, "step": 513 }, { "epoch": 0.7748257019031468, "grad_norm": 0.5233100133672925, "learning_rate": 1.5375155163499953e-05, "loss": 0.4177, "step": 514 }, { "epoch": 0.7763331449029583, "grad_norm": 0.5349268255121612, "learning_rate": 1.5352930683387502e-05, "loss": 0.4586, "step": 515 }, { "epoch": 0.7778405879027699, "grad_norm": 0.5815705753331589, "learning_rate": 1.5330669083081956e-05, "loss": 0.4427, "step": 516 }, { "epoch": 0.7793480309025815, "grad_norm": 0.49665657788094364, "learning_rate": 1.5308370516957617e-05, "loss": 0.4201, "step": 517 }, { "epoch": 0.7808554739023931, "grad_norm": 0.5160010880115449, "learning_rate": 1.528603513964511e-05, "loss": 0.4261, "step": 518 }, { "epoch": 0.7823629169022046, "grad_norm": 0.5468406227400142, "learning_rate": 1.5263663106030347e-05, "loss": 0.4116, "step": 519 }, { "epoch": 0.7838703599020163, "grad_norm": 0.5236112386795565, "learning_rate": 1.5241254571253433e-05, "loss": 0.4317, "step": 520 }, { "epoch": 0.7853778029018278, "grad_norm": 0.5715363020786929, "learning_rate": 1.5218809690707583e-05, "loss": 0.4288, "step": 521 }, { "epoch": 0.7868852459016393, "grad_norm": 0.5191719390295657, "learning_rate": 1.5196328620038059e-05, "loss": 0.4126, "step": 522 }, { "epoch": 0.7883926889014509, "grad_norm": 0.5236405890133281, "learning_rate": 1.5173811515141083e-05, "loss": 0.4024, "step": 523 }, { "epoch": 0.7899001319012625, "grad_norm": 0.5689433953014548, "learning_rate": 1.5151258532162771e-05, "loss": 0.4377, "step": 524 }, { "epoch": 0.791407574901074, "grad_norm": 0.5352416985872532, "learning_rate": 1.5128669827498024e-05, "loss": 0.4354, "step": 525 }, { "epoch": 0.7929150179008856, "grad_norm": 0.5274897373659767, "learning_rate": 1.5106045557789453e-05, "loss": 0.4391, "step": 526 }, { "epoch": 0.7944224609006972, "grad_norm": 0.5240353462138522, "learning_rate": 1.5083385879926309e-05, "loss": 0.4461, "step": 527 }, { "epoch": 0.7959299039005088, "grad_norm": 0.5040339622037668, "learning_rate": 1.5060690951043385e-05, "loss": 0.428, "step": 528 }, { "epoch": 0.7974373469003203, "grad_norm": 0.6056664440579997, "learning_rate": 1.5037960928519902e-05, "loss": 0.4667, "step": 529 }, { "epoch": 0.7989447899001318, "grad_norm": 0.5064874652403102, "learning_rate": 1.501519596997847e-05, "loss": 0.4174, "step": 530 }, { "epoch": 0.8004522328999435, "grad_norm": 0.5178815992344113, "learning_rate": 1.499239623328394e-05, "loss": 0.4143, "step": 531 }, { "epoch": 0.801959675899755, "grad_norm": 0.5433275328773004, "learning_rate": 1.4969561876542348e-05, "loss": 0.4308, "step": 532 }, { "epoch": 0.8034671188995666, "grad_norm": 0.5067490187395532, "learning_rate": 1.4946693058099802e-05, "loss": 0.4383, "step": 533 }, { "epoch": 0.8049745618993782, "grad_norm": 0.49712616081242367, "learning_rate": 1.4923789936541378e-05, "loss": 0.423, "step": 534 }, { "epoch": 0.8064820048991898, "grad_norm": 0.5142222567824052, "learning_rate": 1.4900852670690044e-05, "loss": 0.4427, "step": 535 }, { "epoch": 0.8079894478990013, "grad_norm": 0.5138167933634391, "learning_rate": 1.487788141960553e-05, "loss": 0.426, "step": 536 }, { "epoch": 0.8094968908988129, "grad_norm": 0.49938679145962556, "learning_rate": 1.4854876342583246e-05, "loss": 0.4116, "step": 537 }, { "epoch": 0.8110043338986245, "grad_norm": 0.5630302514996013, "learning_rate": 1.4831837599153165e-05, "loss": 0.4569, "step": 538 }, { "epoch": 0.812511776898436, "grad_norm": 0.5068845911186761, "learning_rate": 1.4808765349078729e-05, "loss": 0.4174, "step": 539 }, { "epoch": 0.8140192198982475, "grad_norm": 0.5402742918446363, "learning_rate": 1.4785659752355724e-05, "loss": 0.4046, "step": 540 }, { "epoch": 0.8155266628980592, "grad_norm": 0.5486844481668101, "learning_rate": 1.4762520969211186e-05, "loss": 0.4225, "step": 541 }, { "epoch": 0.8170341058978707, "grad_norm": 0.5290035366810187, "learning_rate": 1.4739349160102285e-05, "loss": 0.4378, "step": 542 }, { "epoch": 0.8185415488976823, "grad_norm": 0.5374079241254692, "learning_rate": 1.4716144485715209e-05, "loss": 0.4299, "step": 543 }, { "epoch": 0.8200489918974939, "grad_norm": 0.4778906030205072, "learning_rate": 1.4692907106964051e-05, "loss": 0.3992, "step": 544 }, { "epoch": 0.8215564348973055, "grad_norm": 0.49060078784195343, "learning_rate": 1.4669637184989696e-05, "loss": 0.4243, "step": 545 }, { "epoch": 0.823063877897117, "grad_norm": 0.5253862030306666, "learning_rate": 1.4646334881158704e-05, "loss": 0.4236, "step": 546 }, { "epoch": 0.8245713208969286, "grad_norm": 0.5215051723939326, "learning_rate": 1.4623000357062184e-05, "loss": 0.4274, "step": 547 }, { "epoch": 0.8260787638967402, "grad_norm": 0.5071119070406966, "learning_rate": 1.459963377451468e-05, "loss": 0.4081, "step": 548 }, { "epoch": 0.8275862068965517, "grad_norm": 0.5180772114309931, "learning_rate": 1.457623529555305e-05, "loss": 0.4228, "step": 549 }, { "epoch": 0.8290936498963632, "grad_norm": 0.5198434876057629, "learning_rate": 1.4552805082435333e-05, "loss": 0.4328, "step": 550 }, { "epoch": 0.8306010928961749, "grad_norm": 0.53696356685593, "learning_rate": 1.4529343297639638e-05, "loss": 0.4311, "step": 551 }, { "epoch": 0.8321085358959864, "grad_norm": 0.5522072703618133, "learning_rate": 1.4505850103863007e-05, "loss": 0.4441, "step": 552 }, { "epoch": 0.833615978895798, "grad_norm": 0.5022303098504759, "learning_rate": 1.448232566402028e-05, "loss": 0.4233, "step": 553 }, { "epoch": 0.8351234218956096, "grad_norm": 0.5522095422296431, "learning_rate": 1.4458770141242992e-05, "loss": 0.4333, "step": 554 }, { "epoch": 0.8366308648954212, "grad_norm": 0.5232096502230357, "learning_rate": 1.4435183698878212e-05, "loss": 0.4286, "step": 555 }, { "epoch": 0.8381383078952327, "grad_norm": 0.46986995612699417, "learning_rate": 1.4411566500487425e-05, "loss": 0.4049, "step": 556 }, { "epoch": 0.8396457508950442, "grad_norm": 0.5077507527784849, "learning_rate": 1.4387918709845395e-05, "loss": 0.4144, "step": 557 }, { "epoch": 0.8411531938948559, "grad_norm": 0.5253570052023816, "learning_rate": 1.4364240490939032e-05, "loss": 0.4547, "step": 558 }, { "epoch": 0.8426606368946674, "grad_norm": 0.49921819408434215, "learning_rate": 1.4340532007966252e-05, "loss": 0.3949, "step": 559 }, { "epoch": 0.844168079894479, "grad_norm": 0.5411234788441551, "learning_rate": 1.4316793425334836e-05, "loss": 0.4445, "step": 560 }, { "epoch": 0.8456755228942906, "grad_norm": 0.5264546536830835, "learning_rate": 1.4293024907661295e-05, "loss": 0.4117, "step": 561 }, { "epoch": 0.8471829658941021, "grad_norm": 0.518655972625287, "learning_rate": 1.4269226619769727e-05, "loss": 0.4159, "step": 562 }, { "epoch": 0.8486904088939137, "grad_norm": 0.537382287002897, "learning_rate": 1.424539872669067e-05, "loss": 0.4395, "step": 563 }, { "epoch": 0.8501978518937253, "grad_norm": 0.4871628601960703, "learning_rate": 1.4221541393659966e-05, "loss": 0.4244, "step": 564 }, { "epoch": 0.8517052948935369, "grad_norm": 0.5323818502275258, "learning_rate": 1.4197654786117604e-05, "loss": 0.442, "step": 565 }, { "epoch": 0.8532127378933484, "grad_norm": 0.49211277864065, "learning_rate": 1.4173739069706586e-05, "loss": 0.4333, "step": 566 }, { "epoch": 0.8547201808931599, "grad_norm": 0.5016763716077036, "learning_rate": 1.414979441027176e-05, "loss": 0.4223, "step": 567 }, { "epoch": 0.8562276238929716, "grad_norm": 0.5072197589397037, "learning_rate": 1.4125820973858693e-05, "loss": 0.4166, "step": 568 }, { "epoch": 0.8577350668927831, "grad_norm": 0.5379841247223495, "learning_rate": 1.41018189267125e-05, "loss": 0.4457, "step": 569 }, { "epoch": 0.8592425098925947, "grad_norm": 0.5156171430561991, "learning_rate": 1.4077788435276701e-05, "loss": 0.4154, "step": 570 }, { "epoch": 0.8607499528924063, "grad_norm": 0.5377878469372074, "learning_rate": 1.4053729666192067e-05, "loss": 0.4437, "step": 571 }, { "epoch": 0.8622573958922178, "grad_norm": 0.5606843337820052, "learning_rate": 1.4029642786295452e-05, "loss": 0.4479, "step": 572 }, { "epoch": 0.8637648388920294, "grad_norm": 0.4989731388746451, "learning_rate": 1.400552796261866e-05, "loss": 0.407, "step": 573 }, { "epoch": 0.8652722818918409, "grad_norm": 0.5136932503470173, "learning_rate": 1.3981385362387268e-05, "loss": 0.4211, "step": 574 }, { "epoch": 0.8667797248916526, "grad_norm": 0.495625389098895, "learning_rate": 1.3957215153019463e-05, "loss": 0.4203, "step": 575 }, { "epoch": 0.8682871678914641, "grad_norm": 0.49590492700182753, "learning_rate": 1.3933017502124897e-05, "loss": 0.4123, "step": 576 }, { "epoch": 0.8697946108912756, "grad_norm": 0.5389299185456149, "learning_rate": 1.3908792577503514e-05, "loss": 0.4309, "step": 577 }, { "epoch": 0.8713020538910873, "grad_norm": 0.5014871721652727, "learning_rate": 1.3884540547144393e-05, "loss": 0.4159, "step": 578 }, { "epoch": 0.8728094968908988, "grad_norm": 0.49719473763201644, "learning_rate": 1.3860261579224574e-05, "loss": 0.4191, "step": 579 }, { "epoch": 0.8743169398907104, "grad_norm": 0.5102002869995407, "learning_rate": 1.3835955842107897e-05, "loss": 0.418, "step": 580 }, { "epoch": 0.875824382890522, "grad_norm": 0.497268362475834, "learning_rate": 1.3811623504343845e-05, "loss": 0.4092, "step": 581 }, { "epoch": 0.8773318258903335, "grad_norm": 0.49466892349875324, "learning_rate": 1.378726473466635e-05, "loss": 0.4154, "step": 582 }, { "epoch": 0.8788392688901451, "grad_norm": 0.5485556900908343, "learning_rate": 1.3762879701992642e-05, "loss": 0.4327, "step": 583 }, { "epoch": 0.8803467118899566, "grad_norm": 0.49193915962474927, "learning_rate": 1.373846857542208e-05, "loss": 0.4233, "step": 584 }, { "epoch": 0.8818541548897683, "grad_norm": 0.49157440442050665, "learning_rate": 1.3714031524234965e-05, "loss": 0.4255, "step": 585 }, { "epoch": 0.8833615978895798, "grad_norm": 0.5153566919676954, "learning_rate": 1.3689568717891381e-05, "loss": 0.4433, "step": 586 }, { "epoch": 0.8848690408893913, "grad_norm": 0.5151771531878016, "learning_rate": 1.3665080326029997e-05, "loss": 0.4313, "step": 587 }, { "epoch": 0.886376483889203, "grad_norm": 0.5172038128981158, "learning_rate": 1.364056651846693e-05, "loss": 0.4025, "step": 588 }, { "epoch": 0.8878839268890145, "grad_norm": 0.5197034910270297, "learning_rate": 1.3616027465194525e-05, "loss": 0.432, "step": 589 }, { "epoch": 0.889391369888826, "grad_norm": 0.5280686849313844, "learning_rate": 1.35914633363802e-05, "loss": 0.4093, "step": 590 }, { "epoch": 0.8908988128886377, "grad_norm": 0.5192774851448931, "learning_rate": 1.356687430236526e-05, "loss": 0.426, "step": 591 }, { "epoch": 0.8924062558884492, "grad_norm": 0.5407059497728999, "learning_rate": 1.3542260533663723e-05, "loss": 0.4408, "step": 592 }, { "epoch": 0.8939136988882608, "grad_norm": 0.5029787366533781, "learning_rate": 1.351762220096112e-05, "loss": 0.4134, "step": 593 }, { "epoch": 0.8954211418880723, "grad_norm": 0.5557133502339159, "learning_rate": 1.3492959475113332e-05, "loss": 0.4247, "step": 594 }, { "epoch": 0.896928584887884, "grad_norm": 0.5446161829977666, "learning_rate": 1.3468272527145388e-05, "loss": 0.4133, "step": 595 }, { "epoch": 0.8984360278876955, "grad_norm": 0.5055328441209378, "learning_rate": 1.3443561528250295e-05, "loss": 0.3916, "step": 596 }, { "epoch": 0.899943470887507, "grad_norm": 0.5874519416857665, "learning_rate": 1.3418826649787834e-05, "loss": 0.4339, "step": 597 }, { "epoch": 0.9014509138873187, "grad_norm": 0.5577170031704589, "learning_rate": 1.3394068063283387e-05, "loss": 0.458, "step": 598 }, { "epoch": 0.9029583568871302, "grad_norm": 0.5332814444729285, "learning_rate": 1.3369285940426737e-05, "loss": 0.4206, "step": 599 }, { "epoch": 0.9044657998869418, "grad_norm": 0.5654643143753597, "learning_rate": 1.334448045307088e-05, "loss": 0.4113, "step": 600 }, { "epoch": 0.9059732428867533, "grad_norm": 0.4979334800098818, "learning_rate": 1.331965177323084e-05, "loss": 0.4093, "step": 601 }, { "epoch": 0.9074806858865649, "grad_norm": 0.5415874467915235, "learning_rate": 1.3294800073082464e-05, "loss": 0.4366, "step": 602 }, { "epoch": 0.9089881288863765, "grad_norm": 0.5813207766062746, "learning_rate": 1.3269925524961237e-05, "loss": 0.4448, "step": 603 }, { "epoch": 0.910495571886188, "grad_norm": 0.5078359282634053, "learning_rate": 1.3245028301361086e-05, "loss": 0.4161, "step": 604 }, { "epoch": 0.9120030148859997, "grad_norm": 0.5539022471684321, "learning_rate": 1.3220108574933185e-05, "loss": 0.4056, "step": 605 }, { "epoch": 0.9135104578858112, "grad_norm": 0.48460567118259956, "learning_rate": 1.3195166518484748e-05, "loss": 0.4009, "step": 606 }, { "epoch": 0.9150179008856227, "grad_norm": 0.4843343744091719, "learning_rate": 1.317020230497784e-05, "loss": 0.4231, "step": 607 }, { "epoch": 0.9165253438854344, "grad_norm": 0.5190197613843625, "learning_rate": 1.3145216107528178e-05, "loss": 0.4029, "step": 608 }, { "epoch": 0.9180327868852459, "grad_norm": 0.4867573763184133, "learning_rate": 1.3120208099403926e-05, "loss": 0.3801, "step": 609 }, { "epoch": 0.9195402298850575, "grad_norm": 0.48900894299598635, "learning_rate": 1.3095178454024496e-05, "loss": 0.4413, "step": 610 }, { "epoch": 0.921047672884869, "grad_norm": 0.5555266761898254, "learning_rate": 1.3070127344959348e-05, "loss": 0.4144, "step": 611 }, { "epoch": 0.9225551158846806, "grad_norm": 0.5128649153965221, "learning_rate": 1.3045054945926775e-05, "loss": 0.4616, "step": 612 }, { "epoch": 0.9240625588844922, "grad_norm": 0.5194503259126656, "learning_rate": 1.3019961430792711e-05, "loss": 0.4229, "step": 613 }, { "epoch": 0.9255700018843037, "grad_norm": 0.49285532678009114, "learning_rate": 1.2994846973569524e-05, "loss": 0.4165, "step": 614 }, { "epoch": 0.9270774448841154, "grad_norm": 0.5197963588456296, "learning_rate": 1.2969711748414804e-05, "loss": 0.3947, "step": 615 }, { "epoch": 0.9285848878839269, "grad_norm": 0.542725727252665, "learning_rate": 1.2944555929630152e-05, "loss": 0.4261, "step": 616 }, { "epoch": 0.9300923308837384, "grad_norm": 0.5068570325444082, "learning_rate": 1.2919379691659979e-05, "loss": 0.453, "step": 617 }, { "epoch": 0.93159977388355, "grad_norm": 0.5138431602453551, "learning_rate": 1.2894183209090304e-05, "loss": 0.4482, "step": 618 }, { "epoch": 0.9331072168833616, "grad_norm": 0.5098264236378465, "learning_rate": 1.2868966656647522e-05, "loss": 0.4344, "step": 619 }, { "epoch": 0.9346146598831732, "grad_norm": 0.4932368518544031, "learning_rate": 1.2843730209197203e-05, "loss": 0.4444, "step": 620 }, { "epoch": 0.9361221028829847, "grad_norm": 0.48787838834596486, "learning_rate": 1.2818474041742885e-05, "loss": 0.3909, "step": 621 }, { "epoch": 0.9376295458827963, "grad_norm": 0.5042148044417084, "learning_rate": 1.2793198329424858e-05, "loss": 0.4114, "step": 622 }, { "epoch": 0.9391369888826079, "grad_norm": 0.5164275014163481, "learning_rate": 1.2767903247518945e-05, "loss": 0.4042, "step": 623 }, { "epoch": 0.9406444318824194, "grad_norm": 0.4878553181808082, "learning_rate": 1.2742588971435276e-05, "loss": 0.4108, "step": 624 }, { "epoch": 0.9421518748822311, "grad_norm": 0.4953872026297146, "learning_rate": 1.2717255676717106e-05, "loss": 0.4227, "step": 625 }, { "epoch": 0.9436593178820426, "grad_norm": 0.5623597137703112, "learning_rate": 1.2691903539039563e-05, "loss": 0.4436, "step": 626 }, { "epoch": 0.9451667608818541, "grad_norm": 0.539298059881258, "learning_rate": 1.2666532734208437e-05, "loss": 0.4384, "step": 627 }, { "epoch": 0.9466742038816657, "grad_norm": 0.5443120200340641, "learning_rate": 1.264114343815898e-05, "loss": 0.4413, "step": 628 }, { "epoch": 0.9481816468814773, "grad_norm": 0.5142650264217846, "learning_rate": 1.2615735826954664e-05, "loss": 0.4231, "step": 629 }, { "epoch": 0.9496890898812889, "grad_norm": 0.5566560995617864, "learning_rate": 1.2590310076785974e-05, "loss": 0.4458, "step": 630 }, { "epoch": 0.9511965328811004, "grad_norm": 0.484643722468428, "learning_rate": 1.256486636396917e-05, "loss": 0.3868, "step": 631 }, { "epoch": 0.952703975880912, "grad_norm": 0.5278211197592041, "learning_rate": 1.2539404864945087e-05, "loss": 0.3956, "step": 632 }, { "epoch": 0.9542114188807236, "grad_norm": 0.5339784329738423, "learning_rate": 1.2513925756277894e-05, "loss": 0.4065, "step": 633 }, { "epoch": 0.9557188618805351, "grad_norm": 0.4808436521240299, "learning_rate": 1.2488429214653871e-05, "loss": 0.3733, "step": 634 }, { "epoch": 0.9572263048803467, "grad_norm": 0.5245674565988473, "learning_rate": 1.24629154168802e-05, "loss": 0.4206, "step": 635 }, { "epoch": 0.9587337478801583, "grad_norm": 0.5091922264135481, "learning_rate": 1.2437384539883715e-05, "loss": 0.4321, "step": 636 }, { "epoch": 0.9602411908799698, "grad_norm": 0.48729820029525145, "learning_rate": 1.2411836760709686e-05, "loss": 0.3961, "step": 637 }, { "epoch": 0.9617486338797814, "grad_norm": 0.5224677796102979, "learning_rate": 1.2386272256520606e-05, "loss": 0.4094, "step": 638 }, { "epoch": 0.963256076879593, "grad_norm": 0.5291193644566966, "learning_rate": 1.2360691204594937e-05, "loss": 0.4202, "step": 639 }, { "epoch": 0.9647635198794046, "grad_norm": 0.5090746283917961, "learning_rate": 1.2335093782325889e-05, "loss": 0.4115, "step": 640 }, { "epoch": 0.9662709628792161, "grad_norm": 0.49844277614657384, "learning_rate": 1.2309480167220203e-05, "loss": 0.4138, "step": 641 }, { "epoch": 0.9677784058790277, "grad_norm": 0.5085446797250271, "learning_rate": 1.2283850536896907e-05, "loss": 0.4403, "step": 642 }, { "epoch": 0.9692858488788393, "grad_norm": 0.48811956114780947, "learning_rate": 1.2258205069086082e-05, "loss": 0.4132, "step": 643 }, { "epoch": 0.9707932918786508, "grad_norm": 0.5804699645229868, "learning_rate": 1.2232543941627641e-05, "loss": 0.4145, "step": 644 }, { "epoch": 0.9723007348784624, "grad_norm": 0.5223286630706884, "learning_rate": 1.2206867332470091e-05, "loss": 0.4451, "step": 645 }, { "epoch": 0.973808177878274, "grad_norm": 0.5431240213202171, "learning_rate": 1.2181175419669293e-05, "loss": 0.4106, "step": 646 }, { "epoch": 0.9753156208780855, "grad_norm": 0.4788749668502741, "learning_rate": 1.215546838138723e-05, "loss": 0.3947, "step": 647 }, { "epoch": 0.9768230638778971, "grad_norm": 0.4823666614879542, "learning_rate": 1.212974639589078e-05, "loss": 0.3805, "step": 648 }, { "epoch": 0.9783305068777087, "grad_norm": 0.5272835049687891, "learning_rate": 1.2104009641550472e-05, "loss": 0.4192, "step": 649 }, { "epoch": 0.9798379498775203, "grad_norm": 0.4899435333806439, "learning_rate": 1.2078258296839245e-05, "loss": 0.4242, "step": 650 }, { "epoch": 0.9813453928773318, "grad_norm": 0.48267520902055755, "learning_rate": 1.2052492540331218e-05, "loss": 0.3819, "step": 651 }, { "epoch": 0.9828528358771434, "grad_norm": 0.5208472855722491, "learning_rate": 1.2026712550700457e-05, "loss": 0.4268, "step": 652 }, { "epoch": 0.984360278876955, "grad_norm": 0.5182048450359825, "learning_rate": 1.200091850671972e-05, "loss": 0.3833, "step": 653 }, { "epoch": 0.9858677218767665, "grad_norm": 0.5524884939555313, "learning_rate": 1.1975110587259222e-05, "loss": 0.4099, "step": 654 }, { "epoch": 0.9873751648765781, "grad_norm": 0.5724743146915252, "learning_rate": 1.1949288971285411e-05, "loss": 0.4451, "step": 655 }, { "epoch": 0.9888826078763897, "grad_norm": 0.5560489536019798, "learning_rate": 1.1923453837859706e-05, "loss": 0.4245, "step": 656 }, { "epoch": 0.9903900508762012, "grad_norm": 0.5241693566205756, "learning_rate": 1.1897605366137264e-05, "loss": 0.426, "step": 657 }, { "epoch": 0.9918974938760128, "grad_norm": 0.5078011601273249, "learning_rate": 1.1871743735365735e-05, "loss": 0.4147, "step": 658 }, { "epoch": 0.9934049368758244, "grad_norm": 0.5014207467428378, "learning_rate": 1.1845869124884027e-05, "loss": 0.4029, "step": 659 }, { "epoch": 0.994912379875636, "grad_norm": 0.5184450473918536, "learning_rate": 1.1819981714121054e-05, "loss": 0.4338, "step": 660 }, { "epoch": 0.9964198228754475, "grad_norm": 0.5218529509897015, "learning_rate": 1.1794081682594491e-05, "loss": 0.4001, "step": 661 }, { "epoch": 0.997927265875259, "grad_norm": 0.5277285874094648, "learning_rate": 1.176816920990954e-05, "loss": 0.4225, "step": 662 }, { "epoch": 0.9994347088750707, "grad_norm": 0.506018413554039, "learning_rate": 1.174224447575767e-05, "loss": 0.4398, "step": 663 }, { "epoch": 1.0009421518748822, "grad_norm": 0.6655724719416495, "learning_rate": 1.171630765991538e-05, "loss": 0.377, "step": 664 }, { "epoch": 1.0024495948746939, "grad_norm": 0.5752880840432146, "learning_rate": 1.169035894224295e-05, "loss": 0.325, "step": 665 }, { "epoch": 1.0039570378745053, "grad_norm": 0.547046172496627, "learning_rate": 1.1664398502683194e-05, "loss": 0.3422, "step": 666 }, { "epoch": 1.005464480874317, "grad_norm": 0.6183956576461548, "learning_rate": 1.1638426521260211e-05, "loss": 0.3551, "step": 667 }, { "epoch": 1.0069719238741286, "grad_norm": 0.6272202909322583, "learning_rate": 1.1612443178078138e-05, "loss": 0.3293, "step": 668 }, { "epoch": 1.00847936687394, "grad_norm": 0.6733584649632783, "learning_rate": 1.1586448653319908e-05, "loss": 0.3495, "step": 669 }, { "epoch": 1.0099868098737517, "grad_norm": 0.5974677434978642, "learning_rate": 1.156044312724598e-05, "loss": 0.3339, "step": 670 }, { "epoch": 1.0114942528735633, "grad_norm": 0.5526640261136243, "learning_rate": 1.153442678019311e-05, "loss": 0.3629, "step": 671 }, { "epoch": 1.0130016958733747, "grad_norm": 0.5666634240071511, "learning_rate": 1.1508399792573095e-05, "loss": 0.3361, "step": 672 }, { "epoch": 1.0145091388731864, "grad_norm": 0.6224882966351409, "learning_rate": 1.1482362344871514e-05, "loss": 0.3545, "step": 673 }, { "epoch": 1.0160165818729978, "grad_norm": 0.5615749930186623, "learning_rate": 1.1456314617646482e-05, "loss": 0.3208, "step": 674 }, { "epoch": 1.0175240248728095, "grad_norm": 0.548490348465347, "learning_rate": 1.1430256791527406e-05, "loss": 0.3278, "step": 675 }, { "epoch": 1.019031467872621, "grad_norm": 0.6137191595237155, "learning_rate": 1.1404189047213716e-05, "loss": 0.3684, "step": 676 }, { "epoch": 1.0205389108724325, "grad_norm": 0.6128432091688398, "learning_rate": 1.137811156547362e-05, "loss": 0.3479, "step": 677 }, { "epoch": 1.0220463538722442, "grad_norm": 0.5530398492501923, "learning_rate": 1.1352024527142855e-05, "loss": 0.3258, "step": 678 }, { "epoch": 1.0235537968720558, "grad_norm": 0.5691801541559598, "learning_rate": 1.1325928113123431e-05, "loss": 0.3359, "step": 679 }, { "epoch": 1.0250612398718673, "grad_norm": 0.5996898750429057, "learning_rate": 1.129982250438237e-05, "loss": 0.34, "step": 680 }, { "epoch": 1.026568682871679, "grad_norm": 0.5203916917045198, "learning_rate": 1.1273707881950445e-05, "loss": 0.3194, "step": 681 }, { "epoch": 1.0280761258714906, "grad_norm": 0.5706678991613441, "learning_rate": 1.1247584426920962e-05, "loss": 0.3394, "step": 682 }, { "epoch": 1.029583568871302, "grad_norm": 0.558797423405198, "learning_rate": 1.1221452320448449e-05, "loss": 0.3476, "step": 683 }, { "epoch": 1.0310910118711136, "grad_norm": 0.5491796357132722, "learning_rate": 1.1195311743747445e-05, "loss": 0.3287, "step": 684 }, { "epoch": 1.0325984548709253, "grad_norm": 0.5423270097914835, "learning_rate": 1.116916287809122e-05, "loss": 0.3315, "step": 685 }, { "epoch": 1.0341058978707367, "grad_norm": 0.5440784988767636, "learning_rate": 1.1143005904810527e-05, "loss": 0.3409, "step": 686 }, { "epoch": 1.0356133408705483, "grad_norm": 0.5506460404964368, "learning_rate": 1.1116841005292339e-05, "loss": 0.3665, "step": 687 }, { "epoch": 1.03712078387036, "grad_norm": 0.5271450898091751, "learning_rate": 1.1090668360978589e-05, "loss": 0.3354, "step": 688 }, { "epoch": 1.0386282268701714, "grad_norm": 0.5116723363561022, "learning_rate": 1.106448815336493e-05, "loss": 0.3055, "step": 689 }, { "epoch": 1.040135669869983, "grad_norm": 0.5261827472069973, "learning_rate": 1.1038300563999455e-05, "loss": 0.3141, "step": 690 }, { "epoch": 1.0416431128697947, "grad_norm": 0.5675715863653521, "learning_rate": 1.1012105774481446e-05, "loss": 0.3576, "step": 691 }, { "epoch": 1.0431505558696061, "grad_norm": 0.542765155631167, "learning_rate": 1.0985903966460115e-05, "loss": 0.337, "step": 692 }, { "epoch": 1.0446579988694178, "grad_norm": 0.576467518182856, "learning_rate": 1.0959695321633346e-05, "loss": 0.3345, "step": 693 }, { "epoch": 1.0461654418692292, "grad_norm": 0.5261227763098979, "learning_rate": 1.0933480021746432e-05, "loss": 0.3137, "step": 694 }, { "epoch": 1.0476728848690409, "grad_norm": 0.5529375328569147, "learning_rate": 1.0907258248590816e-05, "loss": 0.332, "step": 695 }, { "epoch": 1.0491803278688525, "grad_norm": 0.5136240223834705, "learning_rate": 1.0881030184002827e-05, "loss": 0.3276, "step": 696 }, { "epoch": 1.050687770868664, "grad_norm": 0.5367848385477425, "learning_rate": 1.0854796009862434e-05, "loss": 0.3163, "step": 697 }, { "epoch": 1.0521952138684756, "grad_norm": 0.544930166455388, "learning_rate": 1.0828555908091958e-05, "loss": 0.359, "step": 698 }, { "epoch": 1.0537026568682872, "grad_norm": 0.5387564352002492, "learning_rate": 1.0802310060654832e-05, "loss": 0.339, "step": 699 }, { "epoch": 1.0552100998680987, "grad_norm": 0.5496802508408758, "learning_rate": 1.0776058649554336e-05, "loss": 0.3535, "step": 700 }, { "epoch": 1.0567175428679103, "grad_norm": 0.5348548485090446, "learning_rate": 1.0749801856832325e-05, "loss": 0.3368, "step": 701 }, { "epoch": 1.058224985867722, "grad_norm": 0.5794289951348468, "learning_rate": 1.0723539864567983e-05, "loss": 0.3596, "step": 702 }, { "epoch": 1.0597324288675334, "grad_norm": 0.5365708234277743, "learning_rate": 1.0697272854876537e-05, "loss": 0.3421, "step": 703 }, { "epoch": 1.061239871867345, "grad_norm": 0.5904399117303262, "learning_rate": 1.0671001009908015e-05, "loss": 0.3348, "step": 704 }, { "epoch": 1.0627473148671567, "grad_norm": 0.5204976732643493, "learning_rate": 1.0644724511845976e-05, "loss": 0.3525, "step": 705 }, { "epoch": 1.064254757866968, "grad_norm": 0.6120309774969117, "learning_rate": 1.0618443542906251e-05, "loss": 0.3727, "step": 706 }, { "epoch": 1.0657622008667798, "grad_norm": 0.6091575812702822, "learning_rate": 1.059215828533566e-05, "loss": 0.3588, "step": 707 }, { "epoch": 1.0672696438665912, "grad_norm": 0.591151755333861, "learning_rate": 1.0565868921410776e-05, "loss": 0.3363, "step": 708 }, { "epoch": 1.0687770868664028, "grad_norm": 0.5984602535754296, "learning_rate": 1.0539575633436645e-05, "loss": 0.3616, "step": 709 }, { "epoch": 1.0702845298662145, "grad_norm": 0.5604228857922577, "learning_rate": 1.0513278603745523e-05, "loss": 0.3398, "step": 710 }, { "epoch": 1.071791972866026, "grad_norm": 0.5557030870304388, "learning_rate": 1.0486978014695606e-05, "loss": 0.338, "step": 711 }, { "epoch": 1.0732994158658375, "grad_norm": 0.5730991612503363, "learning_rate": 1.0460674048669783e-05, "loss": 0.3219, "step": 712 }, { "epoch": 1.0748068588656492, "grad_norm": 0.6009828081011681, "learning_rate": 1.0434366888074363e-05, "loss": 0.3237, "step": 713 }, { "epoch": 1.0763143018654606, "grad_norm": 0.5386294130513889, "learning_rate": 1.0408056715337797e-05, "loss": 0.3391, "step": 714 }, { "epoch": 1.0778217448652723, "grad_norm": 0.5345878263288965, "learning_rate": 1.0381743712909424e-05, "loss": 0.3384, "step": 715 }, { "epoch": 1.079329187865084, "grad_norm": 0.6369538253688138, "learning_rate": 1.0355428063258224e-05, "loss": 0.35, "step": 716 }, { "epoch": 1.0808366308648953, "grad_norm": 0.5615591275271141, "learning_rate": 1.0329109948871512e-05, "loss": 0.3467, "step": 717 }, { "epoch": 1.082344073864707, "grad_norm": 0.6406352309238248, "learning_rate": 1.0302789552253702e-05, "loss": 0.3523, "step": 718 }, { "epoch": 1.0838515168645186, "grad_norm": 0.5212977047595297, "learning_rate": 1.0276467055925044e-05, "loss": 0.3185, "step": 719 }, { "epoch": 1.08535895986433, "grad_norm": 0.5443802073020193, "learning_rate": 1.0250142642420335e-05, "loss": 0.3396, "step": 720 }, { "epoch": 1.0868664028641417, "grad_norm": 0.5516128695838226, "learning_rate": 1.0223816494287675e-05, "loss": 0.3199, "step": 721 }, { "epoch": 1.0883738458639534, "grad_norm": 0.5459335385131995, "learning_rate": 1.0197488794087188e-05, "loss": 0.2979, "step": 722 }, { "epoch": 1.0898812888637648, "grad_norm": 0.5660471338581954, "learning_rate": 1.0171159724389766e-05, "loss": 0.3578, "step": 723 }, { "epoch": 1.0913887318635764, "grad_norm": 0.577383627814168, "learning_rate": 1.0144829467775794e-05, "loss": 0.3253, "step": 724 }, { "epoch": 1.092896174863388, "grad_norm": 0.5656943231881854, "learning_rate": 1.0118498206833886e-05, "loss": 0.3559, "step": 725 }, { "epoch": 1.0944036178631995, "grad_norm": 0.5427797556871369, "learning_rate": 1.0092166124159628e-05, "loss": 0.3299, "step": 726 }, { "epoch": 1.0959110608630112, "grad_norm": 0.5582731085039236, "learning_rate": 1.0065833402354302e-05, "loss": 0.342, "step": 727 }, { "epoch": 1.0974185038628228, "grad_norm": 0.5809252708008414, "learning_rate": 1.003950022402361e-05, "loss": 0.3553, "step": 728 }, { "epoch": 1.0989259468626342, "grad_norm": 0.5400373499865376, "learning_rate": 1.0013166771776441e-05, "loss": 0.3283, "step": 729 }, { "epoch": 1.1004333898624459, "grad_norm": 0.5280335723569519, "learning_rate": 9.986833228223562e-06, "loss": 0.3567, "step": 730 }, { "epoch": 1.1019408328622573, "grad_norm": 0.5756207231701386, "learning_rate": 9.96049977597639e-06, "loss": 0.3422, "step": 731 }, { "epoch": 1.103448275862069, "grad_norm": 0.5917844968799806, "learning_rate": 9.934166597645703e-06, "loss": 0.3803, "step": 732 }, { "epoch": 1.1049557188618806, "grad_norm": 0.5375048549376539, "learning_rate": 9.907833875840374e-06, "loss": 0.3421, "step": 733 }, { "epoch": 1.106463161861692, "grad_norm": 0.5685857382900585, "learning_rate": 9.881501793166117e-06, "loss": 0.3658, "step": 734 }, { "epoch": 1.1079706048615037, "grad_norm": 0.6069823667279429, "learning_rate": 9.85517053222421e-06, "loss": 0.3273, "step": 735 }, { "epoch": 1.1094780478613153, "grad_norm": 0.5196609920900314, "learning_rate": 9.82884027561024e-06, "loss": 0.3233, "step": 736 }, { "epoch": 1.1109854908611267, "grad_norm": 0.5702922246540342, "learning_rate": 9.802511205912815e-06, "loss": 0.35, "step": 737 }, { "epoch": 1.1124929338609384, "grad_norm": 0.563216447988931, "learning_rate": 9.776183505712327e-06, "loss": 0.3578, "step": 738 }, { "epoch": 1.11400037686075, "grad_norm": 0.5835000476343075, "learning_rate": 9.749857357579667e-06, "loss": 0.3753, "step": 739 }, { "epoch": 1.1155078198605615, "grad_norm": 0.5822012862085456, "learning_rate": 9.723532944074961e-06, "loss": 0.3035, "step": 740 }, { "epoch": 1.1170152628603731, "grad_norm": 0.5625362231656639, "learning_rate": 9.6972104477463e-06, "loss": 0.3669, "step": 741 }, { "epoch": 1.1185227058601848, "grad_norm": 0.5816421569187623, "learning_rate": 9.670890051128493e-06, "loss": 0.3264, "step": 742 }, { "epoch": 1.1200301488599962, "grad_norm": 0.6076866614497781, "learning_rate": 9.644571936741778e-06, "loss": 0.3448, "step": 743 }, { "epoch": 1.1215375918598078, "grad_norm": 0.5868211335333723, "learning_rate": 9.618256287090576e-06, "loss": 0.3453, "step": 744 }, { "epoch": 1.1230450348596195, "grad_norm": 0.5784910781884745, "learning_rate": 9.591943284662206e-06, "loss": 0.3543, "step": 745 }, { "epoch": 1.124552477859431, "grad_norm": 0.5577968039251089, "learning_rate": 9.56563311192564e-06, "loss": 0.356, "step": 746 }, { "epoch": 1.1260599208592426, "grad_norm": 0.5624603535612774, "learning_rate": 9.53932595133022e-06, "loss": 0.322, "step": 747 }, { "epoch": 1.127567363859054, "grad_norm": 0.5863194939952109, "learning_rate": 9.513021985304399e-06, "loss": 0.341, "step": 748 }, { "epoch": 1.1290748068588656, "grad_norm": 0.5297072497497793, "learning_rate": 9.486721396254484e-06, "loss": 0.3263, "step": 749 }, { "epoch": 1.1305822498586773, "grad_norm": 0.5597259831895821, "learning_rate": 9.460424366563355e-06, "loss": 0.3243, "step": 750 }, { "epoch": 1.1320896928584887, "grad_norm": 0.5464179018975297, "learning_rate": 9.434131078589224e-06, "loss": 0.3206, "step": 751 }, { "epoch": 1.1335971358583004, "grad_norm": 0.5464450895465798, "learning_rate": 9.407841714664343e-06, "loss": 0.3387, "step": 752 }, { "epoch": 1.135104578858112, "grad_norm": 0.5546542012199714, "learning_rate": 9.381556457093752e-06, "loss": 0.337, "step": 753 }, { "epoch": 1.1366120218579234, "grad_norm": 0.5753540187155672, "learning_rate": 9.355275488154025e-06, "loss": 0.3644, "step": 754 }, { "epoch": 1.138119464857735, "grad_norm": 0.571883771055452, "learning_rate": 9.32899899009199e-06, "loss": 0.332, "step": 755 }, { "epoch": 1.1396269078575467, "grad_norm": 0.5383157533846678, "learning_rate": 9.30272714512347e-06, "loss": 0.3397, "step": 756 }, { "epoch": 1.1411343508573581, "grad_norm": 0.564086061412075, "learning_rate": 9.276460135432019e-06, "loss": 0.3592, "step": 757 }, { "epoch": 1.1426417938571698, "grad_norm": 0.510671608636206, "learning_rate": 9.250198143167675e-06, "loss": 0.3301, "step": 758 }, { "epoch": 1.1441492368569812, "grad_norm": 0.5323793942216957, "learning_rate": 9.223941350445666e-06, "loss": 0.3341, "step": 759 }, { "epoch": 1.1456566798567929, "grad_norm": 0.5359098725485097, "learning_rate": 9.19768993934517e-06, "loss": 0.3214, "step": 760 }, { "epoch": 1.1471641228566045, "grad_norm": 0.5451361788825891, "learning_rate": 9.171444091908046e-06, "loss": 0.3195, "step": 761 }, { "epoch": 1.1486715658564162, "grad_norm": 0.5246946467273069, "learning_rate": 9.145203990137571e-06, "loss": 0.3417, "step": 762 }, { "epoch": 1.1501790088562276, "grad_norm": 0.5919155354849388, "learning_rate": 9.118969815997174e-06, "loss": 0.3417, "step": 763 }, { "epoch": 1.1516864518560392, "grad_norm": 0.549339616533448, "learning_rate": 9.092741751409186e-06, "loss": 0.3323, "step": 764 }, { "epoch": 1.1531938948558507, "grad_norm": 0.5520952523067345, "learning_rate": 9.06651997825357e-06, "loss": 0.325, "step": 765 }, { "epoch": 1.1547013378556623, "grad_norm": 0.5546050111571403, "learning_rate": 9.040304678366658e-06, "loss": 0.3798, "step": 766 }, { "epoch": 1.156208780855474, "grad_norm": 0.5355016099382737, "learning_rate": 9.014096033539889e-06, "loss": 0.3324, "step": 767 }, { "epoch": 1.1577162238552854, "grad_norm": 0.5063535090434689, "learning_rate": 8.987894225518556e-06, "loss": 0.3098, "step": 768 }, { "epoch": 1.159223666855097, "grad_norm": 0.538083701203612, "learning_rate": 8.961699436000548e-06, "loss": 0.3378, "step": 769 }, { "epoch": 1.1607311098549087, "grad_norm": 0.5611833134978637, "learning_rate": 8.93551184663507e-06, "loss": 0.3291, "step": 770 }, { "epoch": 1.16223855285472, "grad_norm": 0.5334833426063799, "learning_rate": 8.909331639021414e-06, "loss": 0.3265, "step": 771 }, { "epoch": 1.1637459958545318, "grad_norm": 0.5370028500892263, "learning_rate": 8.883158994707666e-06, "loss": 0.353, "step": 772 }, { "epoch": 1.1652534388543434, "grad_norm": 0.5564851227581507, "learning_rate": 8.856994095189477e-06, "loss": 0.314, "step": 773 }, { "epoch": 1.1667608818541548, "grad_norm": 0.5516816420442727, "learning_rate": 8.830837121908783e-06, "loss": 0.3459, "step": 774 }, { "epoch": 1.1682683248539665, "grad_norm": 0.546010896691211, "learning_rate": 8.804688256252557e-06, "loss": 0.3564, "step": 775 }, { "epoch": 1.1697757678537781, "grad_norm": 0.5204242216440147, "learning_rate": 8.778547679551555e-06, "loss": 0.3093, "step": 776 }, { "epoch": 1.1712832108535896, "grad_norm": 0.5530932960461594, "learning_rate": 8.75241557307904e-06, "loss": 0.3169, "step": 777 }, { "epoch": 1.1727906538534012, "grad_norm": 0.5433657189299205, "learning_rate": 8.726292118049555e-06, "loss": 0.3238, "step": 778 }, { "epoch": 1.1742980968532128, "grad_norm": 0.536612091168906, "learning_rate": 8.700177495617635e-06, "loss": 0.3375, "step": 779 }, { "epoch": 1.1758055398530243, "grad_norm": 0.5547355998217709, "learning_rate": 8.674071886876572e-06, "loss": 0.3285, "step": 780 }, { "epoch": 1.177312982852836, "grad_norm": 0.6048276095962777, "learning_rate": 8.647975472857148e-06, "loss": 0.3704, "step": 781 }, { "epoch": 1.1788204258526473, "grad_norm": 0.571295755561053, "learning_rate": 8.621888434526382e-06, "loss": 0.374, "step": 782 }, { "epoch": 1.180327868852459, "grad_norm": 0.5189210618741348, "learning_rate": 8.595810952786289e-06, "loss": 0.3247, "step": 783 }, { "epoch": 1.1818353118522706, "grad_norm": 0.5407807768349286, "learning_rate": 8.569743208472594e-06, "loss": 0.318, "step": 784 }, { "epoch": 1.183342754852082, "grad_norm": 0.5555658150734397, "learning_rate": 8.543685382353518e-06, "loss": 0.342, "step": 785 }, { "epoch": 1.1848501978518937, "grad_norm": 0.5478498420172522, "learning_rate": 8.51763765512849e-06, "loss": 0.3386, "step": 786 }, { "epoch": 1.1863576408517054, "grad_norm": 0.5229096665922429, "learning_rate": 8.491600207426907e-06, "loss": 0.3218, "step": 787 }, { "epoch": 1.1878650838515168, "grad_norm": 0.5706786350698708, "learning_rate": 8.465573219806893e-06, "loss": 0.3491, "step": 788 }, { "epoch": 1.1893725268513284, "grad_norm": 0.5792169640912351, "learning_rate": 8.439556872754025e-06, "loss": 0.3482, "step": 789 }, { "epoch": 1.19087996985114, "grad_norm": 0.5219487046954187, "learning_rate": 8.413551346680095e-06, "loss": 0.3183, "step": 790 }, { "epoch": 1.1923874128509515, "grad_norm": 0.5680359320055756, "learning_rate": 8.38755682192186e-06, "loss": 0.3257, "step": 791 }, { "epoch": 1.1938948558507632, "grad_norm": 0.54324153892485, "learning_rate": 8.36157347873979e-06, "loss": 0.3151, "step": 792 }, { "epoch": 1.1954022988505748, "grad_norm": 0.5584145695371312, "learning_rate": 8.335601497316809e-06, "loss": 0.3474, "step": 793 }, { "epoch": 1.1969097418503862, "grad_norm": 0.5414937178807059, "learning_rate": 8.309641057757052e-06, "loss": 0.3348, "step": 794 }, { "epoch": 1.1984171848501979, "grad_norm": 0.5933495505366142, "learning_rate": 8.283692340084623e-06, "loss": 0.3743, "step": 795 }, { "epoch": 1.1999246278500095, "grad_norm": 0.5730820400742883, "learning_rate": 8.257755524242333e-06, "loss": 0.3437, "step": 796 }, { "epoch": 1.201432070849821, "grad_norm": 0.5154842086228131, "learning_rate": 8.231830790090461e-06, "loss": 0.3271, "step": 797 }, { "epoch": 1.2029395138496326, "grad_norm": 0.5445619151521616, "learning_rate": 8.205918317405508e-06, "loss": 0.3229, "step": 798 }, { "epoch": 1.204446956849444, "grad_norm": 0.6016710522110904, "learning_rate": 8.18001828587895e-06, "loss": 0.3609, "step": 799 }, { "epoch": 1.2059543998492557, "grad_norm": 0.5457762036159068, "learning_rate": 8.154130875115978e-06, "loss": 0.318, "step": 800 }, { "epoch": 1.2074618428490673, "grad_norm": 0.5404902176604001, "learning_rate": 8.12825626463427e-06, "loss": 0.3323, "step": 801 }, { "epoch": 1.2089692858488787, "grad_norm": 0.5722847292063646, "learning_rate": 8.102394633862743e-06, "loss": 0.3147, "step": 802 }, { "epoch": 1.2104767288486904, "grad_norm": 0.5531842770730636, "learning_rate": 8.0765461621403e-06, "loss": 0.331, "step": 803 }, { "epoch": 1.211984171848502, "grad_norm": 0.5256780853712785, "learning_rate": 8.050711028714589e-06, "loss": 0.3176, "step": 804 }, { "epoch": 1.2134916148483135, "grad_norm": 0.6144603881477418, "learning_rate": 8.02488941274078e-06, "loss": 0.3383, "step": 805 }, { "epoch": 1.2149990578481251, "grad_norm": 0.571788365434139, "learning_rate": 7.999081493280283e-06, "loss": 0.3258, "step": 806 }, { "epoch": 1.2165065008479368, "grad_norm": 0.5982762464323738, "learning_rate": 7.973287449299545e-06, "loss": 0.3503, "step": 807 }, { "epoch": 1.2180139438477482, "grad_norm": 0.5363356894959806, "learning_rate": 7.947507459668784e-06, "loss": 0.3436, "step": 808 }, { "epoch": 1.2195213868475598, "grad_norm": 0.5730894211276505, "learning_rate": 7.921741703160758e-06, "loss": 0.3584, "step": 809 }, { "epoch": 1.2210288298473715, "grad_norm": 0.563926690224309, "learning_rate": 7.895990358449533e-06, "loss": 0.3291, "step": 810 }, { "epoch": 1.222536272847183, "grad_norm": 0.5254920217508706, "learning_rate": 7.87025360410922e-06, "loss": 0.316, "step": 811 }, { "epoch": 1.2240437158469946, "grad_norm": 0.5313261676986573, "learning_rate": 7.844531618612772e-06, "loss": 0.3319, "step": 812 }, { "epoch": 1.2255511588468062, "grad_norm": 0.5790168954324271, "learning_rate": 7.81882458033071e-06, "loss": 0.3202, "step": 813 }, { "epoch": 1.2270586018466176, "grad_norm": 0.5385942852927429, "learning_rate": 7.79313266752991e-06, "loss": 0.3259, "step": 814 }, { "epoch": 1.2285660448464293, "grad_norm": 0.5551972130449111, "learning_rate": 7.767456058372362e-06, "loss": 0.3385, "step": 815 }, { "epoch": 1.2300734878462407, "grad_norm": 0.5322043372006761, "learning_rate": 7.741794930913922e-06, "loss": 0.3215, "step": 816 }, { "epoch": 1.2315809308460524, "grad_norm": 0.5541120887430956, "learning_rate": 7.7161494631031e-06, "loss": 0.3428, "step": 817 }, { "epoch": 1.233088373845864, "grad_norm": 0.5527885462222231, "learning_rate": 7.690519832779799e-06, "loss": 0.3389, "step": 818 }, { "epoch": 1.2345958168456754, "grad_norm": 0.5406331170872595, "learning_rate": 7.664906217674115e-06, "loss": 0.3112, "step": 819 }, { "epoch": 1.236103259845487, "grad_norm": 0.5055150883042695, "learning_rate": 7.639308795405066e-06, "loss": 0.3202, "step": 820 }, { "epoch": 1.2376107028452987, "grad_norm": 0.5563269801825349, "learning_rate": 7.613727743479395e-06, "loss": 0.3571, "step": 821 }, { "epoch": 1.2391181458451102, "grad_norm": 0.5792057709615847, "learning_rate": 7.588163239290316e-06, "loss": 0.3329, "step": 822 }, { "epoch": 1.2406255888449218, "grad_norm": 0.5666249401867434, "learning_rate": 7.562615460116289e-06, "loss": 0.351, "step": 823 }, { "epoch": 1.2421330318447334, "grad_norm": 0.5265355387938444, "learning_rate": 7.537084583119802e-06, "loss": 0.3701, "step": 824 }, { "epoch": 1.2436404748445449, "grad_norm": 0.5495841716595921, "learning_rate": 7.511570785346129e-06, "loss": 0.329, "step": 825 }, { "epoch": 1.2451479178443565, "grad_norm": 0.5587199026990006, "learning_rate": 7.486074243722109e-06, "loss": 0.3252, "step": 826 }, { "epoch": 1.2466553608441682, "grad_norm": 0.5211341468152613, "learning_rate": 7.460595135054916e-06, "loss": 0.3311, "step": 827 }, { "epoch": 1.2481628038439796, "grad_norm": 0.5364245497529563, "learning_rate": 7.435133636030831e-06, "loss": 0.3208, "step": 828 }, { "epoch": 1.2496702468437912, "grad_norm": 0.5314247345107659, "learning_rate": 7.4096899232140295e-06, "loss": 0.3317, "step": 829 }, { "epoch": 1.251177689843603, "grad_norm": 0.5630710895853528, "learning_rate": 7.384264173045339e-06, "loss": 0.3351, "step": 830 }, { "epoch": 1.2526851328434143, "grad_norm": 0.5197283769421239, "learning_rate": 7.358856561841021e-06, "loss": 0.3065, "step": 831 }, { "epoch": 1.254192575843226, "grad_norm": 0.5568788382198039, "learning_rate": 7.333467265791563e-06, "loss": 0.351, "step": 832 }, { "epoch": 1.2557000188430374, "grad_norm": 0.5725220505007355, "learning_rate": 7.308096460960441e-06, "loss": 0.3439, "step": 833 }, { "epoch": 1.257207461842849, "grad_norm": 0.5304098730159461, "learning_rate": 7.282744323282895e-06, "loss": 0.3188, "step": 834 }, { "epoch": 1.2587149048426607, "grad_norm": 0.5263594760039901, "learning_rate": 7.2574110285647244e-06, "loss": 0.3209, "step": 835 }, { "epoch": 1.2602223478424723, "grad_norm": 0.6039158733618325, "learning_rate": 7.232096752481061e-06, "loss": 0.3366, "step": 836 }, { "epoch": 1.2617297908422838, "grad_norm": 0.5807414247418556, "learning_rate": 7.206801670575145e-06, "loss": 0.3446, "step": 837 }, { "epoch": 1.2632372338420954, "grad_norm": 0.5398549865816707, "learning_rate": 7.181525958257116e-06, "loss": 0.2976, "step": 838 }, { "epoch": 1.2647446768419068, "grad_norm": 0.5502842121004295, "learning_rate": 7.156269790802801e-06, "loss": 0.3308, "step": 839 }, { "epoch": 1.2662521198417185, "grad_norm": 0.5520318040890088, "learning_rate": 7.131033343352483e-06, "loss": 0.3347, "step": 840 }, { "epoch": 1.2677595628415301, "grad_norm": 0.5430821284421434, "learning_rate": 7.105816790909699e-06, "loss": 0.3199, "step": 841 }, { "epoch": 1.2692670058413416, "grad_norm": 0.5268656785617308, "learning_rate": 7.080620308340024e-06, "loss": 0.3368, "step": 842 }, { "epoch": 1.2707744488411532, "grad_norm": 0.5488558866283424, "learning_rate": 7.055444070369852e-06, "loss": 0.3185, "step": 843 }, { "epoch": 1.2722818918409646, "grad_norm": 0.5234636191148432, "learning_rate": 7.0302882515852025e-06, "loss": 0.3156, "step": 844 }, { "epoch": 1.2737893348407763, "grad_norm": 0.5571922620156962, "learning_rate": 7.005153026430476e-06, "loss": 0.3475, "step": 845 }, { "epoch": 1.275296777840588, "grad_norm": 0.6043172841328527, "learning_rate": 6.980038569207291e-06, "loss": 0.3535, "step": 846 }, { "epoch": 1.2768042208403996, "grad_norm": 0.5449462283830545, "learning_rate": 6.954945054073228e-06, "loss": 0.3485, "step": 847 }, { "epoch": 1.278311663840211, "grad_norm": 0.553125976275942, "learning_rate": 6.929872655040655e-06, "loss": 0.3392, "step": 848 }, { "epoch": 1.2798191068400226, "grad_norm": 0.5313032640250875, "learning_rate": 6.904821545975507e-06, "loss": 0.3533, "step": 849 }, { "epoch": 1.281326549839834, "grad_norm": 0.5461530058972931, "learning_rate": 6.879791900596077e-06, "loss": 0.3082, "step": 850 }, { "epoch": 1.2828339928396457, "grad_norm": 0.5268975792503748, "learning_rate": 6.854783892471823e-06, "loss": 0.3507, "step": 851 }, { "epoch": 1.2843414358394574, "grad_norm": 0.525335909935522, "learning_rate": 6.829797695022163e-06, "loss": 0.3137, "step": 852 }, { "epoch": 1.285848878839269, "grad_norm": 0.5439698304073414, "learning_rate": 6.804833481515256e-06, "loss": 0.3269, "step": 853 }, { "epoch": 1.2873563218390804, "grad_norm": 0.5426503592650488, "learning_rate": 6.7798914250668154e-06, "loss": 0.3255, "step": 854 }, { "epoch": 1.288863764838892, "grad_norm": 0.546578985401071, "learning_rate": 6.7549716986389146e-06, "loss": 0.3357, "step": 855 }, { "epoch": 1.2903712078387035, "grad_norm": 0.5433998763126892, "learning_rate": 6.730074475038766e-06, "loss": 0.3316, "step": 856 }, { "epoch": 1.2918786508385152, "grad_norm": 0.5364588967630985, "learning_rate": 6.7051999269175405e-06, "loss": 0.3305, "step": 857 }, { "epoch": 1.2933860938383268, "grad_norm": 0.5658934841388523, "learning_rate": 6.680348226769162e-06, "loss": 0.329, "step": 858 }, { "epoch": 1.2948935368381382, "grad_norm": 0.5643062239325746, "learning_rate": 6.655519546929121e-06, "loss": 0.3297, "step": 859 }, { "epoch": 1.2964009798379499, "grad_norm": 0.5371342456598566, "learning_rate": 6.630714059573267e-06, "loss": 0.3411, "step": 860 }, { "epoch": 1.2979084228377613, "grad_norm": 0.5429869820067992, "learning_rate": 6.6059319367166165e-06, "loss": 0.3162, "step": 861 }, { "epoch": 1.299415865837573, "grad_norm": 0.6163498341710386, "learning_rate": 6.581173350212169e-06, "loss": 0.3346, "step": 862 }, { "epoch": 1.3009233088373846, "grad_norm": 0.5249574401357171, "learning_rate": 6.55643847174971e-06, "loss": 0.3184, "step": 863 }, { "epoch": 1.3024307518371963, "grad_norm": 0.5652427669527782, "learning_rate": 6.531727472854617e-06, "loss": 0.3277, "step": 864 }, { "epoch": 1.3039381948370077, "grad_norm": 0.5499255875094143, "learning_rate": 6.507040524886672e-06, "loss": 0.3099, "step": 865 }, { "epoch": 1.3054456378368193, "grad_norm": 0.5395982289283698, "learning_rate": 6.482377799038882e-06, "loss": 0.312, "step": 866 }, { "epoch": 1.3069530808366308, "grad_norm": 0.5425266392409812, "learning_rate": 6.45773946633628e-06, "loss": 0.3288, "step": 867 }, { "epoch": 1.3084605238364424, "grad_norm": 0.5289252666187554, "learning_rate": 6.4331256976347434e-06, "loss": 0.3143, "step": 868 }, { "epoch": 1.309967966836254, "grad_norm": 0.5829209174715098, "learning_rate": 6.408536663619803e-06, "loss": 0.3215, "step": 869 }, { "epoch": 1.3114754098360657, "grad_norm": 0.5955771972087047, "learning_rate": 6.383972534805478e-06, "loss": 0.3232, "step": 870 }, { "epoch": 1.3129828528358771, "grad_norm": 0.5434757595303122, "learning_rate": 6.359433481533074e-06, "loss": 0.318, "step": 871 }, { "epoch": 1.3144902958356888, "grad_norm": 0.5313303734643687, "learning_rate": 6.3349196739700024e-06, "loss": 0.3159, "step": 872 }, { "epoch": 1.3159977388355002, "grad_norm": 0.5678985167703594, "learning_rate": 6.310431282108622e-06, "loss": 0.3757, "step": 873 }, { "epoch": 1.3175051818353118, "grad_norm": 0.5648846998829979, "learning_rate": 6.2859684757650365e-06, "loss": 0.3493, "step": 874 }, { "epoch": 1.3190126248351235, "grad_norm": 0.5488508237006199, "learning_rate": 6.261531424577923e-06, "loss": 0.3219, "step": 875 }, { "epoch": 1.320520067834935, "grad_norm": 0.5438041497270804, "learning_rate": 6.2371202980073596e-06, "loss": 0.3417, "step": 876 }, { "epoch": 1.3220275108347466, "grad_norm": 0.5400748419899576, "learning_rate": 6.212735265333655e-06, "loss": 0.3025, "step": 877 }, { "epoch": 1.323534953834558, "grad_norm": 0.5299843425249701, "learning_rate": 6.188376495656156e-06, "loss": 0.3374, "step": 878 }, { "epoch": 1.3250423968343696, "grad_norm": 0.5236709426443396, "learning_rate": 6.164044157892102e-06, "loss": 0.3221, "step": 879 }, { "epoch": 1.3265498398341813, "grad_norm": 0.5699051867060005, "learning_rate": 6.13973842077543e-06, "loss": 0.3226, "step": 880 }, { "epoch": 1.328057282833993, "grad_norm": 0.5655665319377791, "learning_rate": 6.11545945285561e-06, "loss": 0.3175, "step": 881 }, { "epoch": 1.3295647258338044, "grad_norm": 0.5387118438674878, "learning_rate": 6.091207422496489e-06, "loss": 0.3243, "step": 882 }, { "epoch": 1.331072168833616, "grad_norm": 0.5744706591584219, "learning_rate": 6.066982497875109e-06, "loss": 0.3286, "step": 883 }, { "epoch": 1.3325796118334274, "grad_norm": 0.542466249812019, "learning_rate": 6.042784846980542e-06, "loss": 0.3225, "step": 884 }, { "epoch": 1.334087054833239, "grad_norm": 0.5515488785701044, "learning_rate": 6.018614637612733e-06, "loss": 0.3238, "step": 885 }, { "epoch": 1.3355944978330507, "grad_norm": 0.5349896204088196, "learning_rate": 5.99447203738134e-06, "loss": 0.324, "step": 886 }, { "epoch": 1.3371019408328624, "grad_norm": 0.5371367792089301, "learning_rate": 5.9703572137045495e-06, "loss": 0.3369, "step": 887 }, { "epoch": 1.3386093838326738, "grad_norm": 0.5615358147993731, "learning_rate": 5.946270333807937e-06, "loss": 0.3052, "step": 888 }, { "epoch": 1.3401168268324855, "grad_norm": 0.5743727933679714, "learning_rate": 5.922211564723302e-06, "loss": 0.3455, "step": 889 }, { "epoch": 1.3416242698322969, "grad_norm": 0.5336291605723125, "learning_rate": 5.898181073287504e-06, "loss": 0.3226, "step": 890 }, { "epoch": 1.3431317128321085, "grad_norm": 0.5722163135210774, "learning_rate": 5.87417902614131e-06, "loss": 0.3646, "step": 891 }, { "epoch": 1.3446391558319202, "grad_norm": 0.5524401803992677, "learning_rate": 5.850205589728239e-06, "loss": 0.3016, "step": 892 }, { "epoch": 1.3461465988317316, "grad_norm": 0.5311906031167264, "learning_rate": 5.826260930293417e-06, "loss": 0.3174, "step": 893 }, { "epoch": 1.3476540418315432, "grad_norm": 0.5385817256109608, "learning_rate": 5.802345213882396e-06, "loss": 0.3447, "step": 894 }, { "epoch": 1.349161484831355, "grad_norm": 0.5443226270708521, "learning_rate": 5.778458606340037e-06, "loss": 0.3056, "step": 895 }, { "epoch": 1.3506689278311663, "grad_norm": 0.5247659222065347, "learning_rate": 5.754601273309333e-06, "loss": 0.3045, "step": 896 }, { "epoch": 1.352176370830978, "grad_norm": 0.5555702487315548, "learning_rate": 5.730773380230276e-06, "loss": 0.3186, "step": 897 }, { "epoch": 1.3536838138307896, "grad_norm": 0.5661524854903914, "learning_rate": 5.70697509233871e-06, "loss": 0.3248, "step": 898 }, { "epoch": 1.355191256830601, "grad_norm": 0.5284857763614461, "learning_rate": 5.683206574665165e-06, "loss": 0.3018, "step": 899 }, { "epoch": 1.3566986998304127, "grad_norm": 0.5740850180912764, "learning_rate": 5.6594679920337514e-06, "loss": 0.3529, "step": 900 }, { "epoch": 1.3582061428302241, "grad_norm": 0.5299086867590524, "learning_rate": 5.635759509060969e-06, "loss": 0.32, "step": 901 }, { "epoch": 1.3597135858300358, "grad_norm": 0.5330329949433913, "learning_rate": 5.612081290154607e-06, "loss": 0.3156, "step": 902 }, { "epoch": 1.3612210288298474, "grad_norm": 0.5347611903592508, "learning_rate": 5.58843349951258e-06, "loss": 0.3183, "step": 903 }, { "epoch": 1.362728471829659, "grad_norm": 0.5409021340662399, "learning_rate": 5.564816301121792e-06, "loss": 0.3411, "step": 904 }, { "epoch": 1.3642359148294705, "grad_norm": 0.5560565848550149, "learning_rate": 5.541229858757011e-06, "loss": 0.3508, "step": 905 }, { "epoch": 1.3657433578292821, "grad_norm": 0.5040665273430834, "learning_rate": 5.517674335979721e-06, "loss": 0.3038, "step": 906 }, { "epoch": 1.3672508008290936, "grad_norm": 0.5520505173652595, "learning_rate": 5.494149896136998e-06, "loss": 0.3342, "step": 907 }, { "epoch": 1.3687582438289052, "grad_norm": 0.5286100688050495, "learning_rate": 5.470656702360367e-06, "loss": 0.3051, "step": 908 }, { "epoch": 1.3702656868287169, "grad_norm": 0.5540464877346475, "learning_rate": 5.447194917564671e-06, "loss": 0.3327, "step": 909 }, { "epoch": 1.3717731298285283, "grad_norm": 0.5401690086723988, "learning_rate": 5.423764704446954e-06, "loss": 0.332, "step": 910 }, { "epoch": 1.37328057282834, "grad_norm": 0.5440262612621518, "learning_rate": 5.400366225485326e-06, "loss": 0.3326, "step": 911 }, { "epoch": 1.3747880158281516, "grad_norm": 0.5291318028597245, "learning_rate": 5.376999642937817e-06, "loss": 0.3262, "step": 912 }, { "epoch": 1.376295458827963, "grad_norm": 0.5361093139503608, "learning_rate": 5.353665118841296e-06, "loss": 0.3258, "step": 913 }, { "epoch": 1.3778029018277747, "grad_norm": 0.5442991814951846, "learning_rate": 5.330362815010306e-06, "loss": 0.3162, "step": 914 }, { "epoch": 1.3793103448275863, "grad_norm": 0.5384147655921361, "learning_rate": 5.307092893035951e-06, "loss": 0.3381, "step": 915 }, { "epoch": 1.3808177878273977, "grad_norm": 0.539100490777508, "learning_rate": 5.2838555142847925e-06, "loss": 0.3423, "step": 916 }, { "epoch": 1.3823252308272094, "grad_norm": 0.5427293981456651, "learning_rate": 5.260650839897719e-06, "loss": 0.3217, "step": 917 }, { "epoch": 1.3838326738270208, "grad_norm": 0.5726046303707281, "learning_rate": 5.237479030788817e-06, "loss": 0.3507, "step": 918 }, { "epoch": 1.3853401168268324, "grad_norm": 0.5527176893463295, "learning_rate": 5.214340247644278e-06, "loss": 0.331, "step": 919 }, { "epoch": 1.386847559826644, "grad_norm": 0.5481340240469819, "learning_rate": 5.191234650921273e-06, "loss": 0.318, "step": 920 }, { "epoch": 1.3883550028264557, "grad_norm": 0.5543962320620248, "learning_rate": 5.168162400846835e-06, "loss": 0.3155, "step": 921 }, { "epoch": 1.3898624458262672, "grad_norm": 0.5367203136891187, "learning_rate": 5.145123657416759e-06, "loss": 0.3326, "step": 922 }, { "epoch": 1.3913698888260788, "grad_norm": 0.5460167764137122, "learning_rate": 5.122118580394473e-06, "loss": 0.337, "step": 923 }, { "epoch": 1.3928773318258902, "grad_norm": 0.5195547700814616, "learning_rate": 5.099147329309959e-06, "loss": 0.326, "step": 924 }, { "epoch": 1.394384774825702, "grad_norm": 0.5456994955845843, "learning_rate": 5.076210063458622e-06, "loss": 0.3322, "step": 925 }, { "epoch": 1.3958922178255135, "grad_norm": 0.5843461937914468, "learning_rate": 5.0533069419002e-06, "loss": 0.339, "step": 926 }, { "epoch": 1.397399660825325, "grad_norm": 0.5150644731537505, "learning_rate": 5.030438123457655e-06, "loss": 0.2913, "step": 927 }, { "epoch": 1.3989071038251366, "grad_norm": 0.5258159746479392, "learning_rate": 5.007603766716063e-06, "loss": 0.3107, "step": 928 }, { "epoch": 1.4004145468249483, "grad_norm": 0.5748165660930256, "learning_rate": 4.984804030021533e-06, "loss": 0.3328, "step": 929 }, { "epoch": 1.4019219898247597, "grad_norm": 0.5630833950584739, "learning_rate": 4.962039071480102e-06, "loss": 0.3233, "step": 930 }, { "epoch": 1.4034294328245713, "grad_norm": 0.537736675931464, "learning_rate": 4.939309048956622e-06, "loss": 0.3451, "step": 931 }, { "epoch": 1.404936875824383, "grad_norm": 0.5230851918523695, "learning_rate": 4.9166141200736885e-06, "loss": 0.3389, "step": 932 }, { "epoch": 1.4064443188241944, "grad_norm": 0.5434274109432955, "learning_rate": 4.89395444221055e-06, "loss": 0.3189, "step": 933 }, { "epoch": 1.407951761824006, "grad_norm": 0.5467058284642171, "learning_rate": 4.871330172501979e-06, "loss": 0.3218, "step": 934 }, { "epoch": 1.4094592048238175, "grad_norm": 0.5643806859737526, "learning_rate": 4.848741467837228e-06, "loss": 0.339, "step": 935 }, { "epoch": 1.4109666478236291, "grad_norm": 0.5210588389675654, "learning_rate": 4.826188484858918e-06, "loss": 0.2865, "step": 936 }, { "epoch": 1.4124740908234408, "grad_norm": 0.5575173474168307, "learning_rate": 4.803671379961945e-06, "loss": 0.3448, "step": 937 }, { "epoch": 1.4139815338232524, "grad_norm": 0.5553202634668, "learning_rate": 4.781190309292421e-06, "loss": 0.318, "step": 938 }, { "epoch": 1.4154889768230638, "grad_norm": 0.5277473116633332, "learning_rate": 4.758745428746569e-06, "loss": 0.3047, "step": 939 }, { "epoch": 1.4169964198228755, "grad_norm": 0.5308118737562897, "learning_rate": 4.736336893969652e-06, "loss": 0.3126, "step": 940 }, { "epoch": 1.418503862822687, "grad_norm": 0.562016412444855, "learning_rate": 4.7139648603548925e-06, "loss": 0.3306, "step": 941 }, { "epoch": 1.4200113058224986, "grad_norm": 0.5112809802949265, "learning_rate": 4.691629483042387e-06, "loss": 0.2877, "step": 942 }, { "epoch": 1.4215187488223102, "grad_norm": 0.555811318693021, "learning_rate": 4.669330916918043e-06, "loss": 0.3346, "step": 943 }, { "epoch": 1.4230261918221216, "grad_norm": 0.5388925234150407, "learning_rate": 4.647069316612502e-06, "loss": 0.3137, "step": 944 }, { "epoch": 1.4245336348219333, "grad_norm": 0.5265475799399302, "learning_rate": 4.624844836500052e-06, "loss": 0.3162, "step": 945 }, { "epoch": 1.426041077821745, "grad_norm": 0.5112485522585755, "learning_rate": 4.60265763069758e-06, "loss": 0.2914, "step": 946 }, { "epoch": 1.4275485208215564, "grad_norm": 0.5285723749640436, "learning_rate": 4.580507853063487e-06, "loss": 0.3098, "step": 947 }, { "epoch": 1.429055963821368, "grad_norm": 0.5408160656578395, "learning_rate": 4.5583956571966295e-06, "loss": 0.3365, "step": 948 }, { "epoch": 1.4305634068211797, "grad_norm": 0.5598936258222863, "learning_rate": 4.5363211964352524e-06, "loss": 0.3292, "step": 949 }, { "epoch": 1.432070849820991, "grad_norm": 0.5180426805197446, "learning_rate": 4.514284623855915e-06, "loss": 0.3174, "step": 950 }, { "epoch": 1.4335782928208027, "grad_norm": 0.5639401953538313, "learning_rate": 4.4922860922724466e-06, "loss": 0.3617, "step": 951 }, { "epoch": 1.4350857358206142, "grad_norm": 0.5482846937319309, "learning_rate": 4.470325754234881e-06, "loss": 0.3256, "step": 952 }, { "epoch": 1.4365931788204258, "grad_norm": 0.530946653125974, "learning_rate": 4.448403762028391e-06, "loss": 0.3367, "step": 953 }, { "epoch": 1.4381006218202375, "grad_norm": 0.5630491613208096, "learning_rate": 4.426520267672244e-06, "loss": 0.33, "step": 954 }, { "epoch": 1.439608064820049, "grad_norm": 0.5281029541497921, "learning_rate": 4.40467542291874e-06, "loss": 0.3266, "step": 955 }, { "epoch": 1.4411155078198605, "grad_norm": 0.5134408808419982, "learning_rate": 4.382869379252152e-06, "loss": 0.3002, "step": 956 }, { "epoch": 1.4426229508196722, "grad_norm": 0.5379209815628555, "learning_rate": 4.361102287887698e-06, "loss": 0.3197, "step": 957 }, { "epoch": 1.4441303938194836, "grad_norm": 0.5118973398445678, "learning_rate": 4.339374299770477e-06, "loss": 0.316, "step": 958 }, { "epoch": 1.4456378368192953, "grad_norm": 0.5340927744773125, "learning_rate": 4.31768556557441e-06, "loss": 0.2995, "step": 959 }, { "epoch": 1.447145279819107, "grad_norm": 0.5574015013189253, "learning_rate": 4.296036235701235e-06, "loss": 0.3214, "step": 960 }, { "epoch": 1.4486527228189185, "grad_norm": 0.544283711827625, "learning_rate": 4.274426460279412e-06, "loss": 0.309, "step": 961 }, { "epoch": 1.45016016581873, "grad_norm": 0.5456093505990249, "learning_rate": 4.252856389163128e-06, "loss": 0.3056, "step": 962 }, { "epoch": 1.4516676088185416, "grad_norm": 0.5524390480774677, "learning_rate": 4.231326171931231e-06, "loss": 0.2988, "step": 963 }, { "epoch": 1.453175051818353, "grad_norm": 0.5357243619653109, "learning_rate": 4.209835957886196e-06, "loss": 0.3051, "step": 964 }, { "epoch": 1.4546824948181647, "grad_norm": 0.5567188365205857, "learning_rate": 4.188385896053098e-06, "loss": 0.3211, "step": 965 }, { "epoch": 1.4561899378179763, "grad_norm": 0.5571208137663407, "learning_rate": 4.166976135178575e-06, "loss": 0.3212, "step": 966 }, { "epoch": 1.4576973808177878, "grad_norm": 0.5288681574805124, "learning_rate": 4.1456068237297964e-06, "loss": 0.3247, "step": 967 }, { "epoch": 1.4592048238175994, "grad_norm": 0.5362640814930834, "learning_rate": 4.124278109893432e-06, "loss": 0.3206, "step": 968 }, { "epoch": 1.4607122668174108, "grad_norm": 0.5300069434968542, "learning_rate": 4.10299014157462e-06, "loss": 0.299, "step": 969 }, { "epoch": 1.4622197098172225, "grad_norm": 0.5207197137299924, "learning_rate": 4.0817430663959536e-06, "loss": 0.2872, "step": 970 }, { "epoch": 1.4637271528170341, "grad_norm": 0.5361880777046366, "learning_rate": 4.06053703169645e-06, "loss": 0.3432, "step": 971 }, { "epoch": 1.4652345958168458, "grad_norm": 0.5390277129867954, "learning_rate": 4.039372184530521e-06, "loss": 0.3121, "step": 972 }, { "epoch": 1.4667420388166572, "grad_norm": 0.5098624467494199, "learning_rate": 4.0182486716669656e-06, "loss": 0.3057, "step": 973 }, { "epoch": 1.4682494818164689, "grad_norm": 0.5530264319623549, "learning_rate": 3.9971666395879605e-06, "loss": 0.316, "step": 974 }, { "epoch": 1.4697569248162803, "grad_norm": 0.5648165554049958, "learning_rate": 3.9761262344880096e-06, "loss": 0.3456, "step": 975 }, { "epoch": 1.471264367816092, "grad_norm": 0.5376597362402104, "learning_rate": 3.9551276022729644e-06, "loss": 0.3075, "step": 976 }, { "epoch": 1.4727718108159036, "grad_norm": 0.5207214018679573, "learning_rate": 3.9341708885590034e-06, "loss": 0.3043, "step": 977 }, { "epoch": 1.4742792538157152, "grad_norm": 0.5346717454580582, "learning_rate": 3.913256238671607e-06, "loss": 0.3187, "step": 978 }, { "epoch": 1.4757866968155267, "grad_norm": 0.5474625757974195, "learning_rate": 3.89238379764457e-06, "loss": 0.3341, "step": 979 }, { "epoch": 1.4772941398153383, "grad_norm": 0.553265646517597, "learning_rate": 3.871553710218988e-06, "loss": 0.3615, "step": 980 }, { "epoch": 1.4788015828151497, "grad_norm": 0.5178190237398634, "learning_rate": 3.850766120842252e-06, "loss": 0.3087, "step": 981 }, { "epoch": 1.4803090258149614, "grad_norm": 0.5676605070232937, "learning_rate": 3.830021173667048e-06, "loss": 0.3331, "step": 982 }, { "epoch": 1.481816468814773, "grad_norm": 0.5366490741054173, "learning_rate": 3.809319012550352e-06, "loss": 0.3134, "step": 983 }, { "epoch": 1.4833239118145845, "grad_norm": 0.5237338303143243, "learning_rate": 3.788659781052444e-06, "loss": 0.3426, "step": 984 }, { "epoch": 1.484831354814396, "grad_norm": 0.5118568891202759, "learning_rate": 3.7680436224359084e-06, "loss": 0.3049, "step": 985 }, { "epoch": 1.4863387978142075, "grad_norm": 0.5474188971913913, "learning_rate": 3.747470679664624e-06, "loss": 0.3177, "step": 986 }, { "epoch": 1.4878462408140192, "grad_norm": 0.5623337896836218, "learning_rate": 3.7269410954028107e-06, "loss": 0.3268, "step": 987 }, { "epoch": 1.4893536838138308, "grad_norm": 0.5320249898828978, "learning_rate": 3.706455012013994e-06, "loss": 0.3135, "step": 988 }, { "epoch": 1.4908611268136425, "grad_norm": 0.5258630499196119, "learning_rate": 3.6860125715600513e-06, "loss": 0.2922, "step": 989 }, { "epoch": 1.492368569813454, "grad_norm": 0.5488691155290143, "learning_rate": 3.665613915800217e-06, "loss": 0.3093, "step": 990 }, { "epoch": 1.4938760128132655, "grad_norm": 0.534561267695021, "learning_rate": 3.6452591861900886e-06, "loss": 0.3201, "step": 991 }, { "epoch": 1.495383455813077, "grad_norm": 0.5493934402188156, "learning_rate": 3.6249485238806637e-06, "loss": 0.3258, "step": 992 }, { "epoch": 1.4968908988128886, "grad_norm": 0.5213745241077384, "learning_rate": 3.6046820697173514e-06, "loss": 0.3206, "step": 993 }, { "epoch": 1.4983983418127003, "grad_norm": 0.5189896345789112, "learning_rate": 3.5844599642389965e-06, "loss": 0.3093, "step": 994 }, { "epoch": 1.499905784812512, "grad_norm": 0.535438446636319, "learning_rate": 3.564282347676903e-06, "loss": 0.3449, "step": 995 }, { "epoch": 1.5014132278123233, "grad_norm": 0.5151601964534807, "learning_rate": 3.54414935995387e-06, "loss": 0.3002, "step": 996 }, { "epoch": 1.5029206708121348, "grad_norm": 0.5669837505751246, "learning_rate": 3.524061140683206e-06, "loss": 0.3367, "step": 997 }, { "epoch": 1.5044281138119464, "grad_norm": 0.5376128901605735, "learning_rate": 3.5040178291677816e-06, "loss": 0.3362, "step": 998 }, { "epoch": 1.505935556811758, "grad_norm": 0.5816182611830706, "learning_rate": 3.4840195643990383e-06, "loss": 0.316, "step": 999 }, { "epoch": 1.5074429998115697, "grad_norm": 0.5333548248485912, "learning_rate": 3.464066485056048e-06, "loss": 0.3223, "step": 1000 }, { "epoch": 1.5089504428113814, "grad_norm": 0.5574217586347463, "learning_rate": 3.444158729504549e-06, "loss": 0.2994, "step": 1001 }, { "epoch": 1.5104578858111928, "grad_norm": 0.5272699065186945, "learning_rate": 3.4242964357959597e-06, "loss": 0.3152, "step": 1002 }, { "epoch": 1.5119653288110042, "grad_norm": 0.5352889213452704, "learning_rate": 3.4044797416664564e-06, "loss": 0.3103, "step": 1003 }, { "epoch": 1.5134727718108159, "grad_norm": 0.5602956634920077, "learning_rate": 3.3847087845359996e-06, "loss": 0.334, "step": 1004 }, { "epoch": 1.5149802148106275, "grad_norm": 0.5402201680847467, "learning_rate": 3.364983701507376e-06, "loss": 0.3291, "step": 1005 }, { "epoch": 1.5164876578104391, "grad_norm": 0.5526297524617138, "learning_rate": 3.3453046293652657e-06, "loss": 0.3232, "step": 1006 }, { "epoch": 1.5179951008102506, "grad_norm": 0.5401106392320315, "learning_rate": 3.3256717045752794e-06, "loss": 0.3219, "step": 1007 }, { "epoch": 1.5195025438100622, "grad_norm": 0.5589978845369276, "learning_rate": 3.3060850632830167e-06, "loss": 0.3215, "step": 1008 }, { "epoch": 1.5210099868098736, "grad_norm": 0.5628427903081042, "learning_rate": 3.286544841313126e-06, "loss": 0.3042, "step": 1009 }, { "epoch": 1.5225174298096853, "grad_norm": 0.5291974603976658, "learning_rate": 3.2670511741683475e-06, "loss": 0.3039, "step": 1010 }, { "epoch": 1.524024872809497, "grad_norm": 0.5307771510625195, "learning_rate": 3.2476041970285945e-06, "loss": 0.3225, "step": 1011 }, { "epoch": 1.5255323158093086, "grad_norm": 0.5798408138665074, "learning_rate": 3.2282040447500063e-06, "loss": 0.3574, "step": 1012 }, { "epoch": 1.52703975880912, "grad_norm": 0.5262954379509106, "learning_rate": 3.208850851863998e-06, "loss": 0.3074, "step": 1013 }, { "epoch": 1.5285472018089314, "grad_norm": 0.5411329822808086, "learning_rate": 3.189544752576369e-06, "loss": 0.3291, "step": 1014 }, { "epoch": 1.530054644808743, "grad_norm": 0.512225504454937, "learning_rate": 3.1702858807663175e-06, "loss": 0.2967, "step": 1015 }, { "epoch": 1.5315620878085547, "grad_norm": 0.5358326980021074, "learning_rate": 3.151074369985556e-06, "loss": 0.3281, "step": 1016 }, { "epoch": 1.5330695308083664, "grad_norm": 0.5412115741377782, "learning_rate": 3.131910353457369e-06, "loss": 0.3321, "step": 1017 }, { "epoch": 1.534576973808178, "grad_norm": 0.5355770866583788, "learning_rate": 3.112793964075681e-06, "loss": 0.2999, "step": 1018 }, { "epoch": 1.5360844168079895, "grad_norm": 0.53044658803981, "learning_rate": 3.0937253344041507e-06, "loss": 0.3271, "step": 1019 }, { "epoch": 1.5375918598078009, "grad_norm": 0.5274519029189704, "learning_rate": 3.074704596675242e-06, "loss": 0.3174, "step": 1020 }, { "epoch": 1.5390993028076125, "grad_norm": 0.5410440028748313, "learning_rate": 3.055731882789311e-06, "loss": 0.3268, "step": 1021 }, { "epoch": 1.5406067458074242, "grad_norm": 0.5400373191606019, "learning_rate": 3.0368073243136874e-06, "loss": 0.325, "step": 1022 }, { "epoch": 1.5421141888072358, "grad_norm": 0.5520146881220487, "learning_rate": 3.0179310524817707e-06, "loss": 0.32, "step": 1023 }, { "epoch": 1.5436216318070473, "grad_norm": 0.5351759755594372, "learning_rate": 2.9991031981921026e-06, "loss": 0.3269, "step": 1024 }, { "epoch": 1.545129074806859, "grad_norm": 0.5327689357374978, "learning_rate": 2.9803238920074784e-06, "loss": 0.3082, "step": 1025 }, { "epoch": 1.5466365178066703, "grad_norm": 0.5326589519994432, "learning_rate": 2.961593264154038e-06, "loss": 0.3157, "step": 1026 }, { "epoch": 1.548143960806482, "grad_norm": 0.5249678984746798, "learning_rate": 2.9429114445203423e-06, "loss": 0.3119, "step": 1027 }, { "epoch": 1.5496514038062936, "grad_norm": 0.5606341770532942, "learning_rate": 2.924278562656514e-06, "loss": 0.3351, "step": 1028 }, { "epoch": 1.5511588468061053, "grad_norm": 0.5156779633424778, "learning_rate": 2.90569474777329e-06, "loss": 0.3256, "step": 1029 }, { "epoch": 1.5526662898059167, "grad_norm": 0.5374626464876353, "learning_rate": 2.8871601287411634e-06, "loss": 0.3303, "step": 1030 }, { "epoch": 1.5541737328057281, "grad_norm": 0.5262890780017794, "learning_rate": 2.8686748340894744e-06, "loss": 0.3114, "step": 1031 }, { "epoch": 1.5556811758055398, "grad_norm": 0.5260995538471516, "learning_rate": 2.850238992005514e-06, "loss": 0.2979, "step": 1032 }, { "epoch": 1.5571886188053514, "grad_norm": 0.5573768187241204, "learning_rate": 2.8318527303336465e-06, "loss": 0.3475, "step": 1033 }, { "epoch": 1.558696061805163, "grad_norm": 0.5542071850347167, "learning_rate": 2.81351617657442e-06, "loss": 0.3359, "step": 1034 }, { "epoch": 1.5602035048049747, "grad_norm": 0.5376949139111594, "learning_rate": 2.795229457883678e-06, "loss": 0.3299, "step": 1035 }, { "epoch": 1.5617109478047861, "grad_norm": 0.5213690515169962, "learning_rate": 2.7769927010716814e-06, "loss": 0.3187, "step": 1036 }, { "epoch": 1.5632183908045976, "grad_norm": 0.5282868034684867, "learning_rate": 2.7588060326022205e-06, "loss": 0.3124, "step": 1037 }, { "epoch": 1.5647258338044092, "grad_norm": 0.5629003594886647, "learning_rate": 2.740669578591755e-06, "loss": 0.3453, "step": 1038 }, { "epoch": 1.5662332768042209, "grad_norm": 0.5452741440289394, "learning_rate": 2.7225834648085282e-06, "loss": 0.3148, "step": 1039 }, { "epoch": 1.5677407198040325, "grad_norm": 0.5576848374307647, "learning_rate": 2.7045478166716843e-06, "loss": 0.3362, "step": 1040 }, { "epoch": 1.569248162803844, "grad_norm": 0.5230478868120295, "learning_rate": 2.6865627592504295e-06, "loss": 0.3074, "step": 1041 }, { "epoch": 1.5707556058036556, "grad_norm": 0.5476529275243367, "learning_rate": 2.668628417263137e-06, "loss": 0.314, "step": 1042 }, { "epoch": 1.572263048803467, "grad_norm": 0.5340674210452238, "learning_rate": 2.6507449150764852e-06, "loss": 0.3035, "step": 1043 }, { "epoch": 1.5737704918032787, "grad_norm": 0.5245422709481129, "learning_rate": 2.632912376704607e-06, "loss": 0.3344, "step": 1044 }, { "epoch": 1.5752779348030903, "grad_norm": 0.52661514981572, "learning_rate": 2.615130925808228e-06, "loss": 0.3054, "step": 1045 }, { "epoch": 1.576785377802902, "grad_norm": 0.5147590149467712, "learning_rate": 2.597400685693795e-06, "loss": 0.2879, "step": 1046 }, { "epoch": 1.5782928208027134, "grad_norm": 0.5476707768783776, "learning_rate": 2.5797217793126373e-06, "loss": 0.3395, "step": 1047 }, { "epoch": 1.5798002638025248, "grad_norm": 0.5338841668417198, "learning_rate": 2.5620943292601074e-06, "loss": 0.3211, "step": 1048 }, { "epoch": 1.5813077068023365, "grad_norm": 0.526860653464564, "learning_rate": 2.5445184577747305e-06, "loss": 0.3251, "step": 1049 }, { "epoch": 1.582815149802148, "grad_norm": 0.5250152267933532, "learning_rate": 2.52699428673736e-06, "loss": 0.3126, "step": 1050 }, { "epoch": 1.5843225928019598, "grad_norm": 0.5470636297967526, "learning_rate": 2.5095219376703183e-06, "loss": 0.3063, "step": 1051 }, { "epoch": 1.5858300358017714, "grad_norm": 0.5581707395933467, "learning_rate": 2.4921015317365794e-06, "loss": 0.3624, "step": 1052 }, { "epoch": 1.5873374788015828, "grad_norm": 0.5280009933911688, "learning_rate": 2.4747331897389103e-06, "loss": 0.3106, "step": 1053 }, { "epoch": 1.5888449218013942, "grad_norm": 0.5234869653748981, "learning_rate": 2.4574170321190305e-06, "loss": 0.2956, "step": 1054 }, { "epoch": 1.590352364801206, "grad_norm": 0.546217705596414, "learning_rate": 2.440153178956798e-06, "loss": 0.3215, "step": 1055 }, { "epoch": 1.5918598078010175, "grad_norm": 0.5556302525952723, "learning_rate": 2.42294174996935e-06, "loss": 0.3204, "step": 1056 }, { "epoch": 1.5933672508008292, "grad_norm": 0.5588880844097838, "learning_rate": 2.40578286451029e-06, "loss": 0.3282, "step": 1057 }, { "epoch": 1.5948746938006406, "grad_norm": 0.5241614280996468, "learning_rate": 2.38867664156886e-06, "loss": 0.3255, "step": 1058 }, { "epoch": 1.5963821368004523, "grad_norm": 0.5543274849783603, "learning_rate": 2.3716231997691007e-06, "loss": 0.3175, "step": 1059 }, { "epoch": 1.5978895798002637, "grad_norm": 0.5306578564545272, "learning_rate": 2.3546226573690444e-06, "loss": 0.3211, "step": 1060 }, { "epoch": 1.5993970228000753, "grad_norm": 0.5401209566379707, "learning_rate": 2.3376751322599e-06, "loss": 0.3117, "step": 1061 }, { "epoch": 1.600904465799887, "grad_norm": 0.5339229576030943, "learning_rate": 2.320780741965206e-06, "loss": 0.3064, "step": 1062 }, { "epoch": 1.6024119087996986, "grad_norm": 0.5291570037477905, "learning_rate": 2.3039396036400463e-06, "loss": 0.3001, "step": 1063 }, { "epoch": 1.60391935179951, "grad_norm": 0.5544131085966325, "learning_rate": 2.287151834070226e-06, "loss": 0.3173, "step": 1064 }, { "epoch": 1.6054267947993215, "grad_norm": 0.5042273491393638, "learning_rate": 2.2704175496714552e-06, "loss": 0.3035, "step": 1065 }, { "epoch": 1.6069342377991331, "grad_norm": 0.5164264296676705, "learning_rate": 2.2537368664885527e-06, "loss": 0.306, "step": 1066 }, { "epoch": 1.6084416807989448, "grad_norm": 0.540939444102417, "learning_rate": 2.2371099001946385e-06, "loss": 0.3417, "step": 1067 }, { "epoch": 1.6099491237987564, "grad_norm": 0.5349172500611197, "learning_rate": 2.2205367660903267e-06, "loss": 0.3155, "step": 1068 }, { "epoch": 1.611456566798568, "grad_norm": 0.5392150017492342, "learning_rate": 2.2040175791029305e-06, "loss": 0.334, "step": 1069 }, { "epoch": 1.6129640097983795, "grad_norm": 0.5420224175155496, "learning_rate": 2.187552453785662e-06, "loss": 0.2981, "step": 1070 }, { "epoch": 1.614471452798191, "grad_norm": 0.5385758816342323, "learning_rate": 2.1711415043168395e-06, "loss": 0.3313, "step": 1071 }, { "epoch": 1.6159788957980026, "grad_norm": 0.5437131207841849, "learning_rate": 2.1547848444991025e-06, "loss": 0.3352, "step": 1072 }, { "epoch": 1.6174863387978142, "grad_norm": 0.5395621366503963, "learning_rate": 2.138482587758605e-06, "loss": 0.308, "step": 1073 }, { "epoch": 1.6189937817976259, "grad_norm": 0.5255773037738669, "learning_rate": 2.1222348471442477e-06, "loss": 0.3014, "step": 1074 }, { "epoch": 1.6205012247974373, "grad_norm": 0.5226567653631905, "learning_rate": 2.1060417353268845e-06, "loss": 0.3143, "step": 1075 }, { "epoch": 1.622008667797249, "grad_norm": 0.5449482690998529, "learning_rate": 2.0899033645985423e-06, "loss": 0.3091, "step": 1076 }, { "epoch": 1.6235161107970604, "grad_norm": 0.5424238529202222, "learning_rate": 2.073819846871646e-06, "loss": 0.3185, "step": 1077 }, { "epoch": 1.625023553796872, "grad_norm": 0.5311225228497766, "learning_rate": 2.0577912936782317e-06, "loss": 0.2983, "step": 1078 }, { "epoch": 1.6265309967966837, "grad_norm": 0.5208053420833654, "learning_rate": 2.041817816169187e-06, "loss": 0.3295, "step": 1079 }, { "epoch": 1.6280384397964953, "grad_norm": 0.5269145283569221, "learning_rate": 2.025899525113474e-06, "loss": 0.3026, "step": 1080 }, { "epoch": 1.6295458827963067, "grad_norm": 0.5371643301644231, "learning_rate": 2.010036530897359e-06, "loss": 0.3196, "step": 1081 }, { "epoch": 1.6310533257961182, "grad_norm": 0.5349956576564423, "learning_rate": 1.9942289435236506e-06, "loss": 0.3304, "step": 1082 }, { "epoch": 1.6325607687959298, "grad_norm": 0.5604150538269126, "learning_rate": 1.978476872610939e-06, "loss": 0.3485, "step": 1083 }, { "epoch": 1.6340682117957415, "grad_norm": 0.5245207862200475, "learning_rate": 1.962780427392823e-06, "loss": 0.2882, "step": 1084 }, { "epoch": 1.6355756547955531, "grad_norm": 0.5195585896558206, "learning_rate": 1.9471397167171714e-06, "loss": 0.3051, "step": 1085 }, { "epoch": 1.6370830977953648, "grad_norm": 0.5147063039454572, "learning_rate": 1.931554849045355e-06, "loss": 0.3078, "step": 1086 }, { "epoch": 1.6385905407951762, "grad_norm": 0.5303520825987951, "learning_rate": 1.916025932451493e-06, "loss": 0.3141, "step": 1087 }, { "epoch": 1.6400979837949876, "grad_norm": 0.5154838630662848, "learning_rate": 1.9005530746217238e-06, "loss": 0.2971, "step": 1088 }, { "epoch": 1.6416054267947993, "grad_norm": 0.5537432078636199, "learning_rate": 1.8851363828534253e-06, "loss": 0.3124, "step": 1089 }, { "epoch": 1.643112869794611, "grad_norm": 0.5634336334894083, "learning_rate": 1.869775964054501e-06, "loss": 0.3271, "step": 1090 }, { "epoch": 1.6446203127944226, "grad_norm": 0.5433031560068617, "learning_rate": 1.8544719247426224e-06, "loss": 0.3191, "step": 1091 }, { "epoch": 1.646127755794234, "grad_norm": 0.5357448136347239, "learning_rate": 1.8392243710444911e-06, "loss": 0.2982, "step": 1092 }, { "epoch": 1.6476351987940456, "grad_norm": 0.5552897165798768, "learning_rate": 1.8240334086951117e-06, "loss": 0.3537, "step": 1093 }, { "epoch": 1.649142641793857, "grad_norm": 0.5318934621576651, "learning_rate": 1.8088991430370506e-06, "loss": 0.3005, "step": 1094 }, { "epoch": 1.6506500847936687, "grad_norm": 0.5465559179605479, "learning_rate": 1.7938216790197071e-06, "loss": 0.3207, "step": 1095 }, { "epoch": 1.6521575277934804, "grad_norm": 0.5641671337079456, "learning_rate": 1.77880112119859e-06, "loss": 0.3095, "step": 1096 }, { "epoch": 1.653664970793292, "grad_norm": 0.5270236586496325, "learning_rate": 1.7638375737345804e-06, "loss": 0.312, "step": 1097 }, { "epoch": 1.6551724137931034, "grad_norm": 0.5192997218773957, "learning_rate": 1.7489311403932274e-06, "loss": 0.2937, "step": 1098 }, { "epoch": 1.656679856792915, "grad_norm": 0.5620286825583494, "learning_rate": 1.7340819245440166e-06, "loss": 0.3186, "step": 1099 }, { "epoch": 1.6581872997927265, "grad_norm": 0.5445642066374056, "learning_rate": 1.7192900291596493e-06, "loss": 0.3222, "step": 1100 }, { "epoch": 1.6596947427925381, "grad_norm": 0.5157869374514513, "learning_rate": 1.7045555568153415e-06, "loss": 0.306, "step": 1101 }, { "epoch": 1.6612021857923498, "grad_norm": 0.5138381277919514, "learning_rate": 1.6898786096881104e-06, "loss": 0.2715, "step": 1102 }, { "epoch": 1.6627096287921614, "grad_norm": 0.5431676779116132, "learning_rate": 1.6752592895560493e-06, "loss": 0.3202, "step": 1103 }, { "epoch": 1.6642170717919729, "grad_norm": 0.5396928068758252, "learning_rate": 1.6606976977976408e-06, "loss": 0.3122, "step": 1104 }, { "epoch": 1.6657245147917843, "grad_norm": 0.5332820215409003, "learning_rate": 1.6461939353910494e-06, "loss": 0.3083, "step": 1105 }, { "epoch": 1.667231957791596, "grad_norm": 0.5475766503326961, "learning_rate": 1.631748102913412e-06, "loss": 0.3166, "step": 1106 }, { "epoch": 1.6687394007914076, "grad_norm": 0.5416380330717204, "learning_rate": 1.6173603005401505e-06, "loss": 0.3158, "step": 1107 }, { "epoch": 1.6702468437912192, "grad_norm": 0.5416213794269614, "learning_rate": 1.6030306280442764e-06, "loss": 0.3077, "step": 1108 }, { "epoch": 1.6717542867910307, "grad_norm": 0.5153535761957841, "learning_rate": 1.588759184795694e-06, "loss": 0.3064, "step": 1109 }, { "epoch": 1.6732617297908423, "grad_norm": 0.5315610226872074, "learning_rate": 1.574546069760514e-06, "loss": 0.3241, "step": 1110 }, { "epoch": 1.6747691727906537, "grad_norm": 0.5684878986820251, "learning_rate": 1.5603913815003634e-06, "loss": 0.3403, "step": 1111 }, { "epoch": 1.6762766157904654, "grad_norm": 0.5361918937068931, "learning_rate": 1.5462952181717117e-06, "loss": 0.3157, "step": 1112 }, { "epoch": 1.677784058790277, "grad_norm": 0.5495570916809654, "learning_rate": 1.532257677525183e-06, "loss": 0.3224, "step": 1113 }, { "epoch": 1.6792915017900887, "grad_norm": 0.5281943000093583, "learning_rate": 1.5182788569048689e-06, "loss": 0.3209, "step": 1114 }, { "epoch": 1.6807989447899, "grad_norm": 0.5572787989841019, "learning_rate": 1.5043588532476827e-06, "loss": 0.3663, "step": 1115 }, { "epoch": 1.6823063877897118, "grad_norm": 0.5228968415248135, "learning_rate": 1.49049776308265e-06, "loss": 0.2889, "step": 1116 }, { "epoch": 1.6838138307895232, "grad_norm": 0.5220477714238411, "learning_rate": 1.476695682530268e-06, "loss": 0.3031, "step": 1117 }, { "epoch": 1.6853212737893348, "grad_norm": 0.5351071420566498, "learning_rate": 1.4629527073018267e-06, "loss": 0.3308, "step": 1118 }, { "epoch": 1.6868287167891465, "grad_norm": 0.5396208371722178, "learning_rate": 1.449268932698743e-06, "loss": 0.2971, "step": 1119 }, { "epoch": 1.6883361597889581, "grad_norm": 0.5416569763589452, "learning_rate": 1.4356444536119085e-06, "loss": 0.3024, "step": 1120 }, { "epoch": 1.6898436027887695, "grad_norm": 0.5370439069377987, "learning_rate": 1.422079364521024e-06, "loss": 0.3169, "step": 1121 }, { "epoch": 1.691351045788581, "grad_norm": 0.5450238281058462, "learning_rate": 1.4085737594939497e-06, "loss": 0.333, "step": 1122 }, { "epoch": 1.6928584887883926, "grad_norm": 0.5168406644621856, "learning_rate": 1.3951277321860468e-06, "loss": 0.3006, "step": 1123 }, { "epoch": 1.6943659317882043, "grad_norm": 0.5087028192552481, "learning_rate": 1.381741375839537e-06, "loss": 0.2664, "step": 1124 }, { "epoch": 1.695873374788016, "grad_norm": 0.5165999383002566, "learning_rate": 1.3684147832828409e-06, "loss": 0.281, "step": 1125 }, { "epoch": 1.6973808177878273, "grad_norm": 0.5150348541332692, "learning_rate": 1.355148046929956e-06, "loss": 0.307, "step": 1126 }, { "epoch": 1.698888260787639, "grad_norm": 0.5168141041103775, "learning_rate": 1.3419412587797908e-06, "loss": 0.293, "step": 1127 }, { "epoch": 1.7003957037874504, "grad_norm": 0.5133973098786774, "learning_rate": 1.3287945104155487e-06, "loss": 0.3015, "step": 1128 }, { "epoch": 1.701903146787262, "grad_norm": 0.5513676691974454, "learning_rate": 1.3157078930040856e-06, "loss": 0.3179, "step": 1129 }, { "epoch": 1.7034105897870737, "grad_norm": 0.546503387891844, "learning_rate": 1.3026814972952674e-06, "loss": 0.3043, "step": 1130 }, { "epoch": 1.7049180327868854, "grad_norm": 0.5133928571679112, "learning_rate": 1.2897154136213542e-06, "loss": 0.2938, "step": 1131 }, { "epoch": 1.7064254757866968, "grad_norm": 0.5441429881648664, "learning_rate": 1.2768097318963701e-06, "loss": 0.2978, "step": 1132 }, { "epoch": 1.7079329187865084, "grad_norm": 0.5430821498300733, "learning_rate": 1.2639645416154744e-06, "loss": 0.3204, "step": 1133 }, { "epoch": 1.7094403617863199, "grad_norm": 0.5536392631462185, "learning_rate": 1.2511799318543493e-06, "loss": 0.3196, "step": 1134 }, { "epoch": 1.7109478047861315, "grad_norm": 0.5403946840340975, "learning_rate": 1.2384559912685768e-06, "loss": 0.3156, "step": 1135 }, { "epoch": 1.7124552477859432, "grad_norm": 0.5360872549447121, "learning_rate": 1.2257928080930236e-06, "loss": 0.3275, "step": 1136 }, { "epoch": 1.7139626907857548, "grad_norm": 0.5482125645711281, "learning_rate": 1.2131904701412345e-06, "loss": 0.3041, "step": 1137 }, { "epoch": 1.7154701337855662, "grad_norm": 0.5497152879102077, "learning_rate": 1.2006490648048118e-06, "loss": 0.3148, "step": 1138 }, { "epoch": 1.7169775767853777, "grad_norm": 0.5141035384965538, "learning_rate": 1.1881686790528279e-06, "loss": 0.3068, "step": 1139 }, { "epoch": 1.7184850197851893, "grad_norm": 0.529010142033819, "learning_rate": 1.1757493994312052e-06, "loss": 0.3088, "step": 1140 }, { "epoch": 1.719992462785001, "grad_norm": 0.5495066854090749, "learning_rate": 1.1633913120621188e-06, "loss": 0.3236, "step": 1141 }, { "epoch": 1.7214999057848126, "grad_norm": 0.5492447937245492, "learning_rate": 1.151094502643414e-06, "loss": 0.3308, "step": 1142 }, { "epoch": 1.723007348784624, "grad_norm": 0.5562630282965828, "learning_rate": 1.1388590564479895e-06, "loss": 0.299, "step": 1143 }, { "epoch": 1.7245147917844357, "grad_norm": 0.5231751271166386, "learning_rate": 1.1266850583232224e-06, "loss": 0.3053, "step": 1144 }, { "epoch": 1.726022234784247, "grad_norm": 0.5201419013982067, "learning_rate": 1.1145725926903772e-06, "loss": 0.3023, "step": 1145 }, { "epoch": 1.7275296777840587, "grad_norm": 0.511856980805046, "learning_rate": 1.1025217435440116e-06, "loss": 0.2867, "step": 1146 }, { "epoch": 1.7290371207838704, "grad_norm": 0.565983997374927, "learning_rate": 1.0905325944514034e-06, "loss": 0.3232, "step": 1147 }, { "epoch": 1.730544563783682, "grad_norm": 0.5363045072212188, "learning_rate": 1.078605228551971e-06, "loss": 0.3182, "step": 1148 }, { "epoch": 1.7320520067834935, "grad_norm": 0.5329886787330583, "learning_rate": 1.0667397285566893e-06, "loss": 0.3061, "step": 1149 }, { "epoch": 1.7335594497833051, "grad_norm": 0.5397864180847504, "learning_rate": 1.0549361767475241e-06, "loss": 0.2873, "step": 1150 }, { "epoch": 1.7350668927831165, "grad_norm": 0.5436027401118747, "learning_rate": 1.0431946549768567e-06, "loss": 0.3213, "step": 1151 }, { "epoch": 1.7365743357829282, "grad_norm": 0.5304426225729307, "learning_rate": 1.0315152446669142e-06, "loss": 0.295, "step": 1152 }, { "epoch": 1.7380817787827398, "grad_norm": 0.5119724476906113, "learning_rate": 1.019898026809214e-06, "loss": 0.3009, "step": 1153 }, { "epoch": 1.7395892217825515, "grad_norm": 0.5399752438286287, "learning_rate": 1.0083430819639962e-06, "loss": 0.3097, "step": 1154 }, { "epoch": 1.741096664782363, "grad_norm": 0.5329124149971953, "learning_rate": 9.968504902596566e-07, "loss": 0.3094, "step": 1155 }, { "epoch": 1.7426041077821743, "grad_norm": 0.5265575018375785, "learning_rate": 9.85420331392214e-07, "loss": 0.3001, "step": 1156 }, { "epoch": 1.744111550781986, "grad_norm": 0.5415027063140824, "learning_rate": 9.74052684624731e-07, "loss": 0.3052, "step": 1157 }, { "epoch": 1.7456189937817976, "grad_norm": 0.5273083269054069, "learning_rate": 9.62747628786782e-07, "loss": 0.2918, "step": 1158 }, { "epoch": 1.7471264367816093, "grad_norm": 0.5467675396074031, "learning_rate": 9.515052422739035e-07, "loss": 0.3013, "step": 1159 }, { "epoch": 1.748633879781421, "grad_norm": 0.5012647001758278, "learning_rate": 9.403256030470386e-07, "loss": 0.2922, "step": 1160 }, { "epoch": 1.7501413227812324, "grad_norm": 0.5473662670334606, "learning_rate": 9.292087886320166e-07, "loss": 0.3275, "step": 1161 }, { "epoch": 1.7516487657810438, "grad_norm": 0.5119711645632107, "learning_rate": 9.181548761189996e-07, "loss": 0.2997, "step": 1162 }, { "epoch": 1.7531562087808554, "grad_norm": 0.5579153837729429, "learning_rate": 9.071639421619527e-07, "loss": 0.3373, "step": 1163 }, { "epoch": 1.754663651780667, "grad_norm": 0.5369066551498206, "learning_rate": 8.962360629781164e-07, "loss": 0.3013, "step": 1164 }, { "epoch": 1.7561710947804787, "grad_norm": 0.5299407293801213, "learning_rate": 8.853713143474685e-07, "loss": 0.2977, "step": 1165 }, { "epoch": 1.7576785377802902, "grad_norm": 0.5246411563266595, "learning_rate": 8.745697716122081e-07, "loss": 0.3192, "step": 1166 }, { "epoch": 1.7591859807801018, "grad_norm": 0.5450557723814945, "learning_rate": 8.638315096762318e-07, "loss": 0.3075, "step": 1167 }, { "epoch": 1.7606934237799132, "grad_norm": 0.5392032704205785, "learning_rate": 8.531566030046035e-07, "loss": 0.3231, "step": 1168 }, { "epoch": 1.7622008667797249, "grad_norm": 0.5370373229347402, "learning_rate": 8.425451256230588e-07, "loss": 0.3012, "step": 1169 }, { "epoch": 1.7637083097795365, "grad_norm": 0.5405109358545394, "learning_rate": 8.319971511174718e-07, "loss": 0.3165, "step": 1170 }, { "epoch": 1.7652157527793482, "grad_norm": 0.528010212076121, "learning_rate": 8.215127526333499e-07, "loss": 0.3236, "step": 1171 }, { "epoch": 1.7667231957791596, "grad_norm": 0.5226712118154457, "learning_rate": 8.110920028753355e-07, "loss": 0.3088, "step": 1172 }, { "epoch": 1.768230638778971, "grad_norm": 0.5574504221106463, "learning_rate": 8.007349741066939e-07, "loss": 0.3258, "step": 1173 }, { "epoch": 1.7697380817787827, "grad_norm": 0.5452638755092976, "learning_rate": 7.904417381488083e-07, "loss": 0.3167, "step": 1174 }, { "epoch": 1.7712455247785943, "grad_norm": 0.5517680932401637, "learning_rate": 7.802123663806938e-07, "loss": 0.3227, "step": 1175 }, { "epoch": 1.772752967778406, "grad_norm": 0.5480833770797733, "learning_rate": 7.700469297384927e-07, "loss": 0.3307, "step": 1176 }, { "epoch": 1.7742604107782176, "grad_norm": 0.5430614726412718, "learning_rate": 7.599454987149868e-07, "loss": 0.3463, "step": 1177 }, { "epoch": 1.775767853778029, "grad_norm": 0.549255722400039, "learning_rate": 7.499081433591071e-07, "loss": 0.3284, "step": 1178 }, { "epoch": 1.7772752967778405, "grad_norm": 0.5410909371721678, "learning_rate": 7.399349332754458e-07, "loss": 0.315, "step": 1179 }, { "epoch": 1.778782739777652, "grad_norm": 0.530772119547457, "learning_rate": 7.300259376237795e-07, "loss": 0.3101, "step": 1180 }, { "epoch": 1.7802901827774638, "grad_norm": 0.5484763677836378, "learning_rate": 7.201812251185869e-07, "loss": 0.3192, "step": 1181 }, { "epoch": 1.7817976257772754, "grad_norm": 0.5299690672941552, "learning_rate": 7.104008640285642e-07, "loss": 0.3115, "step": 1182 }, { "epoch": 1.7833050687770868, "grad_norm": 0.5424348226189657, "learning_rate": 7.006849221761736e-07, "loss": 0.3119, "step": 1183 }, { "epoch": 1.7848125117768985, "grad_norm": 0.5221149596372863, "learning_rate": 6.910334669371433e-07, "loss": 0.3078, "step": 1184 }, { "epoch": 1.78631995477671, "grad_norm": 0.5581110749980865, "learning_rate": 6.814465652400237e-07, "loss": 0.3364, "step": 1185 }, { "epoch": 1.7878273977765216, "grad_norm": 0.5227271898985753, "learning_rate": 6.719242835657147e-07, "loss": 0.3057, "step": 1186 }, { "epoch": 1.7893348407763332, "grad_norm": 0.5541663205023336, "learning_rate": 6.62466687947001e-07, "loss": 0.335, "step": 1187 }, { "epoch": 1.7908422837761448, "grad_norm": 0.5269336686543489, "learning_rate": 6.530738439681017e-07, "loss": 0.3151, "step": 1188 }, { "epoch": 1.7923497267759563, "grad_norm": 0.5786593133724774, "learning_rate": 6.437458167642164e-07, "loss": 0.3366, "step": 1189 }, { "epoch": 1.7938571697757677, "grad_norm": 0.5253461874134103, "learning_rate": 6.344826710210584e-07, "loss": 0.3127, "step": 1190 }, { "epoch": 1.7953646127755793, "grad_norm": 0.5299856089834871, "learning_rate": 6.252844709744255e-07, "loss": 0.3135, "step": 1191 }, { "epoch": 1.796872055775391, "grad_norm": 0.521456043233, "learning_rate": 6.161512804097436e-07, "loss": 0.2977, "step": 1192 }, { "epoch": 1.7983794987752026, "grad_norm": 0.5469215546867617, "learning_rate": 6.070831626616236e-07, "loss": 0.3255, "step": 1193 }, { "epoch": 1.7998869417750143, "grad_norm": 0.5228672904607026, "learning_rate": 5.980801806134318e-07, "loss": 0.2866, "step": 1194 }, { "epoch": 1.8013943847748257, "grad_norm": 0.5146912409802952, "learning_rate": 5.891423966968413e-07, "loss": 0.2881, "step": 1195 }, { "epoch": 1.8029018277746371, "grad_norm": 0.5474095770320819, "learning_rate": 5.80269872891408e-07, "loss": 0.3036, "step": 1196 }, { "epoch": 1.8044092707744488, "grad_norm": 0.5298374503272721, "learning_rate": 5.714626707241411e-07, "loss": 0.3032, "step": 1197 }, { "epoch": 1.8059167137742604, "grad_norm": 0.5577685295614593, "learning_rate": 5.627208512690641e-07, "loss": 0.3136, "step": 1198 }, { "epoch": 1.807424156774072, "grad_norm": 0.5641313668289314, "learning_rate": 5.5404447514681e-07, "loss": 0.3057, "step": 1199 }, { "epoch": 1.8089315997738835, "grad_norm": 0.5295651592847044, "learning_rate": 5.45433602524188e-07, "loss": 0.292, "step": 1200 }, { "epoch": 1.8104390427736952, "grad_norm": 0.5164790354946905, "learning_rate": 5.368882931137675e-07, "loss": 0.3171, "step": 1201 }, { "epoch": 1.8119464857735066, "grad_norm": 0.5440394178727653, "learning_rate": 5.284086061734672e-07, "loss": 0.3389, "step": 1202 }, { "epoch": 1.8134539287733182, "grad_norm": 0.5379140129646219, "learning_rate": 5.199946005061462e-07, "loss": 0.3191, "step": 1203 }, { "epoch": 1.8149613717731299, "grad_norm": 0.5586596394798488, "learning_rate": 5.116463344591893e-07, "loss": 0.3297, "step": 1204 }, { "epoch": 1.8164688147729415, "grad_norm": 0.5460139307968361, "learning_rate": 5.033638659241102e-07, "loss": 0.3179, "step": 1205 }, { "epoch": 1.817976257772753, "grad_norm": 0.5099561737975997, "learning_rate": 4.951472523361401e-07, "loss": 0.2881, "step": 1206 }, { "epoch": 1.8194837007725644, "grad_norm": 0.5127038476010487, "learning_rate": 4.869965506738416e-07, "loss": 0.301, "step": 1207 }, { "epoch": 1.820991143772376, "grad_norm": 0.5160382306170839, "learning_rate": 4.789118174587071e-07, "loss": 0.2951, "step": 1208 }, { "epoch": 1.8224985867721877, "grad_norm": 0.5368235750006268, "learning_rate": 4.7089310875475856e-07, "loss": 0.3244, "step": 1209 }, { "epoch": 1.8240060297719993, "grad_norm": 0.5486310091200752, "learning_rate": 4.6294048016817917e-07, "loss": 0.3357, "step": 1210 }, { "epoch": 1.825513472771811, "grad_norm": 0.5531427959479509, "learning_rate": 4.550539868469106e-07, "loss": 0.3404, "step": 1211 }, { "epoch": 1.8270209157716224, "grad_norm": 0.5715387730686571, "learning_rate": 4.4723368348027375e-07, "loss": 0.3172, "step": 1212 }, { "epoch": 1.8285283587714338, "grad_norm": 0.5208566706980682, "learning_rate": 4.394796242985933e-07, "loss": 0.3334, "step": 1213 }, { "epoch": 1.8300358017712455, "grad_norm": 0.5088911051543478, "learning_rate": 4.317918630728235e-07, "loss": 0.3022, "step": 1214 }, { "epoch": 1.8315432447710571, "grad_norm": 0.5459330178466746, "learning_rate": 4.241704531141633e-07, "loss": 0.3192, "step": 1215 }, { "epoch": 1.8330506877708688, "grad_norm": 0.5256076032695434, "learning_rate": 4.166154472737061e-07, "loss": 0.2962, "step": 1216 }, { "epoch": 1.8345581307706802, "grad_norm": 0.525111212488327, "learning_rate": 4.091268979420537e-07, "loss": 0.3015, "step": 1217 }, { "epoch": 1.8360655737704918, "grad_norm": 0.5131054923837834, "learning_rate": 4.0170485704896453e-07, "loss": 0.2984, "step": 1218 }, { "epoch": 1.8375730167703033, "grad_norm": 0.5222731798701179, "learning_rate": 3.943493760629924e-07, "loss": 0.3007, "step": 1219 }, { "epoch": 1.839080459770115, "grad_norm": 0.526864271558084, "learning_rate": 3.8706050599112363e-07, "loss": 0.3265, "step": 1220 }, { "epoch": 1.8405879027699266, "grad_norm": 0.5429845817805197, "learning_rate": 3.798382973784298e-07, "loss": 0.3126, "step": 1221 }, { "epoch": 1.8420953457697382, "grad_norm": 0.5203043666384023, "learning_rate": 3.7268280030771655e-07, "loss": 0.3005, "step": 1222 }, { "epoch": 1.8436027887695496, "grad_norm": 0.5553304676785623, "learning_rate": 3.655940643991718e-07, "loss": 0.3033, "step": 1223 }, { "epoch": 1.845110231769361, "grad_norm": 0.5453329521030166, "learning_rate": 3.585721388100283e-07, "loss": 0.3197, "step": 1224 }, { "epoch": 1.8466176747691727, "grad_norm": 0.5358448339719871, "learning_rate": 3.516170722342127e-07, "loss": 0.3123, "step": 1225 }, { "epoch": 1.8481251177689844, "grad_norm": 0.5255670215605667, "learning_rate": 3.4472891290201927e-07, "loss": 0.3052, "step": 1226 }, { "epoch": 1.849632560768796, "grad_norm": 0.5558001652650641, "learning_rate": 3.3790770857976995e-07, "loss": 0.326, "step": 1227 }, { "epoch": 1.8511400037686077, "grad_norm": 0.5383925013665675, "learning_rate": 3.3115350656948043e-07, "loss": 0.3074, "step": 1228 }, { "epoch": 1.852647446768419, "grad_norm": 0.537705868540796, "learning_rate": 3.2446635370853686e-07, "loss": 0.3304, "step": 1229 }, { "epoch": 1.8541548897682305, "grad_norm": 0.5382781367285551, "learning_rate": 3.1784629636937404e-07, "loss": 0.2883, "step": 1230 }, { "epoch": 1.8556623327680422, "grad_norm": 0.5198656289929648, "learning_rate": 3.1129338045914004e-07, "loss": 0.3067, "step": 1231 }, { "epoch": 1.8571697757678538, "grad_norm": 0.5472844326917599, "learning_rate": 3.0480765141939316e-07, "loss": 0.2992, "step": 1232 }, { "epoch": 1.8586772187676655, "grad_norm": 0.5507075256404199, "learning_rate": 2.9838915422578e-07, "loss": 0.3179, "step": 1233 }, { "epoch": 1.8601846617674769, "grad_norm": 0.5510217424809797, "learning_rate": 2.920379333877221e-07, "loss": 0.2994, "step": 1234 }, { "epoch": 1.8616921047672885, "grad_norm": 0.5400374711049234, "learning_rate": 2.8575403294811123e-07, "loss": 0.32, "step": 1235 }, { "epoch": 1.8631995477671, "grad_norm": 0.5378450793044806, "learning_rate": 2.795374964830022e-07, "loss": 0.2982, "step": 1236 }, { "epoch": 1.8647069907669116, "grad_norm": 0.5537869321901812, "learning_rate": 2.733883671013082e-07, "loss": 0.3215, "step": 1237 }, { "epoch": 1.8662144337667232, "grad_norm": 0.5214658307160525, "learning_rate": 2.673066874445096e-07, "loss": 0.2967, "step": 1238 }, { "epoch": 1.867721876766535, "grad_norm": 0.5396057264477051, "learning_rate": 2.612924996863453e-07, "loss": 0.3323, "step": 1239 }, { "epoch": 1.8692293197663463, "grad_norm": 0.5029335509841266, "learning_rate": 2.5534584553253526e-07, "loss": 0.2874, "step": 1240 }, { "epoch": 1.8707367627661577, "grad_norm": 0.5258804019291271, "learning_rate": 2.494667662204797e-07, "loss": 0.2899, "step": 1241 }, { "epoch": 1.8722442057659694, "grad_norm": 0.5248481914254555, "learning_rate": 2.436553025189758e-07, "loss": 0.3024, "step": 1242 }, { "epoch": 1.873751648765781, "grad_norm": 0.5561247405618174, "learning_rate": 2.3791149472794373e-07, "loss": 0.3224, "step": 1243 }, { "epoch": 1.8752590917655927, "grad_norm": 0.5386355445772364, "learning_rate": 2.3223538267813317e-07, "loss": 0.3252, "step": 1244 }, { "epoch": 1.8767665347654043, "grad_norm": 0.5387316814949316, "learning_rate": 2.2662700573085505e-07, "loss": 0.3188, "step": 1245 }, { "epoch": 1.8782739777652158, "grad_norm": 0.5140491567851894, "learning_rate": 2.2108640277771153e-07, "loss": 0.3087, "step": 1246 }, { "epoch": 1.8797814207650272, "grad_norm": 0.5578574961077984, "learning_rate": 2.156136122403174e-07, "loss": 0.3339, "step": 1247 }, { "epoch": 1.8812888637648388, "grad_norm": 0.531923059262347, "learning_rate": 2.1020867207004026e-07, "loss": 0.302, "step": 1248 }, { "epoch": 1.8827963067646505, "grad_norm": 0.5199091046599704, "learning_rate": 2.048716197477374e-07, "loss": 0.3, "step": 1249 }, { "epoch": 1.8843037497644621, "grad_norm": 0.5292062225255757, "learning_rate": 1.996024922834905e-07, "loss": 0.3136, "step": 1250 }, { "epoch": 1.8858111927642736, "grad_norm": 0.5116718173644801, "learning_rate": 1.9440132621635687e-07, "loss": 0.3022, "step": 1251 }, { "epoch": 1.8873186357640852, "grad_norm": 0.5311851666913588, "learning_rate": 1.8926815761410867e-07, "loss": 0.3119, "step": 1252 }, { "epoch": 1.8888260787638966, "grad_norm": 0.5164961460809835, "learning_rate": 1.8420302207298623e-07, "loss": 0.3366, "step": 1253 }, { "epoch": 1.8903335217637083, "grad_norm": 0.504315215516738, "learning_rate": 1.792059547174507e-07, "loss": 0.2975, "step": 1254 }, { "epoch": 1.89184096476352, "grad_norm": 0.5345352832708335, "learning_rate": 1.7427699019994415e-07, "loss": 0.3236, "step": 1255 }, { "epoch": 1.8933484077633316, "grad_norm": 0.5381742517110331, "learning_rate": 1.6941616270063854e-07, "loss": 0.3279, "step": 1256 }, { "epoch": 1.894855850763143, "grad_norm": 0.5375566393713683, "learning_rate": 1.6462350592721498e-07, "loss": 0.3362, "step": 1257 }, { "epoch": 1.8963632937629544, "grad_norm": 0.5285514266127366, "learning_rate": 1.5989905311461274e-07, "loss": 0.3204, "step": 1258 }, { "epoch": 1.897870736762766, "grad_norm": 0.5414536639413304, "learning_rate": 1.5524283702481158e-07, "loss": 0.3335, "step": 1259 }, { "epoch": 1.8993781797625777, "grad_norm": 0.512538356667461, "learning_rate": 1.5065488994659983e-07, "loss": 0.3053, "step": 1260 }, { "epoch": 1.9008856227623894, "grad_norm": 0.5324212249495981, "learning_rate": 1.461352436953478e-07, "loss": 0.3072, "step": 1261 }, { "epoch": 1.902393065762201, "grad_norm": 0.5455697348576503, "learning_rate": 1.4168392961279254e-07, "loss": 0.3316, "step": 1262 }, { "epoch": 1.9039005087620124, "grad_norm": 0.5466375519251029, "learning_rate": 1.3730097856681668e-07, "loss": 0.3226, "step": 1263 }, { "epoch": 1.9054079517618239, "grad_norm": 0.5312632713929628, "learning_rate": 1.329864209512377e-07, "loss": 0.315, "step": 1264 }, { "epoch": 1.9069153947616355, "grad_norm": 0.5425648068314173, "learning_rate": 1.2874028668559247e-07, "loss": 0.3235, "step": 1265 }, { "epoch": 1.9084228377614472, "grad_norm": 0.5312642091039448, "learning_rate": 1.245626052149318e-07, "loss": 0.3203, "step": 1266 }, { "epoch": 1.9099302807612588, "grad_norm": 0.532495465640754, "learning_rate": 1.2045340550961958e-07, "loss": 0.3155, "step": 1267 }, { "epoch": 1.9114377237610702, "grad_norm": 0.5246778980321247, "learning_rate": 1.164127160651285e-07, "loss": 0.2926, "step": 1268 }, { "epoch": 1.9129451667608819, "grad_norm": 0.5339514500193528, "learning_rate": 1.1244056490184008e-07, "loss": 0.3029, "step": 1269 }, { "epoch": 1.9144526097606933, "grad_norm": 0.520828858822998, "learning_rate": 1.0853697956485942e-07, "loss": 0.3065, "step": 1270 }, { "epoch": 1.915960052760505, "grad_norm": 0.520817868672033, "learning_rate": 1.0470198712381086e-07, "loss": 0.307, "step": 1271 }, { "epoch": 1.9174674957603166, "grad_norm": 0.516414932582989, "learning_rate": 1.009356141726614e-07, "loss": 0.3101, "step": 1272 }, { "epoch": 1.9189749387601283, "grad_norm": 0.549210829131398, "learning_rate": 9.723788682953539e-08, "loss": 0.3562, "step": 1273 }, { "epoch": 1.9204823817599397, "grad_norm": 0.5457067373758283, "learning_rate": 9.360883073652238e-08, "loss": 0.3179, "step": 1274 }, { "epoch": 1.921989824759751, "grad_norm": 0.5418508804321499, "learning_rate": 9.004847105951509e-08, "loss": 0.3159, "step": 1275 }, { "epoch": 1.9234972677595628, "grad_norm": 0.5366441286826634, "learning_rate": 8.655683248802282e-08, "loss": 0.2996, "step": 1276 }, { "epoch": 1.9250047107593744, "grad_norm": 0.5442333602669928, "learning_rate": 8.313393923500613e-08, "loss": 0.3088, "step": 1277 }, { "epoch": 1.926512153759186, "grad_norm": 0.5391901662166373, "learning_rate": 7.977981503670795e-08, "loss": 0.3061, "step": 1278 }, { "epoch": 1.9280195967589977, "grad_norm": 0.5435340810409717, "learning_rate": 7.64944831524872e-08, "loss": 0.3285, "step": 1279 }, { "epoch": 1.9295270397588091, "grad_norm": 0.521864945549257, "learning_rate": 7.327796636465767e-08, "loss": 0.3076, "step": 1280 }, { "epoch": 1.9310344827586206, "grad_norm": 0.5514257916288331, "learning_rate": 7.01302869783338e-08, "loss": 0.3094, "step": 1281 }, { "epoch": 1.9325419257584322, "grad_norm": 0.5219787357249853, "learning_rate": 6.705146682127184e-08, "loss": 0.2995, "step": 1282 }, { "epoch": 1.9340493687582438, "grad_norm": 0.5276229371813537, "learning_rate": 6.404152724371892e-08, "loss": 0.3091, "step": 1283 }, { "epoch": 1.9355568117580555, "grad_norm": 0.5314226178684127, "learning_rate": 6.110048911826871e-08, "loss": 0.3158, "step": 1284 }, { "epoch": 1.937064254757867, "grad_norm": 0.5502887577785275, "learning_rate": 5.82283728397115e-08, "loss": 0.3215, "step": 1285 }, { "epoch": 1.9385716977576786, "grad_norm": 0.5486357521857442, "learning_rate": 5.542519832489546e-08, "loss": 0.3386, "step": 1286 }, { "epoch": 1.94007914075749, "grad_norm": 0.5249846416693436, "learning_rate": 5.269098501259007e-08, "loss": 0.3097, "step": 1287 }, { "epoch": 1.9415865837573016, "grad_norm": 0.5254444548298214, "learning_rate": 5.002575186334735e-08, "loss": 0.3357, "step": 1288 }, { "epoch": 1.9430940267571133, "grad_norm": 0.5333212942639225, "learning_rate": 4.742951735937418e-08, "loss": 0.3051, "step": 1289 }, { "epoch": 1.944601469756925, "grad_norm": 0.551663577132892, "learning_rate": 4.490229950440239e-08, "loss": 0.3004, "step": 1290 }, { "epoch": 1.9461089127567364, "grad_norm": 0.5255799475848183, "learning_rate": 4.2444115823562226e-08, "loss": 0.2978, "step": 1291 }, { "epoch": 1.9476163557565478, "grad_norm": 0.5122243774948546, "learning_rate": 4.005498336326463e-08, "loss": 0.2904, "step": 1292 }, { "epoch": 1.9491237987563594, "grad_norm": 0.5387853567764707, "learning_rate": 3.773491869108137e-08, "loss": 0.3186, "step": 1293 }, { "epoch": 1.950631241756171, "grad_norm": 0.5188343179014033, "learning_rate": 3.548393789562732e-08, "loss": 0.3009, "step": 1294 }, { "epoch": 1.9521386847559827, "grad_norm": 0.5377374320626865, "learning_rate": 3.3302056586453916e-08, "loss": 0.3054, "step": 1295 }, { "epoch": 1.9536461277557944, "grad_norm": 0.535561804417277, "learning_rate": 3.118928989393699e-08, "loss": 0.296, "step": 1296 }, { "epoch": 1.9551535707556058, "grad_norm": 0.5298311028053398, "learning_rate": 2.9145652469174666e-08, "loss": 0.3049, "step": 1297 }, { "epoch": 1.9566610137554172, "grad_norm": 0.5269215357647239, "learning_rate": 2.7171158483882963e-08, "loss": 0.2986, "step": 1298 }, { "epoch": 1.9581684567552289, "grad_norm": 0.5270798678914951, "learning_rate": 2.5265821630298116e-08, "loss": 0.3214, "step": 1299 }, { "epoch": 1.9596758997550405, "grad_norm": 0.5488284780441306, "learning_rate": 2.3429655121085525e-08, "loss": 0.3293, "step": 1300 }, { "epoch": 1.9611833427548522, "grad_norm": 0.5281296618472574, "learning_rate": 2.1662671689242076e-08, "loss": 0.3269, "step": 1301 }, { "epoch": 1.9626907857546636, "grad_norm": 0.5477047385786338, "learning_rate": 1.996488358801174e-08, "loss": 0.3116, "step": 1302 }, { "epoch": 1.9641982287544753, "grad_norm": 0.548270877454329, "learning_rate": 1.8336302590798992e-08, "loss": 0.3415, "step": 1303 }, { "epoch": 1.9657056717542867, "grad_norm": 0.5385366961987965, "learning_rate": 1.677693999109109e-08, "loss": 0.3036, "step": 1304 }, { "epoch": 1.9672131147540983, "grad_norm": 0.5125316134927453, "learning_rate": 1.5286806602372583e-08, "loss": 0.2899, "step": 1305 }, { "epoch": 1.96872055775391, "grad_norm": 0.5211226615097172, "learning_rate": 1.3865912758054267e-08, "loss": 0.3025, "step": 1306 }, { "epoch": 1.9702280007537216, "grad_norm": 0.5265304987884217, "learning_rate": 1.2514268311405452e-08, "loss": 0.3005, "step": 1307 }, { "epoch": 1.971735443753533, "grad_norm": 0.5410147654111483, "learning_rate": 1.1231882635477364e-08, "loss": 0.3119, "step": 1308 }, { "epoch": 1.9732428867533447, "grad_norm": 0.5170988890501786, "learning_rate": 1.0018764623045407e-08, "loss": 0.2958, "step": 1309 }, { "epoch": 1.9747503297531561, "grad_norm": 0.5184171784095163, "learning_rate": 8.874922686541442e-09, "loss": 0.2924, "step": 1310 }, { "epoch": 1.9762577727529678, "grad_norm": 0.5140179548472411, "learning_rate": 7.800364758002721e-09, "loss": 0.2935, "step": 1311 }, { "epoch": 1.9777652157527794, "grad_norm": 0.5486141459025122, "learning_rate": 6.795098289008595e-09, "loss": 0.315, "step": 1312 }, { "epoch": 1.979272658752591, "grad_norm": 0.5311267803536656, "learning_rate": 5.859130250636113e-09, "loss": 0.3115, "step": 1313 }, { "epoch": 1.9807801017524025, "grad_norm": 0.5127976418049099, "learning_rate": 4.992467133406731e-09, "loss": 0.2853, "step": 1314 }, { "epoch": 1.982287544752214, "grad_norm": 0.5328293684979241, "learning_rate": 4.195114947244117e-09, "loss": 0.3117, "step": 1315 }, { "epoch": 1.9837949877520256, "grad_norm": 0.5211491118658048, "learning_rate": 3.4670792214297476e-09, "loss": 0.3049, "step": 1316 }, { "epoch": 1.9853024307518372, "grad_norm": 0.532080710582646, "learning_rate": 2.808365004569602e-09, "loss": 0.3075, "step": 1317 }, { "epoch": 1.9868098737516489, "grad_norm": 0.530526147652671, "learning_rate": 2.2189768645519693e-09, "loss": 0.3158, "step": 1318 }, { "epoch": 1.9883173167514603, "grad_norm": 0.5335931657065038, "learning_rate": 1.6989188885219165e-09, "loss": 0.319, "step": 1319 }, { "epoch": 1.989824759751272, "grad_norm": 0.527218281586083, "learning_rate": 1.2481946828502011e-09, "loss": 0.2986, "step": 1320 }, { "epoch": 1.9913322027510834, "grad_norm": 0.5444631806162264, "learning_rate": 8.668073731088467e-10, "loss": 0.3414, "step": 1321 }, { "epoch": 1.992839645750895, "grad_norm": 0.5458452249259766, "learning_rate": 5.547596040489378e-10, "loss": 0.3312, "step": 1322 }, { "epoch": 1.9943470887507067, "grad_norm": 0.5337875506880636, "learning_rate": 3.1205353958285724e-10, "loss": 0.3065, "step": 1323 }, { "epoch": 1.9958545317505183, "grad_norm": 0.5477429410153635, "learning_rate": 1.3869086276985243e-10, "loss": 0.308, "step": 1324 }, { "epoch": 1.9973619747503297, "grad_norm": 0.5355633680169556, "learning_rate": 3.467277580271322e-11, "loss": 0.3114, "step": 1325 }, { "epoch": 1.9988694177501414, "grad_norm": 0.5487135118890082, "learning_rate": 0.0, "loss": 0.3367, "step": 1326 }, { "epoch": 1.9988694177501414, "step": 1326, "total_flos": 5.576345153511096e+17, "train_loss": 0.3973805017061363, "train_runtime": 5664.6789, "train_samples_per_second": 29.975, "train_steps_per_second": 0.234 } ], "logging_steps": 1, "max_steps": 1326, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.576345153511096e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }