diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9324 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9988694177501414, + "eval_steps": 500, + "global_step": 1326, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015074429998115696, + "grad_norm": 2.758618933635527, + "learning_rate": 1.5037593984962406e-07, + "loss": 0.7902, + "step": 1 + }, + { + "epoch": 0.003014885999623139, + "grad_norm": 2.6991611264758757, + "learning_rate": 3.007518796992481e-07, + "loss": 0.7844, + "step": 2 + }, + { + "epoch": 0.0045223289994347085, + "grad_norm": 2.7272565236365143, + "learning_rate": 4.511278195488722e-07, + "loss": 0.7626, + "step": 3 + }, + { + "epoch": 0.006029771999246278, + "grad_norm": 2.7528117182790965, + "learning_rate": 6.015037593984962e-07, + "loss": 0.7858, + "step": 4 + }, + { + "epoch": 0.007537214999057848, + "grad_norm": 2.775699578303916, + "learning_rate": 7.518796992481203e-07, + "loss": 0.8026, + "step": 5 + }, + { + "epoch": 0.009044657998869417, + "grad_norm": 2.6767851045683204, + "learning_rate": 9.022556390977444e-07, + "loss": 0.769, + "step": 6 + }, + { + "epoch": 0.010552100998680987, + "grad_norm": 2.5261120500748224, + "learning_rate": 1.0526315789473685e-06, + "loss": 0.7706, + "step": 7 + }, + { + "epoch": 0.012059543998492557, + "grad_norm": 2.514977460751076, + "learning_rate": 1.2030075187969925e-06, + "loss": 0.7785, + "step": 8 + }, + { + "epoch": 0.013566986998304126, + "grad_norm": 2.200241814288396, + "learning_rate": 1.3533834586466167e-06, + "loss": 0.7499, + "step": 9 + }, + { + "epoch": 0.015074429998115696, + "grad_norm": 2.212492267506947, + "learning_rate": 1.5037593984962406e-06, + "loss": 0.784, + "step": 10 + }, + { + "epoch": 0.016581872997927266, + "grad_norm": 1.9319703389138259, + "learning_rate": 1.6541353383458648e-06, + "loss": 0.7302, + "step": 11 + }, + { + "epoch": 0.018089315997738834, + "grad_norm": 1.2550044831716305, + "learning_rate": 1.8045112781954887e-06, + "loss": 0.7056, + "step": 12 + }, + { + "epoch": 0.019596758997550406, + "grad_norm": 1.2195135372913062, + "learning_rate": 1.9548872180451127e-06, + "loss": 0.7014, + "step": 13 + }, + { + "epoch": 0.021104201997361974, + "grad_norm": 1.2455313883450765, + "learning_rate": 2.105263157894737e-06, + "loss": 0.7374, + "step": 14 + }, + { + "epoch": 0.022611644997173545, + "grad_norm": 1.1872107473955416, + "learning_rate": 2.255639097744361e-06, + "loss": 0.7186, + "step": 15 + }, + { + "epoch": 0.024119087996985113, + "grad_norm": 1.1982009143546264, + "learning_rate": 2.406015037593985e-06, + "loss": 0.6828, + "step": 16 + }, + { + "epoch": 0.025626530996796685, + "grad_norm": 2.3539969715580384, + "learning_rate": 2.556390977443609e-06, + "loss": 0.6636, + "step": 17 + }, + { + "epoch": 0.027133973996608253, + "grad_norm": 2.6228143835706765, + "learning_rate": 2.7067669172932333e-06, + "loss": 0.6967, + "step": 18 + }, + { + "epoch": 0.028641416996419825, + "grad_norm": 2.224580122320562, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.6761, + "step": 19 + }, + { + "epoch": 0.030148859996231393, + "grad_norm": 1.6908938829069853, + "learning_rate": 3.007518796992481e-06, + "loss": 0.6711, + "step": 20 + }, + { + "epoch": 0.03165630299604296, + "grad_norm": 1.2313675848377437, + "learning_rate": 3.157894736842105e-06, + "loss": 0.6731, + "step": 21 + }, + { + "epoch": 
0.03316374599585453, + "grad_norm": 1.0688239960942414, + "learning_rate": 3.3082706766917295e-06, + "loss": 0.6766, + "step": 22 + }, + { + "epoch": 0.034671188995666104, + "grad_norm": 1.0400364411240592, + "learning_rate": 3.4586466165413535e-06, + "loss": 0.6388, + "step": 23 + }, + { + "epoch": 0.03617863199547767, + "grad_norm": 1.1273406799386165, + "learning_rate": 3.6090225563909775e-06, + "loss": 0.6406, + "step": 24 + }, + { + "epoch": 0.03768607499528924, + "grad_norm": 1.1097242153227487, + "learning_rate": 3.7593984962406014e-06, + "loss": 0.6316, + "step": 25 + }, + { + "epoch": 0.03919351799510081, + "grad_norm": 0.9446010057713108, + "learning_rate": 3.909774436090225e-06, + "loss": 0.6023, + "step": 26 + }, + { + "epoch": 0.04070096099491238, + "grad_norm": 0.8323088497080903, + "learning_rate": 4.06015037593985e-06, + "loss": 0.6183, + "step": 27 + }, + { + "epoch": 0.04220840399472395, + "grad_norm": 0.7872962129475931, + "learning_rate": 4.210526315789474e-06, + "loss": 0.603, + "step": 28 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 0.7767297100748087, + "learning_rate": 4.360902255639098e-06, + "loss": 0.6217, + "step": 29 + }, + { + "epoch": 0.04522328999434709, + "grad_norm": 0.8604923635307716, + "learning_rate": 4.511278195488722e-06, + "loss": 0.5855, + "step": 30 + }, + { + "epoch": 0.046730732994158655, + "grad_norm": 0.8294236539215625, + "learning_rate": 4.661654135338346e-06, + "loss": 0.5933, + "step": 31 + }, + { + "epoch": 0.04823817599397023, + "grad_norm": 0.7967164005183986, + "learning_rate": 4.81203007518797e-06, + "loss": 0.6159, + "step": 32 + }, + { + "epoch": 0.0497456189937818, + "grad_norm": 0.6830740716112117, + "learning_rate": 4.962406015037594e-06, + "loss": 0.5752, + "step": 33 + }, + { + "epoch": 0.05125306199359337, + "grad_norm": 0.6829489822638658, + "learning_rate": 5.112781954887218e-06, + "loss": 0.5792, + "step": 34 + }, + { + "epoch": 0.052760504993404934, + "grad_norm": 0.7505911666509206, + "learning_rate": 5.263157894736842e-06, + "loss": 0.602, + "step": 35 + }, + { + "epoch": 0.054267947993216506, + "grad_norm": 0.7281087618287696, + "learning_rate": 5.413533834586467e-06, + "loss": 0.5994, + "step": 36 + }, + { + "epoch": 0.05577539099302808, + "grad_norm": 0.7061965815038841, + "learning_rate": 5.56390977443609e-06, + "loss": 0.5734, + "step": 37 + }, + { + "epoch": 0.05728283399283965, + "grad_norm": 0.6836377512068608, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5543, + "step": 38 + }, + { + "epoch": 0.058790276992651214, + "grad_norm": 0.6751595521927064, + "learning_rate": 5.864661654135339e-06, + "loss": 0.5983, + "step": 39 + }, + { + "epoch": 0.060297719992462785, + "grad_norm": 0.6517307660683558, + "learning_rate": 6.015037593984962e-06, + "loss": 0.5482, + "step": 40 + }, + { + "epoch": 0.06180516299227436, + "grad_norm": 0.670740477832527, + "learning_rate": 6.165413533834587e-06, + "loss": 0.5771, + "step": 41 + }, + { + "epoch": 0.06331260599208592, + "grad_norm": 0.6386183426057812, + "learning_rate": 6.31578947368421e-06, + "loss": 0.5338, + "step": 42 + }, + { + "epoch": 0.0648200489918975, + "grad_norm": 0.6325251598452951, + "learning_rate": 6.466165413533835e-06, + "loss": 0.5409, + "step": 43 + }, + { + "epoch": 0.06632749199170906, + "grad_norm": 0.6190880971896819, + "learning_rate": 6.616541353383459e-06, + "loss": 0.5386, + "step": 44 + }, + { + "epoch": 0.06783493499152063, + "grad_norm": 0.6390822260018926, + "learning_rate": 6.766917293233083e-06, + "loss": 0.5824, + "step": 
45 + }, + { + "epoch": 0.06934237799133221, + "grad_norm": 0.6866065901609671, + "learning_rate": 6.917293233082707e-06, + "loss": 0.5661, + "step": 46 + }, + { + "epoch": 0.07084982099114377, + "grad_norm": 0.6253742924174672, + "learning_rate": 7.067669172932331e-06, + "loss": 0.5371, + "step": 47 + }, + { + "epoch": 0.07235726399095534, + "grad_norm": 0.6086976797344416, + "learning_rate": 7.218045112781955e-06, + "loss": 0.5394, + "step": 48 + }, + { + "epoch": 0.07386470699076692, + "grad_norm": 0.6617551336853821, + "learning_rate": 7.368421052631579e-06, + "loss": 0.5481, + "step": 49 + }, + { + "epoch": 0.07537214999057848, + "grad_norm": 0.6511571841438215, + "learning_rate": 7.518796992481203e-06, + "loss": 0.557, + "step": 50 + }, + { + "epoch": 0.07687959299039004, + "grad_norm": 0.6424690635636273, + "learning_rate": 7.669172932330828e-06, + "loss": 0.554, + "step": 51 + }, + { + "epoch": 0.07838703599020162, + "grad_norm": 0.6208252498748196, + "learning_rate": 7.81954887218045e-06, + "loss": 0.5316, + "step": 52 + }, + { + "epoch": 0.07989447899001319, + "grad_norm": 0.6178927978791646, + "learning_rate": 7.969924812030075e-06, + "loss": 0.5303, + "step": 53 + }, + { + "epoch": 0.08140192198982477, + "grad_norm": 0.6246216171964205, + "learning_rate": 8.1203007518797e-06, + "loss": 0.5551, + "step": 54 + }, + { + "epoch": 0.08290936498963633, + "grad_norm": 0.6071119189590479, + "learning_rate": 8.270676691729324e-06, + "loss": 0.5324, + "step": 55 + }, + { + "epoch": 0.0844168079894479, + "grad_norm": 0.6123261613238393, + "learning_rate": 8.421052631578948e-06, + "loss": 0.5503, + "step": 56 + }, + { + "epoch": 0.08592425098925947, + "grad_norm": 0.620387110972641, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5353, + "step": 57 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.6185030072259556, + "learning_rate": 8.721804511278195e-06, + "loss": 0.5544, + "step": 58 + }, + { + "epoch": 0.0889391369888826, + "grad_norm": 0.6244700607026835, + "learning_rate": 8.87218045112782e-06, + "loss": 0.5768, + "step": 59 + }, + { + "epoch": 0.09044657998869418, + "grad_norm": 0.655865564676625, + "learning_rate": 9.022556390977444e-06, + "loss": 0.5541, + "step": 60 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 0.6355623162806917, + "learning_rate": 9.172932330827068e-06, + "loss": 0.5317, + "step": 61 + }, + { + "epoch": 0.09346146598831731, + "grad_norm": 0.6620650186277286, + "learning_rate": 9.323308270676693e-06, + "loss": 0.5825, + "step": 62 + }, + { + "epoch": 0.09496890898812889, + "grad_norm": 0.6544984607034259, + "learning_rate": 9.473684210526315e-06, + "loss": 0.5367, + "step": 63 + }, + { + "epoch": 0.09647635198794045, + "grad_norm": 0.6566999876216955, + "learning_rate": 9.62406015037594e-06, + "loss": 0.5334, + "step": 64 + }, + { + "epoch": 0.09798379498775203, + "grad_norm": 0.6538652733227992, + "learning_rate": 9.774436090225564e-06, + "loss": 0.5088, + "step": 65 + }, + { + "epoch": 0.0994912379875636, + "grad_norm": 0.7184816645886852, + "learning_rate": 9.924812030075189e-06, + "loss": 0.5015, + "step": 66 + }, + { + "epoch": 0.10099868098737516, + "grad_norm": 0.6287887378220718, + "learning_rate": 1.0075187969924813e-05, + "loss": 0.5171, + "step": 67 + }, + { + "epoch": 0.10250612398718674, + "grad_norm": 0.7045986205120561, + "learning_rate": 1.0225563909774436e-05, + "loss": 0.5499, + "step": 68 + }, + { + "epoch": 0.1040135669869983, + "grad_norm": 0.6263524660452249, + "learning_rate": 1.0375939849624062e-05, + "loss": 
0.5319, + "step": 69 + }, + { + "epoch": 0.10552100998680987, + "grad_norm": 0.6340009161866458, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.5295, + "step": 70 + }, + { + "epoch": 0.10702845298662145, + "grad_norm": 0.6930114141626272, + "learning_rate": 1.0676691729323309e-05, + "loss": 0.543, + "step": 71 + }, + { + "epoch": 0.10853589598643301, + "grad_norm": 0.6582132112309497, + "learning_rate": 1.0827067669172933e-05, + "loss": 0.5164, + "step": 72 + }, + { + "epoch": 0.11004333898624458, + "grad_norm": 0.6774079053656817, + "learning_rate": 1.0977443609022558e-05, + "loss": 0.5202, + "step": 73 + }, + { + "epoch": 0.11155078198605616, + "grad_norm": 0.6435562490480392, + "learning_rate": 1.112781954887218e-05, + "loss": 0.5095, + "step": 74 + }, + { + "epoch": 0.11305822498586772, + "grad_norm": 0.7112297517037395, + "learning_rate": 1.1278195488721806e-05, + "loss": 0.5316, + "step": 75 + }, + { + "epoch": 0.1145656679856793, + "grad_norm": 0.709494451956929, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.4935, + "step": 76 + }, + { + "epoch": 0.11607311098549086, + "grad_norm": 0.6777802836075782, + "learning_rate": 1.1578947368421053e-05, + "loss": 0.5043, + "step": 77 + }, + { + "epoch": 0.11758055398530243, + "grad_norm": 0.6296151489375509, + "learning_rate": 1.1729323308270678e-05, + "loss": 0.4874, + "step": 78 + }, + { + "epoch": 0.119087996985114, + "grad_norm": 0.6808431409244452, + "learning_rate": 1.1879699248120302e-05, + "loss": 0.4788, + "step": 79 + }, + { + "epoch": 0.12059543998492557, + "grad_norm": 0.6704429377361576, + "learning_rate": 1.2030075187969925e-05, + "loss": 0.5011, + "step": 80 + }, + { + "epoch": 0.12210288298473713, + "grad_norm": 0.6926069766970787, + "learning_rate": 1.2180451127819551e-05, + "loss": 0.496, + "step": 81 + }, + { + "epoch": 0.12361032598454871, + "grad_norm": 0.639818862010909, + "learning_rate": 1.2330827067669174e-05, + "loss": 0.5308, + "step": 82 + }, + { + "epoch": 0.12511776898436028, + "grad_norm": 0.6204899572762589, + "learning_rate": 1.2481203007518798e-05, + "loss": 0.5063, + "step": 83 + }, + { + "epoch": 0.12662521198417184, + "grad_norm": 0.6865925022658576, + "learning_rate": 1.263157894736842e-05, + "loss": 0.507, + "step": 84 + }, + { + "epoch": 0.1281326549839834, + "grad_norm": 0.7029706975479946, + "learning_rate": 1.2781954887218047e-05, + "loss": 0.5209, + "step": 85 + }, + { + "epoch": 0.129640097983795, + "grad_norm": 0.6524424672188123, + "learning_rate": 1.293233082706767e-05, + "loss": 0.5527, + "step": 86 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.6074805023838824, + "learning_rate": 1.3082706766917295e-05, + "loss": 0.4873, + "step": 87 + }, + { + "epoch": 0.13265498398341813, + "grad_norm": 0.6891851394415897, + "learning_rate": 1.3233082706766918e-05, + "loss": 0.5335, + "step": 88 + }, + { + "epoch": 0.1341624269832297, + "grad_norm": 0.6322157680641546, + "learning_rate": 1.3383458646616543e-05, + "loss": 0.5163, + "step": 89 + }, + { + "epoch": 0.13566986998304126, + "grad_norm": 0.6652911326311045, + "learning_rate": 1.3533834586466165e-05, + "loss": 0.5227, + "step": 90 + }, + { + "epoch": 0.13717731298285282, + "grad_norm": 0.7241927650908743, + "learning_rate": 1.3684210526315791e-05, + "loss": 0.5271, + "step": 91 + }, + { + "epoch": 0.13868475598266441, + "grad_norm": 0.654474241851782, + "learning_rate": 1.3834586466165414e-05, + "loss": 0.5154, + "step": 92 + }, + { + "epoch": 0.14019219898247598, + "grad_norm": 0.7184414157305912, + "learning_rate": 
1.3984962406015038e-05, + "loss": 0.5077, + "step": 93 + }, + { + "epoch": 0.14169964198228754, + "grad_norm": 0.6449190071052974, + "learning_rate": 1.4135338345864663e-05, + "loss": 0.5038, + "step": 94 + }, + { + "epoch": 0.1432070849820991, + "grad_norm": 0.691580302982374, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.4667, + "step": 95 + }, + { + "epoch": 0.14471452798191067, + "grad_norm": 0.6288139344926038, + "learning_rate": 1.443609022556391e-05, + "loss": 0.49, + "step": 96 + }, + { + "epoch": 0.14622197098172227, + "grad_norm": 0.643873596183986, + "learning_rate": 1.4586466165413536e-05, + "loss": 0.4749, + "step": 97 + }, + { + "epoch": 0.14772941398153383, + "grad_norm": 0.6755660192421138, + "learning_rate": 1.4736842105263159e-05, + "loss": 0.5075, + "step": 98 + }, + { + "epoch": 0.1492368569813454, + "grad_norm": 0.646796309870399, + "learning_rate": 1.4887218045112783e-05, + "loss": 0.5296, + "step": 99 + }, + { + "epoch": 0.15074429998115696, + "grad_norm": 0.7044467423953411, + "learning_rate": 1.5037593984962406e-05, + "loss": 0.5303, + "step": 100 + }, + { + "epoch": 0.15225174298096852, + "grad_norm": 0.7730989605725896, + "learning_rate": 1.5187969924812032e-05, + "loss": 0.5099, + "step": 101 + }, + { + "epoch": 0.1537591859807801, + "grad_norm": 0.6620556405595589, + "learning_rate": 1.5338345864661656e-05, + "loss": 0.5189, + "step": 102 + }, + { + "epoch": 0.15526662898059168, + "grad_norm": 0.8038364131821449, + "learning_rate": 1.548872180451128e-05, + "loss": 0.5152, + "step": 103 + }, + { + "epoch": 0.15677407198040325, + "grad_norm": 0.6537353177538859, + "learning_rate": 1.56390977443609e-05, + "loss": 0.5012, + "step": 104 + }, + { + "epoch": 0.1582815149802148, + "grad_norm": 0.7802019309424624, + "learning_rate": 1.578947368421053e-05, + "loss": 0.4851, + "step": 105 + }, + { + "epoch": 0.15978895798002637, + "grad_norm": 0.7590456113216669, + "learning_rate": 1.593984962406015e-05, + "loss": 0.5077, + "step": 106 + }, + { + "epoch": 0.16129640097983794, + "grad_norm": 0.6740526095538228, + "learning_rate": 1.6090225563909775e-05, + "loss": 0.4794, + "step": 107 + }, + { + "epoch": 0.16280384397964953, + "grad_norm": 0.7266285917065574, + "learning_rate": 1.62406015037594e-05, + "loss": 0.5368, + "step": 108 + }, + { + "epoch": 0.1643112869794611, + "grad_norm": 0.7202106895600753, + "learning_rate": 1.6390977443609023e-05, + "loss": 0.5077, + "step": 109 + }, + { + "epoch": 0.16581872997927266, + "grad_norm": 0.7646664609937389, + "learning_rate": 1.6541353383458648e-05, + "loss": 0.517, + "step": 110 + }, + { + "epoch": 0.16732617297908423, + "grad_norm": 0.7090240598112959, + "learning_rate": 1.6691729323308272e-05, + "loss": 0.5217, + "step": 111 + }, + { + "epoch": 0.1688336159788958, + "grad_norm": 0.7260255784190195, + "learning_rate": 1.6842105263157896e-05, + "loss": 0.4977, + "step": 112 + }, + { + "epoch": 0.17034105897870735, + "grad_norm": 0.6392143364785348, + "learning_rate": 1.699248120300752e-05, + "loss": 0.4844, + "step": 113 + }, + { + "epoch": 0.17184850197851895, + "grad_norm": 0.7380625519153193, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.5063, + "step": 114 + }, + { + "epoch": 0.1733559449783305, + "grad_norm": 0.6999588028799495, + "learning_rate": 1.729323308270677e-05, + "loss": 0.5124, + "step": 115 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.7189938723035283, + "learning_rate": 1.744360902255639e-05, + "loss": 0.4723, + "step": 116 + }, + { + "epoch": 0.17637083097795364, + "grad_norm": 
0.755925477740424, + "learning_rate": 1.7593984962406015e-05, + "loss": 0.5071, + "step": 117 + }, + { + "epoch": 0.1778782739777652, + "grad_norm": 0.6932179016486248, + "learning_rate": 1.774436090225564e-05, + "loss": 0.4813, + "step": 118 + }, + { + "epoch": 0.1793857169775768, + "grad_norm": 0.6803984999939205, + "learning_rate": 1.7894736842105264e-05, + "loss": 0.4537, + "step": 119 + }, + { + "epoch": 0.18089315997738836, + "grad_norm": 0.6943064601614549, + "learning_rate": 1.8045112781954888e-05, + "loss": 0.5302, + "step": 120 + }, + { + "epoch": 0.18240060297719993, + "grad_norm": 0.7024561418818766, + "learning_rate": 1.8195488721804512e-05, + "loss": 0.5292, + "step": 121 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 0.6516091971357849, + "learning_rate": 1.8345864661654137e-05, + "loss": 0.5192, + "step": 122 + }, + { + "epoch": 0.18541548897682306, + "grad_norm": 0.733121234769519, + "learning_rate": 1.849624060150376e-05, + "loss": 0.5137, + "step": 123 + }, + { + "epoch": 0.18692293197663462, + "grad_norm": 0.6379606914374305, + "learning_rate": 1.8646616541353386e-05, + "loss": 0.5004, + "step": 124 + }, + { + "epoch": 0.1884303749764462, + "grad_norm": 0.682116565034204, + "learning_rate": 1.879699248120301e-05, + "loss": 0.4932, + "step": 125 + }, + { + "epoch": 0.18993781797625778, + "grad_norm": 0.6586357032334851, + "learning_rate": 1.894736842105263e-05, + "loss": 0.4949, + "step": 126 + }, + { + "epoch": 0.19144526097606934, + "grad_norm": 0.6996866499647233, + "learning_rate": 1.909774436090226e-05, + "loss": 0.4926, + "step": 127 + }, + { + "epoch": 0.1929527039758809, + "grad_norm": 0.7344954131354208, + "learning_rate": 1.924812030075188e-05, + "loss": 0.477, + "step": 128 + }, + { + "epoch": 0.19446014697569247, + "grad_norm": 0.6945294612726404, + "learning_rate": 1.9398496240601504e-05, + "loss": 0.4872, + "step": 129 + }, + { + "epoch": 0.19596758997550406, + "grad_norm": 0.7624604146665339, + "learning_rate": 1.954887218045113e-05, + "loss": 0.5224, + "step": 130 + }, + { + "epoch": 0.19747503297531563, + "grad_norm": 0.685229042181731, + "learning_rate": 1.9699248120300753e-05, + "loss": 0.4964, + "step": 131 + }, + { + "epoch": 0.1989824759751272, + "grad_norm": 0.8306163709490333, + "learning_rate": 1.9849624060150377e-05, + "loss": 0.5021, + "step": 132 + }, + { + "epoch": 0.20048991897493876, + "grad_norm": 0.6752867411242717, + "learning_rate": 2e-05, + "loss": 0.4946, + "step": 133 + }, + { + "epoch": 0.20199736197475032, + "grad_norm": 0.9473906923308808, + "learning_rate": 1.99999653272242e-05, + "loss": 0.5112, + "step": 134 + }, + { + "epoch": 0.2035048049745619, + "grad_norm": 0.6355233169612663, + "learning_rate": 1.9999861309137232e-05, + "loss": 0.5318, + "step": 135 + }, + { + "epoch": 0.20501224797437348, + "grad_norm": 0.8423903087733013, + "learning_rate": 1.999968794646042e-05, + "loss": 0.5148, + "step": 136 + }, + { + "epoch": 0.20651969097418504, + "grad_norm": 0.6660475408627802, + "learning_rate": 1.9999445240395953e-05, + "loss": 0.5178, + "step": 137 + }, + { + "epoch": 0.2080271339739966, + "grad_norm": 0.72967331295993, + "learning_rate": 1.9999133192626893e-05, + "loss": 0.5262, + "step": 138 + }, + { + "epoch": 0.20953457697380817, + "grad_norm": 0.7393548066200798, + "learning_rate": 1.9998751805317152e-05, + "loss": 0.5057, + "step": 139 + }, + { + "epoch": 0.21104201997361974, + "grad_norm": 0.687138877245702, + "learning_rate": 1.999830108111148e-05, + "loss": 0.4958, + "step": 140 + }, + { + "epoch": 
0.21254946297343133, + "grad_norm": 0.7007673502124087, + "learning_rate": 1.999778102313545e-05, + "loss": 0.4948, + "step": 141 + }, + { + "epoch": 0.2140569059732429, + "grad_norm": 0.7183800488623966, + "learning_rate": 1.999719163499543e-05, + "loss": 0.5104, + "step": 142 + }, + { + "epoch": 0.21556434897305446, + "grad_norm": 0.6535866563135689, + "learning_rate": 1.999653292077857e-05, + "loss": 0.5145, + "step": 143 + }, + { + "epoch": 0.21707179197286602, + "grad_norm": 0.64107430044815, + "learning_rate": 1.999580488505276e-05, + "loss": 0.4659, + "step": 144 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.6779403955139097, + "learning_rate": 1.9995007532866594e-05, + "loss": 0.4964, + "step": 145 + }, + { + "epoch": 0.22008667797248915, + "grad_norm": 0.6539110005752458, + "learning_rate": 1.9994140869749366e-05, + "loss": 0.5092, + "step": 146 + }, + { + "epoch": 0.22159412097230075, + "grad_norm": 0.6565635872751927, + "learning_rate": 1.9993204901710995e-05, + "loss": 0.5185, + "step": 147 + }, + { + "epoch": 0.2231015639721123, + "grad_norm": 0.6372834025735034, + "learning_rate": 1.9992199635241997e-05, + "loss": 0.5152, + "step": 148 + }, + { + "epoch": 0.22460900697192387, + "grad_norm": 0.622264657968412, + "learning_rate": 1.999112507731346e-05, + "loss": 0.5, + "step": 149 + }, + { + "epoch": 0.22611644997173544, + "grad_norm": 0.6267246999704467, + "learning_rate": 1.9989981235376956e-05, + "loss": 0.4897, + "step": 150 + }, + { + "epoch": 0.227623892971547, + "grad_norm": 0.6551262788067906, + "learning_rate": 1.9988768117364526e-05, + "loss": 0.5165, + "step": 151 + }, + { + "epoch": 0.2291313359713586, + "grad_norm": 0.6664514520384526, + "learning_rate": 1.9987485731688595e-05, + "loss": 0.5002, + "step": 152 + }, + { + "epoch": 0.23063877897117016, + "grad_norm": 0.6076256276502832, + "learning_rate": 1.998613408724195e-05, + "loss": 0.5084, + "step": 153 + }, + { + "epoch": 0.23214622197098173, + "grad_norm": 0.6373492462291207, + "learning_rate": 1.998471319339763e-05, + "loss": 0.5026, + "step": 154 + }, + { + "epoch": 0.2336536649707933, + "grad_norm": 1.745130262060046, + "learning_rate": 1.9983223060008908e-05, + "loss": 0.5034, + "step": 155 + }, + { + "epoch": 0.23516110797060485, + "grad_norm": 5.046536940100192, + "learning_rate": 1.9981663697409203e-05, + "loss": 0.5424, + "step": 156 + }, + { + "epoch": 0.23666855097041642, + "grad_norm": 1.5507957638980387, + "learning_rate": 1.998003511641199e-05, + "loss": 0.5301, + "step": 157 + }, + { + "epoch": 0.238175993970228, + "grad_norm": 0.9453551152302114, + "learning_rate": 1.997833732831076e-05, + "loss": 0.4793, + "step": 158 + }, + { + "epoch": 0.23968343697003958, + "grad_norm": 1.1864330727309345, + "learning_rate": 1.9976570344878916e-05, + "loss": 0.5125, + "step": 159 + }, + { + "epoch": 0.24119087996985114, + "grad_norm": 0.8095198214822489, + "learning_rate": 1.9974734178369702e-05, + "loss": 0.4904, + "step": 160 + }, + { + "epoch": 0.2426983229696627, + "grad_norm": 0.620347427984293, + "learning_rate": 1.997282884151612e-05, + "loss": 0.4611, + "step": 161 + }, + { + "epoch": 0.24420576596947427, + "grad_norm": 0.7590913383659819, + "learning_rate": 1.9970854347530828e-05, + "loss": 0.5085, + "step": 162 + }, + { + "epoch": 0.24571320896928586, + "grad_norm": 0.5932187358928716, + "learning_rate": 1.9968810710106065e-05, + "loss": 0.49, + "step": 163 + }, + { + "epoch": 0.24722065196909743, + "grad_norm": 0.7421744580230403, + "learning_rate": 1.9966697943413548e-05, + "loss": 
0.4789, + "step": 164 + }, + { + "epoch": 0.248728094968909, + "grad_norm": 0.634606964098851, + "learning_rate": 1.9964516062104377e-05, + "loss": 0.5008, + "step": 165 + }, + { + "epoch": 0.25023553796872056, + "grad_norm": 0.721486256547781, + "learning_rate": 1.996226508130892e-05, + "loss": 0.4546, + "step": 166 + }, + { + "epoch": 0.25174298096853215, + "grad_norm": 0.6785391250628956, + "learning_rate": 1.995994501663674e-05, + "loss": 0.4892, + "step": 167 + }, + { + "epoch": 0.2532504239683437, + "grad_norm": 0.5862796281463728, + "learning_rate": 1.995755588417644e-05, + "loss": 0.4736, + "step": 168 + }, + { + "epoch": 0.2547578669681553, + "grad_norm": 0.6656043784418029, + "learning_rate": 1.99550977004956e-05, + "loss": 0.4749, + "step": 169 + }, + { + "epoch": 0.2562653099679668, + "grad_norm": 0.6091440270236256, + "learning_rate": 1.9952570482640628e-05, + "loss": 0.4997, + "step": 170 + }, + { + "epoch": 0.2577727529677784, + "grad_norm": 0.7671131220608588, + "learning_rate": 1.9949974248136655e-05, + "loss": 0.4741, + "step": 171 + }, + { + "epoch": 0.25928019596759, + "grad_norm": 0.6246356814602296, + "learning_rate": 1.9947309014987414e-05, + "loss": 0.4727, + "step": 172 + }, + { + "epoch": 0.26078763896740154, + "grad_norm": 0.7874820318511245, + "learning_rate": 1.9944574801675106e-05, + "loss": 0.4965, + "step": 173 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.6727323732082747, + "learning_rate": 1.9941771627160287e-05, + "loss": 0.5361, + "step": 174 + }, + { + "epoch": 0.26380252496702467, + "grad_norm": 0.6896058530733802, + "learning_rate": 1.9938899510881732e-05, + "loss": 0.4574, + "step": 175 + }, + { + "epoch": 0.26530996796683626, + "grad_norm": 0.6396080754362474, + "learning_rate": 1.9935958472756283e-05, + "loss": 0.4791, + "step": 176 + }, + { + "epoch": 0.26681741096664785, + "grad_norm": 0.6978820918898457, + "learning_rate": 1.993294853317873e-05, + "loss": 0.4936, + "step": 177 + }, + { + "epoch": 0.2683248539664594, + "grad_norm": 0.6200726182474722, + "learning_rate": 1.9929869713021668e-05, + "loss": 0.4809, + "step": 178 + }, + { + "epoch": 0.269832296966271, + "grad_norm": 0.6621164817055001, + "learning_rate": 1.9926722033635343e-05, + "loss": 0.4833, + "step": 179 + }, + { + "epoch": 0.2713397399660825, + "grad_norm": 0.6443066814567524, + "learning_rate": 1.9923505516847514e-05, + "loss": 0.452, + "step": 180 + }, + { + "epoch": 0.2728471829658941, + "grad_norm": 0.6324303832157692, + "learning_rate": 1.9920220184963296e-05, + "loss": 0.4942, + "step": 181 + }, + { + "epoch": 0.27435462596570565, + "grad_norm": 0.7093590972609833, + "learning_rate": 1.9916866060764994e-05, + "loss": 0.4666, + "step": 182 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.6056554558199854, + "learning_rate": 1.991344316751198e-05, + "loss": 0.48, + "step": 183 + }, + { + "epoch": 0.27736951196532883, + "grad_norm": 0.6668247301054864, + "learning_rate": 1.9909951528940485e-05, + "loss": 0.4892, + "step": 184 + }, + { + "epoch": 0.27887695496514037, + "grad_norm": 0.6248269362388523, + "learning_rate": 1.990639116926348e-05, + "loss": 0.4601, + "step": 185 + }, + { + "epoch": 0.28038439796495196, + "grad_norm": 0.5988280290089756, + "learning_rate": 1.9902762113170467e-05, + "loss": 0.4671, + "step": 186 + }, + { + "epoch": 0.2818918409647635, + "grad_norm": 0.6183022850194487, + "learning_rate": 1.989906438582734e-05, + "loss": 0.5023, + "step": 187 + }, + { + "epoch": 0.2833992839645751, + "grad_norm": 0.6345731772578389, + 
"learning_rate": 1.9895298012876192e-05, + "loss": 0.4749, + "step": 188 + }, + { + "epoch": 0.2849067269643867, + "grad_norm": 0.6026738883514794, + "learning_rate": 1.9891463020435144e-05, + "loss": 0.4884, + "step": 189 + }, + { + "epoch": 0.2864141699641982, + "grad_norm": 0.6275566066201014, + "learning_rate": 1.9887559435098162e-05, + "loss": 0.4868, + "step": 190 + }, + { + "epoch": 0.2879216129640098, + "grad_norm": 0.6830623512458401, + "learning_rate": 1.9883587283934875e-05, + "loss": 0.4797, + "step": 191 + }, + { + "epoch": 0.28942905596382135, + "grad_norm": 0.621100203862078, + "learning_rate": 1.9879546594490383e-05, + "loss": 0.4781, + "step": 192 + }, + { + "epoch": 0.29093649896363294, + "grad_norm": 0.7266845450092815, + "learning_rate": 1.987543739478507e-05, + "loss": 0.4838, + "step": 193 + }, + { + "epoch": 0.29244394196344453, + "grad_norm": 0.5998498411317879, + "learning_rate": 1.987125971331441e-05, + "loss": 0.4809, + "step": 194 + }, + { + "epoch": 0.29395138496325607, + "grad_norm": 0.7629414665635117, + "learning_rate": 1.9867013579048765e-05, + "loss": 0.4891, + "step": 195 + }, + { + "epoch": 0.29545882796306766, + "grad_norm": 0.6340989750127195, + "learning_rate": 1.9862699021433186e-05, + "loss": 0.4696, + "step": 196 + }, + { + "epoch": 0.2969662709628792, + "grad_norm": 0.6600966786500729, + "learning_rate": 1.9858316070387208e-05, + "loss": 0.4568, + "step": 197 + }, + { + "epoch": 0.2984737139626908, + "grad_norm": 0.6836572646612057, + "learning_rate": 1.9853864756304654e-05, + "loss": 0.4849, + "step": 198 + }, + { + "epoch": 0.2999811569625024, + "grad_norm": 0.5912116643865833, + "learning_rate": 1.9849345110053405e-05, + "loss": 0.4752, + "step": 199 + }, + { + "epoch": 0.3014885999623139, + "grad_norm": 0.6202584603281575, + "learning_rate": 1.984475716297519e-05, + "loss": 0.478, + "step": 200 + }, + { + "epoch": 0.3029960429621255, + "grad_norm": 0.6021031729150327, + "learning_rate": 1.984010094688539e-05, + "loss": 0.4818, + "step": 201 + }, + { + "epoch": 0.30450348596193705, + "grad_norm": 0.6013263404823498, + "learning_rate": 1.9835376494072788e-05, + "loss": 0.4798, + "step": 202 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.6391395935977097, + "learning_rate": 1.9830583837299363e-05, + "loss": 0.5079, + "step": 203 + }, + { + "epoch": 0.3075183719615602, + "grad_norm": 0.6403126078695583, + "learning_rate": 1.9825723009800058e-05, + "loss": 0.4994, + "step": 204 + }, + { + "epoch": 0.30902581496137177, + "grad_norm": 0.6996650791864127, + "learning_rate": 1.9820794045282553e-05, + "loss": 0.458, + "step": 205 + }, + { + "epoch": 0.31053325796118336, + "grad_norm": 0.6120915229627125, + "learning_rate": 1.9815796977927015e-05, + "loss": 0.4837, + "step": 206 + }, + { + "epoch": 0.3120407009609949, + "grad_norm": 0.698625059347094, + "learning_rate": 1.9810731842385892e-05, + "loss": 0.4762, + "step": 207 + }, + { + "epoch": 0.3135481439608065, + "grad_norm": 0.6458152328354264, + "learning_rate": 1.9805598673783644e-05, + "loss": 0.4877, + "step": 208 + }, + { + "epoch": 0.31505558696061803, + "grad_norm": 0.6183456339468536, + "learning_rate": 1.980039750771651e-05, + "loss": 0.4555, + "step": 209 + }, + { + "epoch": 0.3165630299604296, + "grad_norm": 0.6625119162294268, + "learning_rate": 1.9795128380252263e-05, + "loss": 0.467, + "step": 210 + }, + { + "epoch": 0.3180704729602412, + "grad_norm": 0.5634839413053515, + "learning_rate": 1.978979132792996e-05, + "loss": 0.4664, + "step": 211 + }, + { + "epoch": 
0.31957791596005275, + "grad_norm": 0.6026219616185302, + "learning_rate": 1.9784386387759684e-05, + "loss": 0.4774, + "step": 212 + }, + { + "epoch": 0.32108535895986434, + "grad_norm": 0.6269218843440012, + "learning_rate": 1.977891359722229e-05, + "loss": 0.4432, + "step": 213 + }, + { + "epoch": 0.3225928019596759, + "grad_norm": 0.5910572611931473, + "learning_rate": 1.9773372994269147e-05, + "loss": 0.4699, + "step": 214 + }, + { + "epoch": 0.32410024495948747, + "grad_norm": 0.6536939644754692, + "learning_rate": 1.976776461732187e-05, + "loss": 0.4807, + "step": 215 + }, + { + "epoch": 0.32560768795929906, + "grad_norm": 0.6191726708771672, + "learning_rate": 1.976208850527206e-05, + "loss": 0.4944, + "step": 216 + }, + { + "epoch": 0.3271151309591106, + "grad_norm": 0.6298298802683915, + "learning_rate": 1.9756344697481027e-05, + "loss": 0.4862, + "step": 217 + }, + { + "epoch": 0.3286225739589222, + "grad_norm": 0.6539080215758202, + "learning_rate": 1.975053323377952e-05, + "loss": 0.4817, + "step": 218 + }, + { + "epoch": 0.33013001695873373, + "grad_norm": 0.6146092404035427, + "learning_rate": 1.9744654154467468e-05, + "loss": 0.5422, + "step": 219 + }, + { + "epoch": 0.3316374599585453, + "grad_norm": 0.6490399293285286, + "learning_rate": 1.9738707500313655e-05, + "loss": 0.4703, + "step": 220 + }, + { + "epoch": 0.3331449029583569, + "grad_norm": 0.615019483124787, + "learning_rate": 1.9732693312555492e-05, + "loss": 0.4801, + "step": 221 + }, + { + "epoch": 0.33465234595816845, + "grad_norm": 0.5547372740595196, + "learning_rate": 1.9726611632898693e-05, + "loss": 0.4719, + "step": 222 + }, + { + "epoch": 0.33615978895798004, + "grad_norm": 0.6076625752065381, + "learning_rate": 1.9720462503517e-05, + "loss": 0.498, + "step": 223 + }, + { + "epoch": 0.3376672319577916, + "grad_norm": 0.6047398581844834, + "learning_rate": 1.971424596705189e-05, + "loss": 0.4643, + "step": 224 + }, + { + "epoch": 0.3391746749576032, + "grad_norm": 0.5727445152315086, + "learning_rate": 1.9707962066612278e-05, + "loss": 0.4515, + "step": 225 + }, + { + "epoch": 0.3406821179574147, + "grad_norm": 0.5573272058322264, + "learning_rate": 1.970161084577422e-05, + "loss": 0.4524, + "step": 226 + }, + { + "epoch": 0.3421895609572263, + "grad_norm": 0.6257676922974255, + "learning_rate": 1.9695192348580606e-05, + "loss": 0.4815, + "step": 227 + }, + { + "epoch": 0.3436970039570379, + "grad_norm": 0.5745183403896584, + "learning_rate": 1.9688706619540863e-05, + "loss": 0.4717, + "step": 228 + }, + { + "epoch": 0.34520444695684943, + "grad_norm": 0.5964564340890054, + "learning_rate": 1.968215370363063e-05, + "loss": 0.4839, + "step": 229 + }, + { + "epoch": 0.346711889956661, + "grad_norm": 0.5672877352491237, + "learning_rate": 1.9675533646291463e-05, + "loss": 0.4914, + "step": 230 + }, + { + "epoch": 0.34821933295647256, + "grad_norm": 0.6672213227292868, + "learning_rate": 1.9668846493430522e-05, + "loss": 0.4718, + "step": 231 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.5546791014566226, + "learning_rate": 1.9662092291420233e-05, + "loss": 0.4392, + "step": 232 + }, + { + "epoch": 0.35123421895609575, + "grad_norm": 0.6546100852352986, + "learning_rate": 1.965527108709798e-05, + "loss": 0.4836, + "step": 233 + }, + { + "epoch": 0.3527416619559073, + "grad_norm": 0.5992362903479123, + "learning_rate": 1.964838292776579e-05, + "loss": 0.4464, + "step": 234 + }, + { + "epoch": 0.3542491049557189, + "grad_norm": 0.63523888294575, + "learning_rate": 1.9641427861189973e-05, + "loss": 
0.4856, + "step": 235 + }, + { + "epoch": 0.3557565479555304, + "grad_norm": 0.6032723041133213, + "learning_rate": 1.963440593560083e-05, + "loss": 0.4966, + "step": 236 + }, + { + "epoch": 0.357263990955342, + "grad_norm": 0.6306498314236755, + "learning_rate": 1.9627317199692287e-05, + "loss": 0.4771, + "step": 237 + }, + { + "epoch": 0.3587714339551536, + "grad_norm": 0.5865071462782886, + "learning_rate": 1.962016170262157e-05, + "loss": 0.4573, + "step": 238 + }, + { + "epoch": 0.36027887695496513, + "grad_norm": 0.5665927327271444, + "learning_rate": 1.961293949400888e-05, + "loss": 0.4485, + "step": 239 + }, + { + "epoch": 0.3617863199547767, + "grad_norm": 0.554220781330076, + "learning_rate": 1.960565062393701e-05, + "loss": 0.4686, + "step": 240 + }, + { + "epoch": 0.36329376295458826, + "grad_norm": 0.6455923732389204, + "learning_rate": 1.9598295142951035e-05, + "loss": 0.4592, + "step": 241 + }, + { + "epoch": 0.36480120595439985, + "grad_norm": 0.596721778819204, + "learning_rate": 1.9590873102057948e-05, + "loss": 0.4907, + "step": 242 + }, + { + "epoch": 0.36630864895421145, + "grad_norm": 0.6716627212373145, + "learning_rate": 1.9583384552726294e-05, + "loss": 0.4799, + "step": 243 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 0.6229194933798746, + "learning_rate": 1.957582954688584e-05, + "loss": 0.4652, + "step": 244 + }, + { + "epoch": 0.3693235349538346, + "grad_norm": 0.6298037085236075, + "learning_rate": 1.9568208136927177e-05, + "loss": 0.4717, + "step": 245 + }, + { + "epoch": 0.3708309779536461, + "grad_norm": 0.5787887978421966, + "learning_rate": 1.9560520375701408e-05, + "loss": 0.4845, + "step": 246 + }, + { + "epoch": 0.3723384209534577, + "grad_norm": 0.6451526421523999, + "learning_rate": 1.9552766316519726e-05, + "loss": 0.4516, + "step": 247 + }, + { + "epoch": 0.37384586395326924, + "grad_norm": 0.538692705705553, + "learning_rate": 1.9544946013153093e-05, + "loss": 0.4649, + "step": 248 + }, + { + "epoch": 0.37535330695308083, + "grad_norm": 0.6399780775437526, + "learning_rate": 1.9537059519831822e-05, + "loss": 0.4594, + "step": 249 + }, + { + "epoch": 0.3768607499528924, + "grad_norm": 0.6082935211607333, + "learning_rate": 1.9529106891245244e-05, + "loss": 0.4709, + "step": 250 + }, + { + "epoch": 0.37836819295270396, + "grad_norm": 0.6106738888512755, + "learning_rate": 1.9521088182541298e-05, + "loss": 0.492, + "step": 251 + }, + { + "epoch": 0.37987563595251556, + "grad_norm": 0.5803041737823633, + "learning_rate": 1.951300344932616e-05, + "loss": 0.4646, + "step": 252 + }, + { + "epoch": 0.3813830789523271, + "grad_norm": 0.5647638332240319, + "learning_rate": 1.9504852747663862e-05, + "loss": 0.4725, + "step": 253 + }, + { + "epoch": 0.3828905219521387, + "grad_norm": 0.664315669006426, + "learning_rate": 1.9496636134075894e-05, + "loss": 0.4689, + "step": 254 + }, + { + "epoch": 0.3843979649519503, + "grad_norm": 0.6019633789641826, + "learning_rate": 1.9488353665540813e-05, + "loss": 0.4613, + "step": 255 + }, + { + "epoch": 0.3859054079517618, + "grad_norm": 0.5805016640621002, + "learning_rate": 1.9480005399493857e-05, + "loss": 0.4613, + "step": 256 + }, + { + "epoch": 0.3874128509515734, + "grad_norm": 0.6053466035481387, + "learning_rate": 1.9471591393826536e-05, + "loss": 0.4877, + "step": 257 + }, + { + "epoch": 0.38892029395138494, + "grad_norm": 0.5443749204002357, + "learning_rate": 1.9463111706886234e-05, + "loss": 0.481, + "step": 258 + }, + { + "epoch": 0.39042773695119654, + "grad_norm": 0.6422687053592201, + 
"learning_rate": 1.9454566397475813e-05, + "loss": 0.464, + "step": 259 + }, + { + "epoch": 0.39193517995100813, + "grad_norm": 0.5911574213296809, + "learning_rate": 1.944595552485319e-05, + "loss": 0.4451, + "step": 260 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.6244696365384524, + "learning_rate": 1.943727914873094e-05, + "loss": 0.465, + "step": 261 + }, + { + "epoch": 0.39495006595063126, + "grad_norm": 0.6787496907794774, + "learning_rate": 1.9428537329275862e-05, + "loss": 0.4591, + "step": 262 + }, + { + "epoch": 0.3964575089504428, + "grad_norm": 0.584284155721975, + "learning_rate": 1.941973012710859e-05, + "loss": 0.4835, + "step": 263 + }, + { + "epoch": 0.3979649519502544, + "grad_norm": 0.6636147745329853, + "learning_rate": 1.941085760330316e-05, + "loss": 0.4558, + "step": 264 + }, + { + "epoch": 0.3994723949500659, + "grad_norm": 0.580002453326873, + "learning_rate": 1.940191981938657e-05, + "loss": 0.4848, + "step": 265 + }, + { + "epoch": 0.4009798379498775, + "grad_norm": 0.6067452479296194, + "learning_rate": 1.9392916837338376e-05, + "loss": 0.4783, + "step": 266 + }, + { + "epoch": 0.4024872809496891, + "grad_norm": 0.6517612748843483, + "learning_rate": 1.9383848719590257e-05, + "loss": 0.4849, + "step": 267 + }, + { + "epoch": 0.40399472394950064, + "grad_norm": 0.6355304966389256, + "learning_rate": 1.9374715529025575e-05, + "loss": 0.4312, + "step": 268 + }, + { + "epoch": 0.40550216694931224, + "grad_norm": 0.627744747765263, + "learning_rate": 1.9365517328978943e-05, + "loss": 0.4762, + "step": 269 + }, + { + "epoch": 0.4070096099491238, + "grad_norm": 0.6640367945419465, + "learning_rate": 1.9356254183235785e-05, + "loss": 0.432, + "step": 270 + }, + { + "epoch": 0.40851705294893537, + "grad_norm": 0.647008694411896, + "learning_rate": 1.93469261560319e-05, + "loss": 0.4795, + "step": 271 + }, + { + "epoch": 0.41002449594874696, + "grad_norm": 0.6742117075938286, + "learning_rate": 1.9337533312053002e-05, + "loss": 0.4573, + "step": 272 + }, + { + "epoch": 0.4115319389485585, + "grad_norm": 0.6000668524451142, + "learning_rate": 1.9328075716434287e-05, + "loss": 0.4474, + "step": 273 + }, + { + "epoch": 0.4130393819483701, + "grad_norm": 0.6027061587937567, + "learning_rate": 1.931855343475998e-05, + "loss": 0.4283, + "step": 274 + }, + { + "epoch": 0.4145468249481816, + "grad_norm": 0.56875377174764, + "learning_rate": 1.930896653306286e-05, + "loss": 0.4446, + "step": 275 + }, + { + "epoch": 0.4160542679479932, + "grad_norm": 0.6494800822344575, + "learning_rate": 1.929931507782383e-05, + "loss": 0.4504, + "step": 276 + }, + { + "epoch": 0.4175617109478048, + "grad_norm": 0.5925306999643124, + "learning_rate": 1.9289599135971437e-05, + "loss": 0.4993, + "step": 277 + }, + { + "epoch": 0.41906915394761635, + "grad_norm": 0.5812846521774916, + "learning_rate": 1.9279818774881418e-05, + "loss": 0.4574, + "step": 278 + }, + { + "epoch": 0.42057659694742794, + "grad_norm": 0.5625417674563119, + "learning_rate": 1.9269974062376224e-05, + "loss": 0.4325, + "step": 279 + }, + { + "epoch": 0.4220840399472395, + "grad_norm": 0.5839055838922522, + "learning_rate": 1.926006506672456e-05, + "loss": 0.4669, + "step": 280 + }, + { + "epoch": 0.42359148294705107, + "grad_norm": 0.6042605173402862, + "learning_rate": 1.9250091856640895e-05, + "loss": 0.4224, + "step": 281 + }, + { + "epoch": 0.42509892594686266, + "grad_norm": 0.5856982708883072, + "learning_rate": 1.9240054501285015e-05, + "loss": 0.4709, + "step": 282 + }, + { + "epoch": 0.4266063689466742, 
+ "grad_norm": 0.5631263514578662, + "learning_rate": 1.922995307026151e-05, + "loss": 0.4614, + "step": 283 + }, + { + "epoch": 0.4281138119464858, + "grad_norm": 0.5583569731432177, + "learning_rate": 1.921978763361931e-05, + "loss": 0.4589, + "step": 284 + }, + { + "epoch": 0.4296212549462973, + "grad_norm": 0.6050421963625475, + "learning_rate": 1.9209558261851194e-05, + "loss": 0.4382, + "step": 285 + }, + { + "epoch": 0.4311286979461089, + "grad_norm": 0.533785762634786, + "learning_rate": 1.919926502589331e-05, + "loss": 0.4862, + "step": 286 + }, + { + "epoch": 0.43263614094592046, + "grad_norm": 0.5693448486944194, + "learning_rate": 1.9188907997124666e-05, + "loss": 0.4562, + "step": 287 + }, + { + "epoch": 0.43414358394573205, + "grad_norm": 0.5654990613672617, + "learning_rate": 1.9178487247366652e-05, + "loss": 0.4492, + "step": 288 + }, + { + "epoch": 0.43565102694554364, + "grad_norm": 0.5771432152665512, + "learning_rate": 1.916800284888253e-05, + "loss": 0.4478, + "step": 289 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.5734596310020046, + "learning_rate": 1.915745487437694e-05, + "loss": 0.4801, + "step": 290 + }, + { + "epoch": 0.43866591294516677, + "grad_norm": 0.5832753483996317, + "learning_rate": 1.9146843396995396e-05, + "loss": 0.4563, + "step": 291 + }, + { + "epoch": 0.4401733559449783, + "grad_norm": 0.5879841082366902, + "learning_rate": 1.9136168490323772e-05, + "loss": 0.4689, + "step": 292 + }, + { + "epoch": 0.4416807989447899, + "grad_norm": 0.5521570450782258, + "learning_rate": 1.9125430228387794e-05, + "loss": 0.4581, + "step": 293 + }, + { + "epoch": 0.4431882419446015, + "grad_norm": 0.5673604877581071, + "learning_rate": 1.9114628685652535e-05, + "loss": 0.4668, + "step": 294 + }, + { + "epoch": 0.444695684944413, + "grad_norm": 0.5866077006525799, + "learning_rate": 1.9103763937021887e-05, + "loss": 0.4588, + "step": 295 + }, + { + "epoch": 0.4462031279442246, + "grad_norm": 0.5731048741878798, + "learning_rate": 1.909283605783805e-05, + "loss": 0.4774, + "step": 296 + }, + { + "epoch": 0.44771057094403616, + "grad_norm": 0.6251177027508026, + "learning_rate": 1.9081845123881002e-05, + "loss": 0.4813, + "step": 297 + }, + { + "epoch": 0.44921801394384775, + "grad_norm": 0.5256954818277138, + "learning_rate": 1.9070791211367984e-05, + "loss": 0.4473, + "step": 298 + }, + { + "epoch": 0.45072545694365934, + "grad_norm": 0.6199874516009303, + "learning_rate": 1.9059674396952963e-05, + "loss": 0.4629, + "step": 299 + }, + { + "epoch": 0.4522328999434709, + "grad_norm": 0.5917017492987557, + "learning_rate": 1.90484947577261e-05, + "loss": 0.4979, + "step": 300 + }, + { + "epoch": 0.45374034294328247, + "grad_norm": 0.6120361922704654, + "learning_rate": 1.903725237121322e-05, + "loss": 0.4831, + "step": 301 + }, + { + "epoch": 0.455247785943094, + "grad_norm": 0.5514120347682593, + "learning_rate": 1.902594731537527e-05, + "loss": 0.4452, + "step": 302 + }, + { + "epoch": 0.4567552289429056, + "grad_norm": 0.5767336190747095, + "learning_rate": 1.901457966860779e-05, + "loss": 0.4435, + "step": 303 + }, + { + "epoch": 0.4582626719427172, + "grad_norm": 0.5868519118956824, + "learning_rate": 1.9003149509740347e-05, + "loss": 0.492, + "step": 304 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.6168191655686016, + "learning_rate": 1.899165691803601e-05, + "loss": 0.4512, + "step": 305 + }, + { + "epoch": 0.4612775579423403, + "grad_norm": 0.6053359821845329, + "learning_rate": 1.8980101973190787e-05, + "loss": 0.4749, + "step": 306 + 
}, + { + "epoch": 0.46278500094215186, + "grad_norm": 0.60634572707715, + "learning_rate": 1.896848475533309e-05, + "loss": 0.4682, + "step": 307 + }, + { + "epoch": 0.46429244394196345, + "grad_norm": 0.6339199718330278, + "learning_rate": 1.8956805345023145e-05, + "loss": 0.4643, + "step": 308 + }, + { + "epoch": 0.465799886941775, + "grad_norm": 0.6011561135843241, + "learning_rate": 1.894506382325248e-05, + "loss": 0.435, + "step": 309 + }, + { + "epoch": 0.4673073299415866, + "grad_norm": 0.6067579490346751, + "learning_rate": 1.8933260271443313e-05, + "loss": 0.4162, + "step": 310 + }, + { + "epoch": 0.4688147729413982, + "grad_norm": 0.5747986536629459, + "learning_rate": 1.8921394771448032e-05, + "loss": 0.448, + "step": 311 + }, + { + "epoch": 0.4703222159412097, + "grad_norm": 0.605434367981348, + "learning_rate": 1.89094674055486e-05, + "loss": 0.4264, + "step": 312 + }, + { + "epoch": 0.4718296589410213, + "grad_norm": 0.6028982875539595, + "learning_rate": 1.889747825645599e-05, + "loss": 0.447, + "step": 313 + }, + { + "epoch": 0.47333710194083284, + "grad_norm": 0.6024460995063091, + "learning_rate": 1.8885427407309627e-05, + "loss": 0.4689, + "step": 314 + }, + { + "epoch": 0.47484454494064443, + "grad_norm": 0.6726949468749703, + "learning_rate": 1.887331494167678e-05, + "loss": 0.4562, + "step": 315 + }, + { + "epoch": 0.476351987940456, + "grad_norm": 0.6108367421924343, + "learning_rate": 1.8861140943552014e-05, + "loss": 0.4574, + "step": 316 + }, + { + "epoch": 0.47785943094026756, + "grad_norm": 0.6095993211515124, + "learning_rate": 1.884890549735659e-05, + "loss": 0.429, + "step": 317 + }, + { + "epoch": 0.47936687394007915, + "grad_norm": 0.5708366516060817, + "learning_rate": 1.8836608687937883e-05, + "loss": 0.4494, + "step": 318 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.6319148329863508, + "learning_rate": 1.8824250600568798e-05, + "loss": 0.4457, + "step": 319 + }, + { + "epoch": 0.4823817599397023, + "grad_norm": 0.5817901717334689, + "learning_rate": 1.8811831320947177e-05, + "loss": 0.4444, + "step": 320 + }, + { + "epoch": 0.4838892029395139, + "grad_norm": 0.6167253992638152, + "learning_rate": 1.879935093519519e-05, + "loss": 0.4758, + "step": 321 + }, + { + "epoch": 0.4853966459393254, + "grad_norm": 0.5918299912550289, + "learning_rate": 1.878680952985877e-05, + "loss": 0.4586, + "step": 322 + }, + { + "epoch": 0.486904088939137, + "grad_norm": 0.5897988696893806, + "learning_rate": 1.8774207191906976e-05, + "loss": 0.4548, + "step": 323 + }, + { + "epoch": 0.48841153193894854, + "grad_norm": 0.5336492924439385, + "learning_rate": 1.8761544008731426e-05, + "loss": 0.4477, + "step": 324 + }, + { + "epoch": 0.48991897493876013, + "grad_norm": 0.5969332291879268, + "learning_rate": 1.874882006814565e-05, + "loss": 0.4423, + "step": 325 + }, + { + "epoch": 0.4914264179385717, + "grad_norm": 0.5894559630672119, + "learning_rate": 1.8736035458384528e-05, + "loss": 0.4681, + "step": 326 + }, + { + "epoch": 0.49293386093838326, + "grad_norm": 0.583381204713255, + "learning_rate": 1.8723190268103634e-05, + "loss": 0.431, + "step": 327 + }, + { + "epoch": 0.49444130393819485, + "grad_norm": 0.5501857874739489, + "learning_rate": 1.8710284586378645e-05, + "loss": 0.4501, + "step": 328 + }, + { + "epoch": 0.4959487469380064, + "grad_norm": 0.5807568427837185, + "learning_rate": 1.8697318502704734e-05, + "loss": 0.446, + "step": 329 + }, + { + "epoch": 0.497456189937818, + "grad_norm": 0.5344952874232914, + "learning_rate": 1.8684292106995916e-05, + 
"loss": 0.464, + "step": 330 + }, + { + "epoch": 0.4989636329376295, + "grad_norm": 0.5875400091192824, + "learning_rate": 1.8671205489584453e-05, + "loss": 0.462, + "step": 331 + }, + { + "epoch": 0.5004710759374411, + "grad_norm": 0.5898142606962845, + "learning_rate": 1.865805874122021e-05, + "loss": 0.4495, + "step": 332 + }, + { + "epoch": 0.5019785189372526, + "grad_norm": 0.5383180946864506, + "learning_rate": 1.8644851953070045e-05, + "loss": 0.474, + "step": 333 + }, + { + "epoch": 0.5034859619370643, + "grad_norm": 0.5701159430118912, + "learning_rate": 1.863158521671716e-05, + "loss": 0.4644, + "step": 334 + }, + { + "epoch": 0.5049934049368758, + "grad_norm": 0.5456550772582448, + "learning_rate": 1.8618258624160465e-05, + "loss": 0.4426, + "step": 335 + }, + { + "epoch": 0.5065008479366874, + "grad_norm": 0.5806062450133762, + "learning_rate": 1.8604872267813954e-05, + "loss": 0.4428, + "step": 336 + }, + { + "epoch": 0.508008290936499, + "grad_norm": 0.5723184224994758, + "learning_rate": 1.859142624050605e-05, + "loss": 0.427, + "step": 337 + }, + { + "epoch": 0.5095157339363106, + "grad_norm": 0.5503430826330011, + "learning_rate": 1.8577920635478976e-05, + "loss": 0.4863, + "step": 338 + }, + { + "epoch": 0.5110231769361221, + "grad_norm": 0.5922429005891785, + "learning_rate": 1.8564355546388094e-05, + "loss": 0.472, + "step": 339 + }, + { + "epoch": 0.5125306199359336, + "grad_norm": 0.5243816217609505, + "learning_rate": 1.855073106730126e-05, + "loss": 0.4563, + "step": 340 + }, + { + "epoch": 0.5140380629357453, + "grad_norm": 0.571898057341335, + "learning_rate": 1.8537047292698175e-05, + "loss": 0.4686, + "step": 341 + }, + { + "epoch": 0.5155455059355568, + "grad_norm": 0.5389787797747003, + "learning_rate": 1.852330431746973e-05, + "loss": 0.4044, + "step": 342 + }, + { + "epoch": 0.5170529489353684, + "grad_norm": 0.5755069679771695, + "learning_rate": 1.8509502236917353e-05, + "loss": 0.4536, + "step": 343 + }, + { + "epoch": 0.51856039193518, + "grad_norm": 0.5386650306089089, + "learning_rate": 1.8495641146752322e-05, + "loss": 0.4285, + "step": 344 + }, + { + "epoch": 0.5200678349349915, + "grad_norm": 0.5775045065740545, + "learning_rate": 1.848172114309513e-05, + "loss": 0.4579, + "step": 345 + }, + { + "epoch": 0.5215752779348031, + "grad_norm": 0.6222104655446267, + "learning_rate": 1.8467742322474822e-05, + "loss": 0.4733, + "step": 346 + }, + { + "epoch": 0.5230827209346146, + "grad_norm": 0.5869893846228816, + "learning_rate": 1.845370478182829e-05, + "loss": 0.5073, + "step": 347 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.6007295355780623, + "learning_rate": 1.8439608618499637e-05, + "loss": 0.4859, + "step": 348 + }, + { + "epoch": 0.5260976069342378, + "grad_norm": 0.5715093886190423, + "learning_rate": 1.842545393023949e-05, + "loss": 0.436, + "step": 349 + }, + { + "epoch": 0.5276050499340493, + "grad_norm": 0.5370655215760771, + "learning_rate": 1.841124081520431e-05, + "loss": 0.4545, + "step": 350 + }, + { + "epoch": 0.529112492933861, + "grad_norm": 0.5468526752808022, + "learning_rate": 1.8396969371955724e-05, + "loss": 0.4412, + "step": 351 + }, + { + "epoch": 0.5306199359336725, + "grad_norm": 0.5386055180491347, + "learning_rate": 1.838263969945985e-05, + "loss": 0.455, + "step": 352 + }, + { + "epoch": 0.532127378933484, + "grad_norm": 0.5273830292324821, + "learning_rate": 1.836825189708659e-05, + "loss": 0.4208, + "step": 353 + }, + { + "epoch": 0.5336348219332957, + "grad_norm": 0.5324858057392972, + "learning_rate": 
1.8353806064608953e-05, + "loss": 0.4259, + "step": 354 + }, + { + "epoch": 0.5351422649331072, + "grad_norm": 0.5185086851614243, + "learning_rate": 1.833930230220236e-05, + "loss": 0.4506, + "step": 355 + }, + { + "epoch": 0.5366497079329188, + "grad_norm": 0.5553133756097826, + "learning_rate": 1.8324740710443955e-05, + "loss": 0.4629, + "step": 356 + }, + { + "epoch": 0.5381571509327303, + "grad_norm": 0.5742120676044152, + "learning_rate": 1.831012139031189e-05, + "loss": 0.4357, + "step": 357 + }, + { + "epoch": 0.539664593932542, + "grad_norm": 0.5605121444976939, + "learning_rate": 1.829544444318466e-05, + "loss": 0.4606, + "step": 358 + }, + { + "epoch": 0.5411720369323535, + "grad_norm": 0.6092704764024721, + "learning_rate": 1.8280709970840352e-05, + "loss": 0.4589, + "step": 359 + }, + { + "epoch": 0.542679479932165, + "grad_norm": 0.5515104498699946, + "learning_rate": 1.8265918075455985e-05, + "loss": 0.4554, + "step": 360 + }, + { + "epoch": 0.5441869229319767, + "grad_norm": 0.5517752011641777, + "learning_rate": 1.8251068859606777e-05, + "loss": 0.4446, + "step": 361 + }, + { + "epoch": 0.5456943659317882, + "grad_norm": 0.523313087940014, + "learning_rate": 1.823616242626542e-05, + "loss": 0.4453, + "step": 362 + }, + { + "epoch": 0.5472018089315998, + "grad_norm": 0.5555090795115328, + "learning_rate": 1.8221198878801415e-05, + "loss": 0.431, + "step": 363 + }, + { + "epoch": 0.5487092519314113, + "grad_norm": 0.5254077832278897, + "learning_rate": 1.8206178320980295e-05, + "loss": 0.4512, + "step": 364 + }, + { + "epoch": 0.5502166949312229, + "grad_norm": 0.5382752275452225, + "learning_rate": 1.819110085696295e-05, + "loss": 0.4489, + "step": 365 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.5752845306460045, + "learning_rate": 1.817596659130489e-05, + "loss": 0.4659, + "step": 366 + }, + { + "epoch": 0.553231580930846, + "grad_norm": 0.534082668899082, + "learning_rate": 1.816077562895551e-05, + "loss": 0.443, + "step": 367 + }, + { + "epoch": 0.5547390239306577, + "grad_norm": 0.4935673245960411, + "learning_rate": 1.814552807525738e-05, + "loss": 0.4265, + "step": 368 + }, + { + "epoch": 0.5562464669304692, + "grad_norm": 0.5587086828843211, + "learning_rate": 1.81302240359455e-05, + "loss": 0.4171, + "step": 369 + }, + { + "epoch": 0.5577539099302807, + "grad_norm": 0.5706799552715889, + "learning_rate": 1.8114863617146576e-05, + "loss": 0.4419, + "step": 370 + }, + { + "epoch": 0.5592613529300924, + "grad_norm": 0.5559814423377313, + "learning_rate": 1.8099446925378278e-05, + "loss": 0.4646, + "step": 371 + }, + { + "epoch": 0.5607687959299039, + "grad_norm": 0.6399807563842037, + "learning_rate": 1.8083974067548506e-05, + "loss": 0.4662, + "step": 372 + }, + { + "epoch": 0.5622762389297155, + "grad_norm": 0.5499667823126643, + "learning_rate": 1.806844515095465e-05, + "loss": 0.4705, + "step": 373 + }, + { + "epoch": 0.563783681929527, + "grad_norm": 0.5802308318791667, + "learning_rate": 1.8052860283282832e-05, + "loss": 0.4285, + "step": 374 + }, + { + "epoch": 0.5652911249293386, + "grad_norm": 0.616061675009139, + "learning_rate": 1.8037219572607177e-05, + "loss": 0.4661, + "step": 375 + }, + { + "epoch": 0.5667985679291502, + "grad_norm": 0.5381388831653736, + "learning_rate": 1.8021523127389066e-05, + "loss": 0.442, + "step": 376 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.5427863037336617, + "learning_rate": 1.800577105647635e-05, + "loss": 0.4737, + "step": 377 + }, + { + "epoch": 0.5698134539287734, + "grad_norm": 
0.647319829296571, + "learning_rate": 1.7989963469102643e-05, + "loss": 0.4597, + "step": 378 + }, + { + "epoch": 0.5713208969285849, + "grad_norm": 0.5361993689265471, + "learning_rate": 1.797410047488653e-05, + "loss": 0.4515, + "step": 379 + }, + { + "epoch": 0.5728283399283964, + "grad_norm": 0.5928443348297506, + "learning_rate": 1.7958182183830816e-05, + "loss": 0.4383, + "step": 380 + }, + { + "epoch": 0.574335782928208, + "grad_norm": 0.5525429424873411, + "learning_rate": 1.794220870632177e-05, + "loss": 0.4676, + "step": 381 + }, + { + "epoch": 0.5758432259280196, + "grad_norm": 0.6048913598018805, + "learning_rate": 1.7926180153128358e-05, + "loss": 0.4803, + "step": 382 + }, + { + "epoch": 0.5773506689278312, + "grad_norm": 0.6159208841600681, + "learning_rate": 1.791009663540146e-05, + "loss": 0.4446, + "step": 383 + }, + { + "epoch": 0.5788581119276427, + "grad_norm": 0.604058916697408, + "learning_rate": 1.789395826467312e-05, + "loss": 0.4406, + "step": 384 + }, + { + "epoch": 0.5803655549274543, + "grad_norm": 0.6189321454832999, + "learning_rate": 1.7877765152855757e-05, + "loss": 0.4757, + "step": 385 + }, + { + "epoch": 0.5818729979272659, + "grad_norm": 0.5252310621840579, + "learning_rate": 1.78615174122414e-05, + "loss": 0.4226, + "step": 386 + }, + { + "epoch": 0.5833804409270774, + "grad_norm": 0.6058698433864601, + "learning_rate": 1.78452151555009e-05, + "loss": 0.4242, + "step": 387 + }, + { + "epoch": 0.5848878839268891, + "grad_norm": 0.5784597918661724, + "learning_rate": 1.7828858495683162e-05, + "loss": 0.4546, + "step": 388 + }, + { + "epoch": 0.5863953269267006, + "grad_norm": 0.5778733445604559, + "learning_rate": 1.781244754621434e-05, + "loss": 0.4474, + "step": 389 + }, + { + "epoch": 0.5879027699265121, + "grad_norm": 0.5574362195371769, + "learning_rate": 1.779598242089707e-05, + "loss": 0.4461, + "step": 390 + }, + { + "epoch": 0.5894102129263237, + "grad_norm": 0.6035018906117913, + "learning_rate": 1.7779463233909677e-05, + "loss": 0.4647, + "step": 391 + }, + { + "epoch": 0.5909176559261353, + "grad_norm": 0.5783320653215531, + "learning_rate": 1.7762890099805362e-05, + "loss": 0.4509, + "step": 392 + }, + { + "epoch": 0.5924250989259469, + "grad_norm": 0.608063697903211, + "learning_rate": 1.774626313351145e-05, + "loss": 0.4496, + "step": 393 + }, + { + "epoch": 0.5939325419257584, + "grad_norm": 0.5637493289630973, + "learning_rate": 1.7729582450328547e-05, + "loss": 0.4548, + "step": 394 + }, + { + "epoch": 0.59543998492557, + "grad_norm": 0.5878505952019026, + "learning_rate": 1.771284816592978e-05, + "loss": 0.4025, + "step": 395 + }, + { + "epoch": 0.5969474279253816, + "grad_norm": 0.5732228081169485, + "learning_rate": 1.7696060396359956e-05, + "loss": 0.4155, + "step": 396 + }, + { + "epoch": 0.5984548709251931, + "grad_norm": 0.5275574748856542, + "learning_rate": 1.7679219258034798e-05, + "loss": 0.4668, + "step": 397 + }, + { + "epoch": 0.5999623139250048, + "grad_norm": 0.565193432089848, + "learning_rate": 1.7662324867740102e-05, + "loss": 0.464, + "step": 398 + }, + { + "epoch": 0.6014697569248163, + "grad_norm": 0.5276065053060457, + "learning_rate": 1.7645377342630956e-05, + "loss": 0.4641, + "step": 399 + }, + { + "epoch": 0.6029771999246278, + "grad_norm": 0.5504334109425478, + "learning_rate": 1.76283768002309e-05, + "loss": 0.4288, + "step": 400 + }, + { + "epoch": 0.6044846429244394, + "grad_norm": 0.6059296820868759, + "learning_rate": 1.7611323358431145e-05, + "loss": 0.4961, + "step": 401 + }, + { + "epoch": 
0.605992085924251, + "grad_norm": 0.5077017761738585, + "learning_rate": 1.759421713548971e-05, + "loss": 0.4706, + "step": 402 + }, + { + "epoch": 0.6074995289240626, + "grad_norm": 0.5590656170710925, + "learning_rate": 1.757705825003065e-05, + "loss": 0.4034, + "step": 403 + }, + { + "epoch": 0.6090069719238741, + "grad_norm": 0.525709220345065, + "learning_rate": 1.7559846821043205e-05, + "loss": 0.4379, + "step": 404 + }, + { + "epoch": 0.6105144149236857, + "grad_norm": 0.5538945207929713, + "learning_rate": 1.754258296788097e-05, + "loss": 0.445, + "step": 405 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 0.5517645766831191, + "learning_rate": 1.7525266810261096e-05, + "loss": 0.4469, + "step": 406 + }, + { + "epoch": 0.6135293009233088, + "grad_norm": 0.5594555749715797, + "learning_rate": 1.7507898468263422e-05, + "loss": 0.4343, + "step": 407 + }, + { + "epoch": 0.6150367439231204, + "grad_norm": 0.5530618540850076, + "learning_rate": 1.7490478062329686e-05, + "loss": 0.4625, + "step": 408 + }, + { + "epoch": 0.616544186922932, + "grad_norm": 0.5948076942836006, + "learning_rate": 1.7473005713262644e-05, + "loss": 0.4497, + "step": 409 + }, + { + "epoch": 0.6180516299227435, + "grad_norm": 0.5776155556563956, + "learning_rate": 1.7455481542225272e-05, + "loss": 0.3959, + "step": 410 + }, + { + "epoch": 0.6195590729225551, + "grad_norm": 0.5391682645939875, + "learning_rate": 1.7437905670739893e-05, + "loss": 0.4337, + "step": 411 + }, + { + "epoch": 0.6210665159223667, + "grad_norm": 0.5918312025262793, + "learning_rate": 1.7420278220687366e-05, + "loss": 0.4749, + "step": 412 + }, + { + "epoch": 0.6225739589221783, + "grad_norm": 0.5169533403943937, + "learning_rate": 1.7402599314306207e-05, + "loss": 0.4361, + "step": 413 + }, + { + "epoch": 0.6240814019219898, + "grad_norm": 0.5757476582664114, + "learning_rate": 1.7384869074191777e-05, + "loss": 0.4423, + "step": 414 + }, + { + "epoch": 0.6255888449218014, + "grad_norm": 0.5789420594237762, + "learning_rate": 1.7367087623295394e-05, + "loss": 0.4493, + "step": 415 + }, + { + "epoch": 0.627096287921613, + "grad_norm": 0.5146689624027024, + "learning_rate": 1.7349255084923517e-05, + "loss": 0.4128, + "step": 416 + }, + { + "epoch": 0.6286037309214245, + "grad_norm": 0.5556214483108315, + "learning_rate": 1.7331371582736864e-05, + "loss": 0.4097, + "step": 417 + }, + { + "epoch": 0.6301111739212361, + "grad_norm": 0.5781033815860408, + "learning_rate": 1.731343724074957e-05, + "loss": 0.4755, + "step": 418 + }, + { + "epoch": 0.6316186169210477, + "grad_norm": 0.505299705771376, + "learning_rate": 1.7295452183328317e-05, + "loss": 0.423, + "step": 419 + }, + { + "epoch": 0.6331260599208592, + "grad_norm": 0.6019529322565086, + "learning_rate": 1.7277416535191478e-05, + "loss": 0.4467, + "step": 420 + }, + { + "epoch": 0.6346335029206708, + "grad_norm": 0.5423258091864472, + "learning_rate": 1.7259330421408247e-05, + "loss": 0.4297, + "step": 421 + }, + { + "epoch": 0.6361409459204824, + "grad_norm": 0.550859799446333, + "learning_rate": 1.7241193967397784e-05, + "loss": 0.4334, + "step": 422 + }, + { + "epoch": 0.637648388920294, + "grad_norm": 0.5436505610454662, + "learning_rate": 1.7223007298928322e-05, + "loss": 0.4227, + "step": 423 + }, + { + "epoch": 0.6391558319201055, + "grad_norm": 0.5265015330498195, + "learning_rate": 1.7204770542116326e-05, + "loss": 0.4407, + "step": 424 + }, + { + "epoch": 0.640663274919917, + "grad_norm": 0.577557633955233, + "learning_rate": 1.7186483823425582e-05, + "loss": 0.4794, + 
"step": 425 + }, + { + "epoch": 0.6421707179197287, + "grad_norm": 0.5304780945155085, + "learning_rate": 1.7168147269666357e-05, + "loss": 0.4306, + "step": 426 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 0.5436263482054755, + "learning_rate": 1.714976100799449e-05, + "loss": 0.4505, + "step": 427 + }, + { + "epoch": 0.6451856039193518, + "grad_norm": 0.5239803005942689, + "learning_rate": 1.713132516591053e-05, + "loss": 0.4204, + "step": 428 + }, + { + "epoch": 0.6466930469191634, + "grad_norm": 0.5640485363783228, + "learning_rate": 1.7112839871258838e-05, + "loss": 0.4709, + "step": 429 + }, + { + "epoch": 0.6482004899189749, + "grad_norm": 0.5112413611963181, + "learning_rate": 1.7094305252226713e-05, + "loss": 0.4352, + "step": 430 + }, + { + "epoch": 0.6497079329187865, + "grad_norm": 0.5839208365283748, + "learning_rate": 1.7075721437343488e-05, + "loss": 0.467, + "step": 431 + }, + { + "epoch": 0.6512153759185981, + "grad_norm": 0.5264144807133015, + "learning_rate": 1.705708855547966e-05, + "loss": 0.4427, + "step": 432 + }, + { + "epoch": 0.6527228189184097, + "grad_norm": 0.503285177882026, + "learning_rate": 1.7038406735845967e-05, + "loss": 0.4206, + "step": 433 + }, + { + "epoch": 0.6542302619182212, + "grad_norm": 0.523921175908132, + "learning_rate": 1.7019676107992523e-05, + "loss": 0.4636, + "step": 434 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.5213012549969936, + "learning_rate": 1.70008968018079e-05, + "loss": 0.4385, + "step": 435 + }, + { + "epoch": 0.6572451479178444, + "grad_norm": 0.5616975925596913, + "learning_rate": 1.6982068947518235e-05, + "loss": 0.4495, + "step": 436 + }, + { + "epoch": 0.6587525909176559, + "grad_norm": 0.5094741288290618, + "learning_rate": 1.6963192675686312e-05, + "loss": 0.4354, + "step": 437 + }, + { + "epoch": 0.6602600339174675, + "grad_norm": 0.5691859599654164, + "learning_rate": 1.694426811721069e-05, + "loss": 0.4121, + "step": 438 + }, + { + "epoch": 0.6617674769172791, + "grad_norm": 0.565755177059836, + "learning_rate": 1.6925295403324758e-05, + "loss": 0.4291, + "step": 439 + }, + { + "epoch": 0.6632749199170906, + "grad_norm": 0.5182694692522232, + "learning_rate": 1.6906274665595854e-05, + "loss": 0.4187, + "step": 440 + }, + { + "epoch": 0.6647823629169022, + "grad_norm": 0.5442306033345655, + "learning_rate": 1.688720603592432e-05, + "loss": 0.4596, + "step": 441 + }, + { + "epoch": 0.6662898059167138, + "grad_norm": 0.508987211991653, + "learning_rate": 1.6868089646542632e-05, + "loss": 0.4218, + "step": 442 + }, + { + "epoch": 0.6677972489165254, + "grad_norm": 0.5409018441358341, + "learning_rate": 1.6848925630014445e-05, + "loss": 0.4422, + "step": 443 + }, + { + "epoch": 0.6693046919163369, + "grad_norm": 0.5332135170482968, + "learning_rate": 1.6829714119233688e-05, + "loss": 0.4742, + "step": 444 + }, + { + "epoch": 0.6708121349161484, + "grad_norm": 0.510365685539909, + "learning_rate": 1.6810455247423634e-05, + "loss": 0.4308, + "step": 445 + }, + { + "epoch": 0.6723195779159601, + "grad_norm": 0.5088383566851198, + "learning_rate": 1.6791149148136003e-05, + "loss": 0.4491, + "step": 446 + }, + { + "epoch": 0.6738270209157716, + "grad_norm": 0.5398522018308489, + "learning_rate": 1.677179595525e-05, + "loss": 0.465, + "step": 447 + }, + { + "epoch": 0.6753344639155832, + "grad_norm": 0.5312851766133058, + "learning_rate": 1.675239580297141e-05, + "loss": 0.4574, + "step": 448 + }, + { + "epoch": 0.6768419069153948, + "grad_norm": 0.5377924163432233, + "learning_rate": 
1.6732948825831657e-05, + "loss": 0.4282, + "step": 449 + }, + { + "epoch": 0.6783493499152063, + "grad_norm": 0.5411515105207517, + "learning_rate": 1.671345515868688e-05, + "loss": 0.437, + "step": 450 + }, + { + "epoch": 0.6798567929150179, + "grad_norm": 0.5061423487479686, + "learning_rate": 1.6693914936716983e-05, + "loss": 0.4244, + "step": 451 + }, + { + "epoch": 0.6813642359148294, + "grad_norm": 0.5390647508447596, + "learning_rate": 1.6674328295424723e-05, + "loss": 0.4395, + "step": 452 + }, + { + "epoch": 0.6828716789146411, + "grad_norm": 0.5706362763533134, + "learning_rate": 1.6654695370634738e-05, + "loss": 0.4421, + "step": 453 + }, + { + "epoch": 0.6843791219144526, + "grad_norm": 0.5330284685793139, + "learning_rate": 1.6635016298492628e-05, + "loss": 0.4303, + "step": 454 + }, + { + "epoch": 0.6858865649142641, + "grad_norm": 0.5267067326608682, + "learning_rate": 1.6615291215464005e-05, + "loss": 0.4245, + "step": 455 + }, + { + "epoch": 0.6873940079140758, + "grad_norm": 0.5726680200512305, + "learning_rate": 1.6595520258333545e-05, + "loss": 0.4752, + "step": 456 + }, + { + "epoch": 0.6889014509138873, + "grad_norm": 0.5183865668680759, + "learning_rate": 1.657570356420404e-05, + "loss": 0.4542, + "step": 457 + }, + { + "epoch": 0.6904088939136989, + "grad_norm": 0.553551099478117, + "learning_rate": 1.6555841270495456e-05, + "loss": 0.445, + "step": 458 + }, + { + "epoch": 0.6919163369135105, + "grad_norm": 0.5929224658029257, + "learning_rate": 1.6535933514943955e-05, + "loss": 0.4183, + "step": 459 + }, + { + "epoch": 0.693423779913322, + "grad_norm": 0.5010271872134405, + "learning_rate": 1.6515980435600965e-05, + "loss": 0.4169, + "step": 460 + }, + { + "epoch": 0.6949312229131336, + "grad_norm": 0.49068598527278895, + "learning_rate": 1.6495982170832224e-05, + "loss": 0.4122, + "step": 461 + }, + { + "epoch": 0.6964386659129451, + "grad_norm": 0.5288472547252633, + "learning_rate": 1.6475938859316795e-05, + "loss": 0.4154, + "step": 462 + }, + { + "epoch": 0.6979461089127568, + "grad_norm": 0.5364001246117184, + "learning_rate": 1.6455850640046134e-05, + "loss": 0.4247, + "step": 463 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.5248089160285507, + "learning_rate": 1.6435717652323097e-05, + "loss": 0.4522, + "step": 464 + }, + { + "epoch": 0.7009609949123798, + "grad_norm": 0.5871578611838155, + "learning_rate": 1.6415540035761008e-05, + "loss": 0.4477, + "step": 465 + }, + { + "epoch": 0.7024684379121915, + "grad_norm": 0.531098674787926, + "learning_rate": 1.639531793028265e-05, + "loss": 0.43, + "step": 466 + }, + { + "epoch": 0.703975880912003, + "grad_norm": 0.6050322359617515, + "learning_rate": 1.637505147611934e-05, + "loss": 0.4533, + "step": 467 + }, + { + "epoch": 0.7054833239118146, + "grad_norm": 0.5045703819799817, + "learning_rate": 1.6354740813809917e-05, + "loss": 0.4021, + "step": 468 + }, + { + "epoch": 0.7069907669116261, + "grad_norm": 0.5129545738188582, + "learning_rate": 1.6334386084199787e-05, + "loss": 0.4517, + "step": 469 + }, + { + "epoch": 0.7084982099114377, + "grad_norm": 0.5736577274561188, + "learning_rate": 1.631398742843995e-05, + "loss": 0.418, + "step": 470 + }, + { + "epoch": 0.7100056529112493, + "grad_norm": 0.5323460252829038, + "learning_rate": 1.629354498798601e-05, + "loss": 0.4251, + "step": 471 + }, + { + "epoch": 0.7115130959110608, + "grad_norm": 0.5747199097534378, + "learning_rate": 1.627305890459719e-05, + "loss": 0.4394, + "step": 472 + }, + { + "epoch": 0.7130205389108725, + "grad_norm": 
0.5646262513047455, + "learning_rate": 1.625252932033538e-05, + "loss": 0.4297, + "step": 473 + }, + { + "epoch": 0.714527981910684, + "grad_norm": 0.49304427786239235, + "learning_rate": 1.6231956377564095e-05, + "loss": 0.4224, + "step": 474 + }, + { + "epoch": 0.7160354249104955, + "grad_norm": 0.5791416730858486, + "learning_rate": 1.621134021894756e-05, + "loss": 0.4388, + "step": 475 + }, + { + "epoch": 0.7175428679103072, + "grad_norm": 0.5186150019034591, + "learning_rate": 1.619068098744965e-05, + "loss": 0.4422, + "step": 476 + }, + { + "epoch": 0.7190503109101187, + "grad_norm": 0.5839335428128258, + "learning_rate": 1.6169978826332955e-05, + "loss": 0.458, + "step": 477 + }, + { + "epoch": 0.7205577539099303, + "grad_norm": 0.5613046419371709, + "learning_rate": 1.6149233879157747e-05, + "loss": 0.4669, + "step": 478 + }, + { + "epoch": 0.7220651969097418, + "grad_norm": 0.5154157204007299, + "learning_rate": 1.6128446289781012e-05, + "loss": 0.4372, + "step": 479 + }, + { + "epoch": 0.7235726399095535, + "grad_norm": 0.5677977726488427, + "learning_rate": 1.610761620235543e-05, + "loss": 0.4731, + "step": 480 + }, + { + "epoch": 0.725080082909365, + "grad_norm": 0.5375971717165063, + "learning_rate": 1.60867437613284e-05, + "loss": 0.4566, + "step": 481 + }, + { + "epoch": 0.7265875259091765, + "grad_norm": 0.49724342603457516, + "learning_rate": 1.6065829111441e-05, + "loss": 0.4507, + "step": 482 + }, + { + "epoch": 0.7280949689089882, + "grad_norm": 0.5827089081742053, + "learning_rate": 1.6044872397727037e-05, + "loss": 0.4564, + "step": 483 + }, + { + "epoch": 0.7296024119087997, + "grad_norm": 0.5474489228753104, + "learning_rate": 1.6023873765511993e-05, + "loss": 0.4309, + "step": 484 + }, + { + "epoch": 0.7311098549086112, + "grad_norm": 0.5319969584661621, + "learning_rate": 1.6002833360412044e-05, + "loss": 0.4394, + "step": 485 + }, + { + "epoch": 0.7326172979084229, + "grad_norm": 0.5521662619957021, + "learning_rate": 1.5981751328333036e-05, + "loss": 0.4568, + "step": 486 + }, + { + "epoch": 0.7341247409082344, + "grad_norm": 0.4814653766664411, + "learning_rate": 1.5960627815469486e-05, + "loss": 0.4066, + "step": 487 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 0.5109256400558994, + "learning_rate": 1.5939462968303554e-05, + "loss": 0.4272, + "step": 488 + }, + { + "epoch": 0.7371396269078575, + "grad_norm": 0.5357957318401174, + "learning_rate": 1.5918256933604047e-05, + "loss": 0.4237, + "step": 489 + }, + { + "epoch": 0.7386470699076692, + "grad_norm": 0.5396229844011063, + "learning_rate": 1.589700985842538e-05, + "loss": 0.4205, + "step": 490 + }, + { + "epoch": 0.7401545129074807, + "grad_norm": 0.5056971418930007, + "learning_rate": 1.5875721890106574e-05, + "loss": 0.4558, + "step": 491 + }, + { + "epoch": 0.7416619559072922, + "grad_norm": 0.5466763607345122, + "learning_rate": 1.5854393176270205e-05, + "loss": 0.4262, + "step": 492 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.5318696480713733, + "learning_rate": 1.5833023864821427e-05, + "loss": 0.4222, + "step": 493 + }, + { + "epoch": 0.7446768419069154, + "grad_norm": 0.5577732122364522, + "learning_rate": 1.5811614103946905e-05, + "loss": 0.4643, + "step": 494 + }, + { + "epoch": 0.746184284906727, + "grad_norm": 0.5396811070945262, + "learning_rate": 1.5790164042113805e-05, + "loss": 0.4619, + "step": 495 + }, + { + "epoch": 0.7476917279065385, + "grad_norm": 0.5116348501037207, + "learning_rate": 1.576867382806877e-05, + "loss": 0.4257, + "step": 496 + }, + { + "epoch": 
0.7491991709063501, + "grad_norm": 0.5376269628887883, + "learning_rate": 1.5747143610836873e-05, + "loss": 0.4431, + "step": 497 + }, + { + "epoch": 0.7507066139061617, + "grad_norm": 0.5552456121649234, + "learning_rate": 1.5725573539720592e-05, + "loss": 0.4345, + "step": 498 + }, + { + "epoch": 0.7522140569059732, + "grad_norm": 0.5525594597252514, + "learning_rate": 1.570396376429877e-05, + "loss": 0.4288, + "step": 499 + }, + { + "epoch": 0.7537214999057849, + "grad_norm": 0.5130914024917077, + "learning_rate": 1.5682314434425593e-05, + "loss": 0.4506, + "step": 500 + }, + { + "epoch": 0.7552289429055964, + "grad_norm": 0.5438445066019086, + "learning_rate": 1.5660625700229526e-05, + "loss": 0.451, + "step": 501 + }, + { + "epoch": 0.7567363859054079, + "grad_norm": 0.5393532424898553, + "learning_rate": 1.5638897712112303e-05, + "loss": 0.4339, + "step": 502 + }, + { + "epoch": 0.7582438289052196, + "grad_norm": 0.5067131473915181, + "learning_rate": 1.561713062074785e-05, + "loss": 0.4452, + "step": 503 + }, + { + "epoch": 0.7597512719050311, + "grad_norm": 0.511705817056659, + "learning_rate": 1.5595324577081265e-05, + "loss": 0.4227, + "step": 504 + }, + { + "epoch": 0.7612587149048426, + "grad_norm": 0.5105016396695756, + "learning_rate": 1.5573479732327758e-05, + "loss": 0.4223, + "step": 505 + }, + { + "epoch": 0.7627661579046542, + "grad_norm": 0.508814615305124, + "learning_rate": 1.555159623797161e-05, + "loss": 0.4649, + "step": 506 + }, + { + "epoch": 0.7642736009044658, + "grad_norm": 0.5115538447430213, + "learning_rate": 1.552967424576512e-05, + "loss": 0.4257, + "step": 507 + }, + { + "epoch": 0.7657810439042774, + "grad_norm": 0.519809456344861, + "learning_rate": 1.5507713907727557e-05, + "loss": 0.4393, + "step": 508 + }, + { + "epoch": 0.7672884869040889, + "grad_norm": 0.5220982867467517, + "learning_rate": 1.5485715376144087e-05, + "loss": 0.4296, + "step": 509 + }, + { + "epoch": 0.7687959299039006, + "grad_norm": 0.4819994486336346, + "learning_rate": 1.5463678803564753e-05, + "loss": 0.4227, + "step": 510 + }, + { + "epoch": 0.7703033729037121, + "grad_norm": 0.5721785385849657, + "learning_rate": 1.5441604342803374e-05, + "loss": 0.4446, + "step": 511 + }, + { + "epoch": 0.7718108159035236, + "grad_norm": 0.5203314012229143, + "learning_rate": 1.5419492146936518e-05, + "loss": 0.4205, + "step": 512 + }, + { + "epoch": 0.7733182589033352, + "grad_norm": 0.5359755271436466, + "learning_rate": 1.5397342369302425e-05, + "loss": 0.4402, + "step": 513 + }, + { + "epoch": 0.7748257019031468, + "grad_norm": 0.5233100133672925, + "learning_rate": 1.5375155163499953e-05, + "loss": 0.4177, + "step": 514 + }, + { + "epoch": 0.7763331449029583, + "grad_norm": 0.5349268255121612, + "learning_rate": 1.5352930683387502e-05, + "loss": 0.4586, + "step": 515 + }, + { + "epoch": 0.7778405879027699, + "grad_norm": 0.5815705753331589, + "learning_rate": 1.5330669083081956e-05, + "loss": 0.4427, + "step": 516 + }, + { + "epoch": 0.7793480309025815, + "grad_norm": 0.49665657788094364, + "learning_rate": 1.5308370516957617e-05, + "loss": 0.4201, + "step": 517 + }, + { + "epoch": 0.7808554739023931, + "grad_norm": 0.5160010880115449, + "learning_rate": 1.528603513964511e-05, + "loss": 0.4261, + "step": 518 + }, + { + "epoch": 0.7823629169022046, + "grad_norm": 0.5468406227400142, + "learning_rate": 1.5263663106030347e-05, + "loss": 0.4116, + "step": 519 + }, + { + "epoch": 0.7838703599020163, + "grad_norm": 0.5236112386795565, + "learning_rate": 1.5241254571253433e-05, + "loss": 
0.4317, + "step": 520 + }, + { + "epoch": 0.7853778029018278, + "grad_norm": 0.5715363020786929, + "learning_rate": 1.5218809690707583e-05, + "loss": 0.4288, + "step": 521 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.5191719390295657, + "learning_rate": 1.5196328620038059e-05, + "loss": 0.4126, + "step": 522 + }, + { + "epoch": 0.7883926889014509, + "grad_norm": 0.5236405890133281, + "learning_rate": 1.5173811515141083e-05, + "loss": 0.4024, + "step": 523 + }, + { + "epoch": 0.7899001319012625, + "grad_norm": 0.5689433953014548, + "learning_rate": 1.5151258532162771e-05, + "loss": 0.4377, + "step": 524 + }, + { + "epoch": 0.791407574901074, + "grad_norm": 0.5352416985872532, + "learning_rate": 1.5128669827498024e-05, + "loss": 0.4354, + "step": 525 + }, + { + "epoch": 0.7929150179008856, + "grad_norm": 0.5274897373659767, + "learning_rate": 1.5106045557789453e-05, + "loss": 0.4391, + "step": 526 + }, + { + "epoch": 0.7944224609006972, + "grad_norm": 0.5240353462138522, + "learning_rate": 1.5083385879926309e-05, + "loss": 0.4461, + "step": 527 + }, + { + "epoch": 0.7959299039005088, + "grad_norm": 0.5040339622037668, + "learning_rate": 1.5060690951043385e-05, + "loss": 0.428, + "step": 528 + }, + { + "epoch": 0.7974373469003203, + "grad_norm": 0.6056664440579997, + "learning_rate": 1.5037960928519902e-05, + "loss": 0.4667, + "step": 529 + }, + { + "epoch": 0.7989447899001318, + "grad_norm": 0.5064874652403102, + "learning_rate": 1.501519596997847e-05, + "loss": 0.4174, + "step": 530 + }, + { + "epoch": 0.8004522328999435, + "grad_norm": 0.5178815992344113, + "learning_rate": 1.499239623328394e-05, + "loss": 0.4143, + "step": 531 + }, + { + "epoch": 0.801959675899755, + "grad_norm": 0.5433275328773004, + "learning_rate": 1.4969561876542348e-05, + "loss": 0.4308, + "step": 532 + }, + { + "epoch": 0.8034671188995666, + "grad_norm": 0.5067490187395532, + "learning_rate": 1.4946693058099802e-05, + "loss": 0.4383, + "step": 533 + }, + { + "epoch": 0.8049745618993782, + "grad_norm": 0.49712616081242367, + "learning_rate": 1.4923789936541378e-05, + "loss": 0.423, + "step": 534 + }, + { + "epoch": 0.8064820048991898, + "grad_norm": 0.5142222567824052, + "learning_rate": 1.4900852670690044e-05, + "loss": 0.4427, + "step": 535 + }, + { + "epoch": 0.8079894478990013, + "grad_norm": 0.5138167933634391, + "learning_rate": 1.487788141960553e-05, + "loss": 0.426, + "step": 536 + }, + { + "epoch": 0.8094968908988129, + "grad_norm": 0.49938679145962556, + "learning_rate": 1.4854876342583246e-05, + "loss": 0.4116, + "step": 537 + }, + { + "epoch": 0.8110043338986245, + "grad_norm": 0.5630302514996013, + "learning_rate": 1.4831837599153165e-05, + "loss": 0.4569, + "step": 538 + }, + { + "epoch": 0.812511776898436, + "grad_norm": 0.5068845911186761, + "learning_rate": 1.4808765349078729e-05, + "loss": 0.4174, + "step": 539 + }, + { + "epoch": 0.8140192198982475, + "grad_norm": 0.5402742918446363, + "learning_rate": 1.4785659752355724e-05, + "loss": 0.4046, + "step": 540 + }, + { + "epoch": 0.8155266628980592, + "grad_norm": 0.5486844481668101, + "learning_rate": 1.4762520969211186e-05, + "loss": 0.4225, + "step": 541 + }, + { + "epoch": 0.8170341058978707, + "grad_norm": 0.5290035366810187, + "learning_rate": 1.4739349160102285e-05, + "loss": 0.4378, + "step": 542 + }, + { + "epoch": 0.8185415488976823, + "grad_norm": 0.5374079241254692, + "learning_rate": 1.4716144485715209e-05, + "loss": 0.4299, + "step": 543 + }, + { + "epoch": 0.8200489918974939, + "grad_norm": 0.4778906030205072, + 
"learning_rate": 1.4692907106964051e-05, + "loss": 0.3992, + "step": 544 + }, + { + "epoch": 0.8215564348973055, + "grad_norm": 0.49060078784195343, + "learning_rate": 1.4669637184989696e-05, + "loss": 0.4243, + "step": 545 + }, + { + "epoch": 0.823063877897117, + "grad_norm": 0.5253862030306666, + "learning_rate": 1.4646334881158704e-05, + "loss": 0.4236, + "step": 546 + }, + { + "epoch": 0.8245713208969286, + "grad_norm": 0.5215051723939326, + "learning_rate": 1.4623000357062184e-05, + "loss": 0.4274, + "step": 547 + }, + { + "epoch": 0.8260787638967402, + "grad_norm": 0.5071119070406966, + "learning_rate": 1.459963377451468e-05, + "loss": 0.4081, + "step": 548 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.5180772114309931, + "learning_rate": 1.457623529555305e-05, + "loss": 0.4228, + "step": 549 + }, + { + "epoch": 0.8290936498963632, + "grad_norm": 0.5198434876057629, + "learning_rate": 1.4552805082435333e-05, + "loss": 0.4328, + "step": 550 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 0.53696356685593, + "learning_rate": 1.4529343297639638e-05, + "loss": 0.4311, + "step": 551 + }, + { + "epoch": 0.8321085358959864, + "grad_norm": 0.5522072703618133, + "learning_rate": 1.4505850103863007e-05, + "loss": 0.4441, + "step": 552 + }, + { + "epoch": 0.833615978895798, + "grad_norm": 0.5022303098504759, + "learning_rate": 1.448232566402028e-05, + "loss": 0.4233, + "step": 553 + }, + { + "epoch": 0.8351234218956096, + "grad_norm": 0.5522095422296431, + "learning_rate": 1.4458770141242992e-05, + "loss": 0.4333, + "step": 554 + }, + { + "epoch": 0.8366308648954212, + "grad_norm": 0.5232096502230357, + "learning_rate": 1.4435183698878212e-05, + "loss": 0.4286, + "step": 555 + }, + { + "epoch": 0.8381383078952327, + "grad_norm": 0.46986995612699417, + "learning_rate": 1.4411566500487425e-05, + "loss": 0.4049, + "step": 556 + }, + { + "epoch": 0.8396457508950442, + "grad_norm": 0.5077507527784849, + "learning_rate": 1.4387918709845395e-05, + "loss": 0.4144, + "step": 557 + }, + { + "epoch": 0.8411531938948559, + "grad_norm": 0.5253570052023816, + "learning_rate": 1.4364240490939032e-05, + "loss": 0.4547, + "step": 558 + }, + { + "epoch": 0.8426606368946674, + "grad_norm": 0.49921819408434215, + "learning_rate": 1.4340532007966252e-05, + "loss": 0.3949, + "step": 559 + }, + { + "epoch": 0.844168079894479, + "grad_norm": 0.5411234788441551, + "learning_rate": 1.4316793425334836e-05, + "loss": 0.4445, + "step": 560 + }, + { + "epoch": 0.8456755228942906, + "grad_norm": 0.5264546536830835, + "learning_rate": 1.4293024907661295e-05, + "loss": 0.4117, + "step": 561 + }, + { + "epoch": 0.8471829658941021, + "grad_norm": 0.518655972625287, + "learning_rate": 1.4269226619769727e-05, + "loss": 0.4159, + "step": 562 + }, + { + "epoch": 0.8486904088939137, + "grad_norm": 0.537382287002897, + "learning_rate": 1.424539872669067e-05, + "loss": 0.4395, + "step": 563 + }, + { + "epoch": 0.8501978518937253, + "grad_norm": 0.4871628601960703, + "learning_rate": 1.4221541393659966e-05, + "loss": 0.4244, + "step": 564 + }, + { + "epoch": 0.8517052948935369, + "grad_norm": 0.5323818502275258, + "learning_rate": 1.4197654786117604e-05, + "loss": 0.442, + "step": 565 + }, + { + "epoch": 0.8532127378933484, + "grad_norm": 0.49211277864065, + "learning_rate": 1.4173739069706586e-05, + "loss": 0.4333, + "step": 566 + }, + { + "epoch": 0.8547201808931599, + "grad_norm": 0.5016763716077036, + "learning_rate": 1.414979441027176e-05, + "loss": 0.4223, + "step": 567 + }, + { + "epoch": 0.8562276238929716, + 
"grad_norm": 0.5072197589397037, + "learning_rate": 1.4125820973858693e-05, + "loss": 0.4166, + "step": 568 + }, + { + "epoch": 0.8577350668927831, + "grad_norm": 0.5379841247223495, + "learning_rate": 1.41018189267125e-05, + "loss": 0.4457, + "step": 569 + }, + { + "epoch": 0.8592425098925947, + "grad_norm": 0.5156171430561991, + "learning_rate": 1.4077788435276701e-05, + "loss": 0.4154, + "step": 570 + }, + { + "epoch": 0.8607499528924063, + "grad_norm": 0.5377878469372074, + "learning_rate": 1.4053729666192067e-05, + "loss": 0.4437, + "step": 571 + }, + { + "epoch": 0.8622573958922178, + "grad_norm": 0.5606843337820052, + "learning_rate": 1.4029642786295452e-05, + "loss": 0.4479, + "step": 572 + }, + { + "epoch": 0.8637648388920294, + "grad_norm": 0.4989731388746451, + "learning_rate": 1.400552796261866e-05, + "loss": 0.407, + "step": 573 + }, + { + "epoch": 0.8652722818918409, + "grad_norm": 0.5136932503470173, + "learning_rate": 1.3981385362387268e-05, + "loss": 0.4211, + "step": 574 + }, + { + "epoch": 0.8667797248916526, + "grad_norm": 0.495625389098895, + "learning_rate": 1.3957215153019463e-05, + "loss": 0.4203, + "step": 575 + }, + { + "epoch": 0.8682871678914641, + "grad_norm": 0.49590492700182753, + "learning_rate": 1.3933017502124897e-05, + "loss": 0.4123, + "step": 576 + }, + { + "epoch": 0.8697946108912756, + "grad_norm": 0.5389299185456149, + "learning_rate": 1.3908792577503514e-05, + "loss": 0.4309, + "step": 577 + }, + { + "epoch": 0.8713020538910873, + "grad_norm": 0.5014871721652727, + "learning_rate": 1.3884540547144393e-05, + "loss": 0.4159, + "step": 578 + }, + { + "epoch": 0.8728094968908988, + "grad_norm": 0.49719473763201644, + "learning_rate": 1.3860261579224574e-05, + "loss": 0.4191, + "step": 579 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.5102002869995407, + "learning_rate": 1.3835955842107897e-05, + "loss": 0.418, + "step": 580 + }, + { + "epoch": 0.875824382890522, + "grad_norm": 0.497268362475834, + "learning_rate": 1.3811623504343845e-05, + "loss": 0.4092, + "step": 581 + }, + { + "epoch": 0.8773318258903335, + "grad_norm": 0.49466892349875324, + "learning_rate": 1.378726473466635e-05, + "loss": 0.4154, + "step": 582 + }, + { + "epoch": 0.8788392688901451, + "grad_norm": 0.5485556900908343, + "learning_rate": 1.3762879701992642e-05, + "loss": 0.4327, + "step": 583 + }, + { + "epoch": 0.8803467118899566, + "grad_norm": 0.49193915962474927, + "learning_rate": 1.373846857542208e-05, + "loss": 0.4233, + "step": 584 + }, + { + "epoch": 0.8818541548897683, + "grad_norm": 0.49157440442050665, + "learning_rate": 1.3714031524234965e-05, + "loss": 0.4255, + "step": 585 + }, + { + "epoch": 0.8833615978895798, + "grad_norm": 0.5153566919676954, + "learning_rate": 1.3689568717891381e-05, + "loss": 0.4433, + "step": 586 + }, + { + "epoch": 0.8848690408893913, + "grad_norm": 0.5151771531878016, + "learning_rate": 1.3665080326029997e-05, + "loss": 0.4313, + "step": 587 + }, + { + "epoch": 0.886376483889203, + "grad_norm": 0.5172038128981158, + "learning_rate": 1.364056651846693e-05, + "loss": 0.4025, + "step": 588 + }, + { + "epoch": 0.8878839268890145, + "grad_norm": 0.5197034910270297, + "learning_rate": 1.3616027465194525e-05, + "loss": 0.432, + "step": 589 + }, + { + "epoch": 0.889391369888826, + "grad_norm": 0.5280686849313844, + "learning_rate": 1.35914633363802e-05, + "loss": 0.4093, + "step": 590 + }, + { + "epoch": 0.8908988128886377, + "grad_norm": 0.5192774851448931, + "learning_rate": 1.356687430236526e-05, + "loss": 0.426, + "step": 591 + }, + { 
+ "epoch": 0.8924062558884492, + "grad_norm": 0.5407059497728999, + "learning_rate": 1.3542260533663723e-05, + "loss": 0.4408, + "step": 592 + }, + { + "epoch": 0.8939136988882608, + "grad_norm": 0.5029787366533781, + "learning_rate": 1.351762220096112e-05, + "loss": 0.4134, + "step": 593 + }, + { + "epoch": 0.8954211418880723, + "grad_norm": 0.5557133502339159, + "learning_rate": 1.3492959475113332e-05, + "loss": 0.4247, + "step": 594 + }, + { + "epoch": 0.896928584887884, + "grad_norm": 0.5446161829977666, + "learning_rate": 1.3468272527145388e-05, + "loss": 0.4133, + "step": 595 + }, + { + "epoch": 0.8984360278876955, + "grad_norm": 0.5055328441209378, + "learning_rate": 1.3443561528250295e-05, + "loss": 0.3916, + "step": 596 + }, + { + "epoch": 0.899943470887507, + "grad_norm": 0.5874519416857665, + "learning_rate": 1.3418826649787834e-05, + "loss": 0.4339, + "step": 597 + }, + { + "epoch": 0.9014509138873187, + "grad_norm": 0.5577170031704589, + "learning_rate": 1.3394068063283387e-05, + "loss": 0.458, + "step": 598 + }, + { + "epoch": 0.9029583568871302, + "grad_norm": 0.5332814444729285, + "learning_rate": 1.3369285940426737e-05, + "loss": 0.4206, + "step": 599 + }, + { + "epoch": 0.9044657998869418, + "grad_norm": 0.5654643143753597, + "learning_rate": 1.334448045307088e-05, + "loss": 0.4113, + "step": 600 + }, + { + "epoch": 0.9059732428867533, + "grad_norm": 0.4979334800098818, + "learning_rate": 1.331965177323084e-05, + "loss": 0.4093, + "step": 601 + }, + { + "epoch": 0.9074806858865649, + "grad_norm": 0.5415874467915235, + "learning_rate": 1.3294800073082464e-05, + "loss": 0.4366, + "step": 602 + }, + { + "epoch": 0.9089881288863765, + "grad_norm": 0.5813207766062746, + "learning_rate": 1.3269925524961237e-05, + "loss": 0.4448, + "step": 603 + }, + { + "epoch": 0.910495571886188, + "grad_norm": 0.5078359282634053, + "learning_rate": 1.3245028301361086e-05, + "loss": 0.4161, + "step": 604 + }, + { + "epoch": 0.9120030148859997, + "grad_norm": 0.5539022471684321, + "learning_rate": 1.3220108574933185e-05, + "loss": 0.4056, + "step": 605 + }, + { + "epoch": 0.9135104578858112, + "grad_norm": 0.48460567118259956, + "learning_rate": 1.3195166518484748e-05, + "loss": 0.4009, + "step": 606 + }, + { + "epoch": 0.9150179008856227, + "grad_norm": 0.4843343744091719, + "learning_rate": 1.317020230497784e-05, + "loss": 0.4231, + "step": 607 + }, + { + "epoch": 0.9165253438854344, + "grad_norm": 0.5190197613843625, + "learning_rate": 1.3145216107528178e-05, + "loss": 0.4029, + "step": 608 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.4867573763184133, + "learning_rate": 1.3120208099403926e-05, + "loss": 0.3801, + "step": 609 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.48900894299598635, + "learning_rate": 1.3095178454024496e-05, + "loss": 0.4413, + "step": 610 + }, + { + "epoch": 0.921047672884869, + "grad_norm": 0.5555266761898254, + "learning_rate": 1.3070127344959348e-05, + "loss": 0.4144, + "step": 611 + }, + { + "epoch": 0.9225551158846806, + "grad_norm": 0.5128649153965221, + "learning_rate": 1.3045054945926775e-05, + "loss": 0.4616, + "step": 612 + }, + { + "epoch": 0.9240625588844922, + "grad_norm": 0.5194503259126656, + "learning_rate": 1.3019961430792711e-05, + "loss": 0.4229, + "step": 613 + }, + { + "epoch": 0.9255700018843037, + "grad_norm": 0.49285532678009114, + "learning_rate": 1.2994846973569524e-05, + "loss": 0.4165, + "step": 614 + }, + { + "epoch": 0.9270774448841154, + "grad_norm": 0.5197963588456296, + "learning_rate": 1.2969711748414804e-05, + 
"loss": 0.3947, + "step": 615 + }, + { + "epoch": 0.9285848878839269, + "grad_norm": 0.542725727252665, + "learning_rate": 1.2944555929630152e-05, + "loss": 0.4261, + "step": 616 + }, + { + "epoch": 0.9300923308837384, + "grad_norm": 0.5068570325444082, + "learning_rate": 1.2919379691659979e-05, + "loss": 0.453, + "step": 617 + }, + { + "epoch": 0.93159977388355, + "grad_norm": 0.5138431602453551, + "learning_rate": 1.2894183209090304e-05, + "loss": 0.4482, + "step": 618 + }, + { + "epoch": 0.9331072168833616, + "grad_norm": 0.5098264236378465, + "learning_rate": 1.2868966656647522e-05, + "loss": 0.4344, + "step": 619 + }, + { + "epoch": 0.9346146598831732, + "grad_norm": 0.4932368518544031, + "learning_rate": 1.2843730209197203e-05, + "loss": 0.4444, + "step": 620 + }, + { + "epoch": 0.9361221028829847, + "grad_norm": 0.48787838834596486, + "learning_rate": 1.2818474041742885e-05, + "loss": 0.3909, + "step": 621 + }, + { + "epoch": 0.9376295458827963, + "grad_norm": 0.5042148044417084, + "learning_rate": 1.2793198329424858e-05, + "loss": 0.4114, + "step": 622 + }, + { + "epoch": 0.9391369888826079, + "grad_norm": 0.5164275014163481, + "learning_rate": 1.2767903247518945e-05, + "loss": 0.4042, + "step": 623 + }, + { + "epoch": 0.9406444318824194, + "grad_norm": 0.4878553181808082, + "learning_rate": 1.2742588971435276e-05, + "loss": 0.4108, + "step": 624 + }, + { + "epoch": 0.9421518748822311, + "grad_norm": 0.4953872026297146, + "learning_rate": 1.2717255676717106e-05, + "loss": 0.4227, + "step": 625 + }, + { + "epoch": 0.9436593178820426, + "grad_norm": 0.5623597137703112, + "learning_rate": 1.2691903539039563e-05, + "loss": 0.4436, + "step": 626 + }, + { + "epoch": 0.9451667608818541, + "grad_norm": 0.539298059881258, + "learning_rate": 1.2666532734208437e-05, + "loss": 0.4384, + "step": 627 + }, + { + "epoch": 0.9466742038816657, + "grad_norm": 0.5443120200340641, + "learning_rate": 1.264114343815898e-05, + "loss": 0.4413, + "step": 628 + }, + { + "epoch": 0.9481816468814773, + "grad_norm": 0.5142650264217846, + "learning_rate": 1.2615735826954664e-05, + "loss": 0.4231, + "step": 629 + }, + { + "epoch": 0.9496890898812889, + "grad_norm": 0.5566560995617864, + "learning_rate": 1.2590310076785974e-05, + "loss": 0.4458, + "step": 630 + }, + { + "epoch": 0.9511965328811004, + "grad_norm": 0.484643722468428, + "learning_rate": 1.256486636396917e-05, + "loss": 0.3868, + "step": 631 + }, + { + "epoch": 0.952703975880912, + "grad_norm": 0.5278211197592041, + "learning_rate": 1.2539404864945087e-05, + "loss": 0.3956, + "step": 632 + }, + { + "epoch": 0.9542114188807236, + "grad_norm": 0.5339784329738423, + "learning_rate": 1.2513925756277894e-05, + "loss": 0.4065, + "step": 633 + }, + { + "epoch": 0.9557188618805351, + "grad_norm": 0.4808436521240299, + "learning_rate": 1.2488429214653871e-05, + "loss": 0.3733, + "step": 634 + }, + { + "epoch": 0.9572263048803467, + "grad_norm": 0.5245674565988473, + "learning_rate": 1.24629154168802e-05, + "loss": 0.4206, + "step": 635 + }, + { + "epoch": 0.9587337478801583, + "grad_norm": 0.5091922264135481, + "learning_rate": 1.2437384539883715e-05, + "loss": 0.4321, + "step": 636 + }, + { + "epoch": 0.9602411908799698, + "grad_norm": 0.48729820029525145, + "learning_rate": 1.2411836760709686e-05, + "loss": 0.3961, + "step": 637 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.5224677796102979, + "learning_rate": 1.2386272256520606e-05, + "loss": 0.4094, + "step": 638 + }, + { + "epoch": 0.963256076879593, + "grad_norm": 0.5291193644566966, + 
"learning_rate": 1.2360691204594937e-05, + "loss": 0.4202, + "step": 639 + }, + { + "epoch": 0.9647635198794046, + "grad_norm": 0.5090746283917961, + "learning_rate": 1.2335093782325889e-05, + "loss": 0.4115, + "step": 640 + }, + { + "epoch": 0.9662709628792161, + "grad_norm": 0.49844277614657384, + "learning_rate": 1.2309480167220203e-05, + "loss": 0.4138, + "step": 641 + }, + { + "epoch": 0.9677784058790277, + "grad_norm": 0.5085446797250271, + "learning_rate": 1.2283850536896907e-05, + "loss": 0.4403, + "step": 642 + }, + { + "epoch": 0.9692858488788393, + "grad_norm": 0.48811956114780947, + "learning_rate": 1.2258205069086082e-05, + "loss": 0.4132, + "step": 643 + }, + { + "epoch": 0.9707932918786508, + "grad_norm": 0.5804699645229868, + "learning_rate": 1.2232543941627641e-05, + "loss": 0.4145, + "step": 644 + }, + { + "epoch": 0.9723007348784624, + "grad_norm": 0.5223286630706884, + "learning_rate": 1.2206867332470091e-05, + "loss": 0.4451, + "step": 645 + }, + { + "epoch": 0.973808177878274, + "grad_norm": 0.5431240213202171, + "learning_rate": 1.2181175419669293e-05, + "loss": 0.4106, + "step": 646 + }, + { + "epoch": 0.9753156208780855, + "grad_norm": 0.4788749668502741, + "learning_rate": 1.215546838138723e-05, + "loss": 0.3947, + "step": 647 + }, + { + "epoch": 0.9768230638778971, + "grad_norm": 0.4823666614879542, + "learning_rate": 1.212974639589078e-05, + "loss": 0.3805, + "step": 648 + }, + { + "epoch": 0.9783305068777087, + "grad_norm": 0.5272835049687891, + "learning_rate": 1.2104009641550472e-05, + "loss": 0.4192, + "step": 649 + }, + { + "epoch": 0.9798379498775203, + "grad_norm": 0.4899435333806439, + "learning_rate": 1.2078258296839245e-05, + "loss": 0.4242, + "step": 650 + }, + { + "epoch": 0.9813453928773318, + "grad_norm": 0.48267520902055755, + "learning_rate": 1.2052492540331218e-05, + "loss": 0.3819, + "step": 651 + }, + { + "epoch": 0.9828528358771434, + "grad_norm": 0.5208472855722491, + "learning_rate": 1.2026712550700457e-05, + "loss": 0.4268, + "step": 652 + }, + { + "epoch": 0.984360278876955, + "grad_norm": 0.5182048450359825, + "learning_rate": 1.200091850671972e-05, + "loss": 0.3833, + "step": 653 + }, + { + "epoch": 0.9858677218767665, + "grad_norm": 0.5524884939555313, + "learning_rate": 1.1975110587259222e-05, + "loss": 0.4099, + "step": 654 + }, + { + "epoch": 0.9873751648765781, + "grad_norm": 0.5724743146915252, + "learning_rate": 1.1949288971285411e-05, + "loss": 0.4451, + "step": 655 + }, + { + "epoch": 0.9888826078763897, + "grad_norm": 0.5560489536019798, + "learning_rate": 1.1923453837859706e-05, + "loss": 0.4245, + "step": 656 + }, + { + "epoch": 0.9903900508762012, + "grad_norm": 0.5241693566205756, + "learning_rate": 1.1897605366137264e-05, + "loss": 0.426, + "step": 657 + }, + { + "epoch": 0.9918974938760128, + "grad_norm": 0.5078011601273249, + "learning_rate": 1.1871743735365735e-05, + "loss": 0.4147, + "step": 658 + }, + { + "epoch": 0.9934049368758244, + "grad_norm": 0.5014207467428378, + "learning_rate": 1.1845869124884027e-05, + "loss": 0.4029, + "step": 659 + }, + { + "epoch": 0.994912379875636, + "grad_norm": 0.5184450473918536, + "learning_rate": 1.1819981714121054e-05, + "loss": 0.4338, + "step": 660 + }, + { + "epoch": 0.9964198228754475, + "grad_norm": 0.5218529509897015, + "learning_rate": 1.1794081682594491e-05, + "loss": 0.4001, + "step": 661 + }, + { + "epoch": 0.997927265875259, + "grad_norm": 0.5277285874094648, + "learning_rate": 1.176816920990954e-05, + "loss": 0.4225, + "step": 662 + }, + { + "epoch": 
0.9994347088750707, + "grad_norm": 0.506018413554039, + "learning_rate": 1.174224447575767e-05, + "loss": 0.4398, + "step": 663 + }, + { + "epoch": 1.0009421518748822, + "grad_norm": 0.6655724719416495, + "learning_rate": 1.171630765991538e-05, + "loss": 0.377, + "step": 664 + }, + { + "epoch": 1.0024495948746939, + "grad_norm": 0.5752880840432146, + "learning_rate": 1.169035894224295e-05, + "loss": 0.325, + "step": 665 + }, + { + "epoch": 1.0039570378745053, + "grad_norm": 0.547046172496627, + "learning_rate": 1.1664398502683194e-05, + "loss": 0.3422, + "step": 666 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.6183956576461548, + "learning_rate": 1.1638426521260211e-05, + "loss": 0.3551, + "step": 667 + }, + { + "epoch": 1.0069719238741286, + "grad_norm": 0.6272202909322583, + "learning_rate": 1.1612443178078138e-05, + "loss": 0.3293, + "step": 668 + }, + { + "epoch": 1.00847936687394, + "grad_norm": 0.6733584649632783, + "learning_rate": 1.1586448653319908e-05, + "loss": 0.3495, + "step": 669 + }, + { + "epoch": 1.0099868098737517, + "grad_norm": 0.5974677434978642, + "learning_rate": 1.156044312724598e-05, + "loss": 0.3339, + "step": 670 + }, + { + "epoch": 1.0114942528735633, + "grad_norm": 0.5526640261136243, + "learning_rate": 1.153442678019311e-05, + "loss": 0.3629, + "step": 671 + }, + { + "epoch": 1.0130016958733747, + "grad_norm": 0.5666634240071511, + "learning_rate": 1.1508399792573095e-05, + "loss": 0.3361, + "step": 672 + }, + { + "epoch": 1.0145091388731864, + "grad_norm": 0.6224882966351409, + "learning_rate": 1.1482362344871514e-05, + "loss": 0.3545, + "step": 673 + }, + { + "epoch": 1.0160165818729978, + "grad_norm": 0.5615749930186623, + "learning_rate": 1.1456314617646482e-05, + "loss": 0.3208, + "step": 674 + }, + { + "epoch": 1.0175240248728095, + "grad_norm": 0.548490348465347, + "learning_rate": 1.1430256791527406e-05, + "loss": 0.3278, + "step": 675 + }, + { + "epoch": 1.019031467872621, + "grad_norm": 0.6137191595237155, + "learning_rate": 1.1404189047213716e-05, + "loss": 0.3684, + "step": 676 + }, + { + "epoch": 1.0205389108724325, + "grad_norm": 0.6128432091688398, + "learning_rate": 1.137811156547362e-05, + "loss": 0.3479, + "step": 677 + }, + { + "epoch": 1.0220463538722442, + "grad_norm": 0.5530398492501923, + "learning_rate": 1.1352024527142855e-05, + "loss": 0.3258, + "step": 678 + }, + { + "epoch": 1.0235537968720558, + "grad_norm": 0.5691801541559598, + "learning_rate": 1.1325928113123431e-05, + "loss": 0.3359, + "step": 679 + }, + { + "epoch": 1.0250612398718673, + "grad_norm": 0.5996898750429057, + "learning_rate": 1.129982250438237e-05, + "loss": 0.34, + "step": 680 + }, + { + "epoch": 1.026568682871679, + "grad_norm": 0.5203916917045198, + "learning_rate": 1.1273707881950445e-05, + "loss": 0.3194, + "step": 681 + }, + { + "epoch": 1.0280761258714906, + "grad_norm": 0.5706678991613441, + "learning_rate": 1.1247584426920962e-05, + "loss": 0.3394, + "step": 682 + }, + { + "epoch": 1.029583568871302, + "grad_norm": 0.558797423405198, + "learning_rate": 1.1221452320448449e-05, + "loss": 0.3476, + "step": 683 + }, + { + "epoch": 1.0310910118711136, + "grad_norm": 0.5491796357132722, + "learning_rate": 1.1195311743747445e-05, + "loss": 0.3287, + "step": 684 + }, + { + "epoch": 1.0325984548709253, + "grad_norm": 0.5423270097914835, + "learning_rate": 1.116916287809122e-05, + "loss": 0.3315, + "step": 685 + }, + { + "epoch": 1.0341058978707367, + "grad_norm": 0.5440784988767636, + "learning_rate": 1.1143005904810527e-05, + "loss": 0.3409, + "step": 
686 + }, + { + "epoch": 1.0356133408705483, + "grad_norm": 0.5506460404964368, + "learning_rate": 1.1116841005292339e-05, + "loss": 0.3665, + "step": 687 + }, + { + "epoch": 1.03712078387036, + "grad_norm": 0.5271450898091751, + "learning_rate": 1.1090668360978589e-05, + "loss": 0.3354, + "step": 688 + }, + { + "epoch": 1.0386282268701714, + "grad_norm": 0.5116723363561022, + "learning_rate": 1.106448815336493e-05, + "loss": 0.3055, + "step": 689 + }, + { + "epoch": 1.040135669869983, + "grad_norm": 0.5261827472069973, + "learning_rate": 1.1038300563999455e-05, + "loss": 0.3141, + "step": 690 + }, + { + "epoch": 1.0416431128697947, + "grad_norm": 0.5675715863653521, + "learning_rate": 1.1012105774481446e-05, + "loss": 0.3576, + "step": 691 + }, + { + "epoch": 1.0431505558696061, + "grad_norm": 0.542765155631167, + "learning_rate": 1.0985903966460115e-05, + "loss": 0.337, + "step": 692 + }, + { + "epoch": 1.0446579988694178, + "grad_norm": 0.576467518182856, + "learning_rate": 1.0959695321633346e-05, + "loss": 0.3345, + "step": 693 + }, + { + "epoch": 1.0461654418692292, + "grad_norm": 0.5261227763098979, + "learning_rate": 1.0933480021746432e-05, + "loss": 0.3137, + "step": 694 + }, + { + "epoch": 1.0476728848690409, + "grad_norm": 0.5529375328569147, + "learning_rate": 1.0907258248590816e-05, + "loss": 0.332, + "step": 695 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.5136240223834705, + "learning_rate": 1.0881030184002827e-05, + "loss": 0.3276, + "step": 696 + }, + { + "epoch": 1.050687770868664, + "grad_norm": 0.5367848385477425, + "learning_rate": 1.0854796009862434e-05, + "loss": 0.3163, + "step": 697 + }, + { + "epoch": 1.0521952138684756, + "grad_norm": 0.544930166455388, + "learning_rate": 1.0828555908091958e-05, + "loss": 0.359, + "step": 698 + }, + { + "epoch": 1.0537026568682872, + "grad_norm": 0.5387564352002492, + "learning_rate": 1.0802310060654832e-05, + "loss": 0.339, + "step": 699 + }, + { + "epoch": 1.0552100998680987, + "grad_norm": 0.5496802508408758, + "learning_rate": 1.0776058649554336e-05, + "loss": 0.3535, + "step": 700 + }, + { + "epoch": 1.0567175428679103, + "grad_norm": 0.5348548485090446, + "learning_rate": 1.0749801856832325e-05, + "loss": 0.3368, + "step": 701 + }, + { + "epoch": 1.058224985867722, + "grad_norm": 0.5794289951348468, + "learning_rate": 1.0723539864567983e-05, + "loss": 0.3596, + "step": 702 + }, + { + "epoch": 1.0597324288675334, + "grad_norm": 0.5365708234277743, + "learning_rate": 1.0697272854876537e-05, + "loss": 0.3421, + "step": 703 + }, + { + "epoch": 1.061239871867345, + "grad_norm": 0.5904399117303262, + "learning_rate": 1.0671001009908015e-05, + "loss": 0.3348, + "step": 704 + }, + { + "epoch": 1.0627473148671567, + "grad_norm": 0.5204976732643493, + "learning_rate": 1.0644724511845976e-05, + "loss": 0.3525, + "step": 705 + }, + { + "epoch": 1.064254757866968, + "grad_norm": 0.6120309774969117, + "learning_rate": 1.0618443542906251e-05, + "loss": 0.3727, + "step": 706 + }, + { + "epoch": 1.0657622008667798, + "grad_norm": 0.6091575812702822, + "learning_rate": 1.059215828533566e-05, + "loss": 0.3588, + "step": 707 + }, + { + "epoch": 1.0672696438665912, + "grad_norm": 0.591151755333861, + "learning_rate": 1.0565868921410776e-05, + "loss": 0.3363, + "step": 708 + }, + { + "epoch": 1.0687770868664028, + "grad_norm": 0.5984602535754296, + "learning_rate": 1.0539575633436645e-05, + "loss": 0.3616, + "step": 709 + }, + { + "epoch": 1.0702845298662145, + "grad_norm": 0.5604228857922577, + "learning_rate": 1.0513278603745523e-05, 
+ "loss": 0.3398, + "step": 710 + }, + { + "epoch": 1.071791972866026, + "grad_norm": 0.5557030870304388, + "learning_rate": 1.0486978014695606e-05, + "loss": 0.338, + "step": 711 + }, + { + "epoch": 1.0732994158658375, + "grad_norm": 0.5730991612503363, + "learning_rate": 1.0460674048669783e-05, + "loss": 0.3219, + "step": 712 + }, + { + "epoch": 1.0748068588656492, + "grad_norm": 0.6009828081011681, + "learning_rate": 1.0434366888074363e-05, + "loss": 0.3237, + "step": 713 + }, + { + "epoch": 1.0763143018654606, + "grad_norm": 0.5386294130513889, + "learning_rate": 1.0408056715337797e-05, + "loss": 0.3391, + "step": 714 + }, + { + "epoch": 1.0778217448652723, + "grad_norm": 0.5345878263288965, + "learning_rate": 1.0381743712909424e-05, + "loss": 0.3384, + "step": 715 + }, + { + "epoch": 1.079329187865084, + "grad_norm": 0.6369538253688138, + "learning_rate": 1.0355428063258224e-05, + "loss": 0.35, + "step": 716 + }, + { + "epoch": 1.0808366308648953, + "grad_norm": 0.5615591275271141, + "learning_rate": 1.0329109948871512e-05, + "loss": 0.3467, + "step": 717 + }, + { + "epoch": 1.082344073864707, + "grad_norm": 0.6406352309238248, + "learning_rate": 1.0302789552253702e-05, + "loss": 0.3523, + "step": 718 + }, + { + "epoch": 1.0838515168645186, + "grad_norm": 0.5212977047595297, + "learning_rate": 1.0276467055925044e-05, + "loss": 0.3185, + "step": 719 + }, + { + "epoch": 1.08535895986433, + "grad_norm": 0.5443802073020193, + "learning_rate": 1.0250142642420335e-05, + "loss": 0.3396, + "step": 720 + }, + { + "epoch": 1.0868664028641417, + "grad_norm": 0.5516128695838226, + "learning_rate": 1.0223816494287675e-05, + "loss": 0.3199, + "step": 721 + }, + { + "epoch": 1.0883738458639534, + "grad_norm": 0.5459335385131995, + "learning_rate": 1.0197488794087188e-05, + "loss": 0.2979, + "step": 722 + }, + { + "epoch": 1.0898812888637648, + "grad_norm": 0.5660471338581954, + "learning_rate": 1.0171159724389766e-05, + "loss": 0.3578, + "step": 723 + }, + { + "epoch": 1.0913887318635764, + "grad_norm": 0.577383627814168, + "learning_rate": 1.0144829467775794e-05, + "loss": 0.3253, + "step": 724 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.5656943231881854, + "learning_rate": 1.0118498206833886e-05, + "loss": 0.3559, + "step": 725 + }, + { + "epoch": 1.0944036178631995, + "grad_norm": 0.5427797556871369, + "learning_rate": 1.0092166124159628e-05, + "loss": 0.3299, + "step": 726 + }, + { + "epoch": 1.0959110608630112, + "grad_norm": 0.5582731085039236, + "learning_rate": 1.0065833402354302e-05, + "loss": 0.342, + "step": 727 + }, + { + "epoch": 1.0974185038628228, + "grad_norm": 0.5809252708008414, + "learning_rate": 1.003950022402361e-05, + "loss": 0.3553, + "step": 728 + }, + { + "epoch": 1.0989259468626342, + "grad_norm": 0.5400373499865376, + "learning_rate": 1.0013166771776441e-05, + "loss": 0.3283, + "step": 729 + }, + { + "epoch": 1.1004333898624459, + "grad_norm": 0.5280335723569519, + "learning_rate": 9.986833228223562e-06, + "loss": 0.3567, + "step": 730 + }, + { + "epoch": 1.1019408328622573, + "grad_norm": 0.5756207231701386, + "learning_rate": 9.96049977597639e-06, + "loss": 0.3422, + "step": 731 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.5917844968799806, + "learning_rate": 9.934166597645703e-06, + "loss": 0.3803, + "step": 732 + }, + { + "epoch": 1.1049557188618806, + "grad_norm": 0.5375048549376539, + "learning_rate": 9.907833875840374e-06, + "loss": 0.3421, + "step": 733 + }, + { + "epoch": 1.106463161861692, + "grad_norm": 0.5685857382900585, + 
"learning_rate": 9.881501793166117e-06, + "loss": 0.3658, + "step": 734 + }, + { + "epoch": 1.1079706048615037, + "grad_norm": 0.6069823667279429, + "learning_rate": 9.85517053222421e-06, + "loss": 0.3273, + "step": 735 + }, + { + "epoch": 1.1094780478613153, + "grad_norm": 0.5196609920900314, + "learning_rate": 9.82884027561024e-06, + "loss": 0.3233, + "step": 736 + }, + { + "epoch": 1.1109854908611267, + "grad_norm": 0.5702922246540342, + "learning_rate": 9.802511205912815e-06, + "loss": 0.35, + "step": 737 + }, + { + "epoch": 1.1124929338609384, + "grad_norm": 0.563216447988931, + "learning_rate": 9.776183505712327e-06, + "loss": 0.3578, + "step": 738 + }, + { + "epoch": 1.11400037686075, + "grad_norm": 0.5835000476343075, + "learning_rate": 9.749857357579667e-06, + "loss": 0.3753, + "step": 739 + }, + { + "epoch": 1.1155078198605615, + "grad_norm": 0.5822012862085456, + "learning_rate": 9.723532944074961e-06, + "loss": 0.3035, + "step": 740 + }, + { + "epoch": 1.1170152628603731, + "grad_norm": 0.5625362231656639, + "learning_rate": 9.6972104477463e-06, + "loss": 0.3669, + "step": 741 + }, + { + "epoch": 1.1185227058601848, + "grad_norm": 0.5816421569187623, + "learning_rate": 9.670890051128493e-06, + "loss": 0.3264, + "step": 742 + }, + { + "epoch": 1.1200301488599962, + "grad_norm": 0.6076866614497781, + "learning_rate": 9.644571936741778e-06, + "loss": 0.3448, + "step": 743 + }, + { + "epoch": 1.1215375918598078, + "grad_norm": 0.5868211335333723, + "learning_rate": 9.618256287090576e-06, + "loss": 0.3453, + "step": 744 + }, + { + "epoch": 1.1230450348596195, + "grad_norm": 0.5784910781884745, + "learning_rate": 9.591943284662206e-06, + "loss": 0.3543, + "step": 745 + }, + { + "epoch": 1.124552477859431, + "grad_norm": 0.5577968039251089, + "learning_rate": 9.56563311192564e-06, + "loss": 0.356, + "step": 746 + }, + { + "epoch": 1.1260599208592426, + "grad_norm": 0.5624603535612774, + "learning_rate": 9.53932595133022e-06, + "loss": 0.322, + "step": 747 + }, + { + "epoch": 1.127567363859054, + "grad_norm": 0.5863194939952109, + "learning_rate": 9.513021985304399e-06, + "loss": 0.341, + "step": 748 + }, + { + "epoch": 1.1290748068588656, + "grad_norm": 0.5297072497497793, + "learning_rate": 9.486721396254484e-06, + "loss": 0.3263, + "step": 749 + }, + { + "epoch": 1.1305822498586773, + "grad_norm": 0.5597259831895821, + "learning_rate": 9.460424366563355e-06, + "loss": 0.3243, + "step": 750 + }, + { + "epoch": 1.1320896928584887, + "grad_norm": 0.5464179018975297, + "learning_rate": 9.434131078589224e-06, + "loss": 0.3206, + "step": 751 + }, + { + "epoch": 1.1335971358583004, + "grad_norm": 0.5464450895465798, + "learning_rate": 9.407841714664343e-06, + "loss": 0.3387, + "step": 752 + }, + { + "epoch": 1.135104578858112, + "grad_norm": 0.5546542012199714, + "learning_rate": 9.381556457093752e-06, + "loss": 0.337, + "step": 753 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 0.5753540187155672, + "learning_rate": 9.355275488154025e-06, + "loss": 0.3644, + "step": 754 + }, + { + "epoch": 1.138119464857735, + "grad_norm": 0.571883771055452, + "learning_rate": 9.32899899009199e-06, + "loss": 0.332, + "step": 755 + }, + { + "epoch": 1.1396269078575467, + "grad_norm": 0.5383157533846678, + "learning_rate": 9.30272714512347e-06, + "loss": 0.3397, + "step": 756 + }, + { + "epoch": 1.1411343508573581, + "grad_norm": 0.564086061412075, + "learning_rate": 9.276460135432019e-06, + "loss": 0.3592, + "step": 757 + }, + { + "epoch": 1.1426417938571698, + "grad_norm": 0.510671608636206, + 
"learning_rate": 9.250198143167675e-06, + "loss": 0.3301, + "step": 758 + }, + { + "epoch": 1.1441492368569812, + "grad_norm": 0.5323793942216957, + "learning_rate": 9.223941350445666e-06, + "loss": 0.3341, + "step": 759 + }, + { + "epoch": 1.1456566798567929, + "grad_norm": 0.5359098725485097, + "learning_rate": 9.19768993934517e-06, + "loss": 0.3214, + "step": 760 + }, + { + "epoch": 1.1471641228566045, + "grad_norm": 0.5451361788825891, + "learning_rate": 9.171444091908046e-06, + "loss": 0.3195, + "step": 761 + }, + { + "epoch": 1.1486715658564162, + "grad_norm": 0.5246946467273069, + "learning_rate": 9.145203990137571e-06, + "loss": 0.3417, + "step": 762 + }, + { + "epoch": 1.1501790088562276, + "grad_norm": 0.5919155354849388, + "learning_rate": 9.118969815997174e-06, + "loss": 0.3417, + "step": 763 + }, + { + "epoch": 1.1516864518560392, + "grad_norm": 0.549339616533448, + "learning_rate": 9.092741751409186e-06, + "loss": 0.3323, + "step": 764 + }, + { + "epoch": 1.1531938948558507, + "grad_norm": 0.5520952523067345, + "learning_rate": 9.06651997825357e-06, + "loss": 0.325, + "step": 765 + }, + { + "epoch": 1.1547013378556623, + "grad_norm": 0.5546050111571403, + "learning_rate": 9.040304678366658e-06, + "loss": 0.3798, + "step": 766 + }, + { + "epoch": 1.156208780855474, + "grad_norm": 0.5355016099382737, + "learning_rate": 9.014096033539889e-06, + "loss": 0.3324, + "step": 767 + }, + { + "epoch": 1.1577162238552854, + "grad_norm": 0.5063535090434689, + "learning_rate": 8.987894225518556e-06, + "loss": 0.3098, + "step": 768 + }, + { + "epoch": 1.159223666855097, + "grad_norm": 0.538083701203612, + "learning_rate": 8.961699436000548e-06, + "loss": 0.3378, + "step": 769 + }, + { + "epoch": 1.1607311098549087, + "grad_norm": 0.5611833134978637, + "learning_rate": 8.93551184663507e-06, + "loss": 0.3291, + "step": 770 + }, + { + "epoch": 1.16223855285472, + "grad_norm": 0.5334833426063799, + "learning_rate": 8.909331639021414e-06, + "loss": 0.3265, + "step": 771 + }, + { + "epoch": 1.1637459958545318, + "grad_norm": 0.5370028500892263, + "learning_rate": 8.883158994707666e-06, + "loss": 0.353, + "step": 772 + }, + { + "epoch": 1.1652534388543434, + "grad_norm": 0.5564851227581507, + "learning_rate": 8.856994095189477e-06, + "loss": 0.314, + "step": 773 + }, + { + "epoch": 1.1667608818541548, + "grad_norm": 0.5516816420442727, + "learning_rate": 8.830837121908783e-06, + "loss": 0.3459, + "step": 774 + }, + { + "epoch": 1.1682683248539665, + "grad_norm": 0.546010896691211, + "learning_rate": 8.804688256252557e-06, + "loss": 0.3564, + "step": 775 + }, + { + "epoch": 1.1697757678537781, + "grad_norm": 0.5204242216440147, + "learning_rate": 8.778547679551555e-06, + "loss": 0.3093, + "step": 776 + }, + { + "epoch": 1.1712832108535896, + "grad_norm": 0.5530932960461594, + "learning_rate": 8.75241557307904e-06, + "loss": 0.3169, + "step": 777 + }, + { + "epoch": 1.1727906538534012, + "grad_norm": 0.5433657189299205, + "learning_rate": 8.726292118049555e-06, + "loss": 0.3238, + "step": 778 + }, + { + "epoch": 1.1742980968532128, + "grad_norm": 0.536612091168906, + "learning_rate": 8.700177495617635e-06, + "loss": 0.3375, + "step": 779 + }, + { + "epoch": 1.1758055398530243, + "grad_norm": 0.5547355998217709, + "learning_rate": 8.674071886876572e-06, + "loss": 0.3285, + "step": 780 + }, + { + "epoch": 1.177312982852836, + "grad_norm": 0.6048276095962777, + "learning_rate": 8.647975472857148e-06, + "loss": 0.3704, + "step": 781 + }, + { + "epoch": 1.1788204258526473, + "grad_norm": 
0.571295755561053, + "learning_rate": 8.621888434526382e-06, + "loss": 0.374, + "step": 782 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.5189210618741348, + "learning_rate": 8.595810952786289e-06, + "loss": 0.3247, + "step": 783 + }, + { + "epoch": 1.1818353118522706, + "grad_norm": 0.5407807768349286, + "learning_rate": 8.569743208472594e-06, + "loss": 0.318, + "step": 784 + }, + { + "epoch": 1.183342754852082, + "grad_norm": 0.5555658150734397, + "learning_rate": 8.543685382353518e-06, + "loss": 0.342, + "step": 785 + }, + { + "epoch": 1.1848501978518937, + "grad_norm": 0.5478498420172522, + "learning_rate": 8.51763765512849e-06, + "loss": 0.3386, + "step": 786 + }, + { + "epoch": 1.1863576408517054, + "grad_norm": 0.5229096665922429, + "learning_rate": 8.491600207426907e-06, + "loss": 0.3218, + "step": 787 + }, + { + "epoch": 1.1878650838515168, + "grad_norm": 0.5706786350698708, + "learning_rate": 8.465573219806893e-06, + "loss": 0.3491, + "step": 788 + }, + { + "epoch": 1.1893725268513284, + "grad_norm": 0.5792169640912351, + "learning_rate": 8.439556872754025e-06, + "loss": 0.3482, + "step": 789 + }, + { + "epoch": 1.19087996985114, + "grad_norm": 0.5219487046954187, + "learning_rate": 8.413551346680095e-06, + "loss": 0.3183, + "step": 790 + }, + { + "epoch": 1.1923874128509515, + "grad_norm": 0.5680359320055756, + "learning_rate": 8.38755682192186e-06, + "loss": 0.3257, + "step": 791 + }, + { + "epoch": 1.1938948558507632, + "grad_norm": 0.54324153892485, + "learning_rate": 8.36157347873979e-06, + "loss": 0.3151, + "step": 792 + }, + { + "epoch": 1.1954022988505748, + "grad_norm": 0.5584145695371312, + "learning_rate": 8.335601497316809e-06, + "loss": 0.3474, + "step": 793 + }, + { + "epoch": 1.1969097418503862, + "grad_norm": 0.5414937178807059, + "learning_rate": 8.309641057757052e-06, + "loss": 0.3348, + "step": 794 + }, + { + "epoch": 1.1984171848501979, + "grad_norm": 0.5933495505366142, + "learning_rate": 8.283692340084623e-06, + "loss": 0.3743, + "step": 795 + }, + { + "epoch": 1.1999246278500095, + "grad_norm": 0.5730820400742883, + "learning_rate": 8.257755524242333e-06, + "loss": 0.3437, + "step": 796 + }, + { + "epoch": 1.201432070849821, + "grad_norm": 0.5154842086228131, + "learning_rate": 8.231830790090461e-06, + "loss": 0.3271, + "step": 797 + }, + { + "epoch": 1.2029395138496326, + "grad_norm": 0.5445619151521616, + "learning_rate": 8.205918317405508e-06, + "loss": 0.3229, + "step": 798 + }, + { + "epoch": 1.204446956849444, + "grad_norm": 0.6016710522110904, + "learning_rate": 8.18001828587895e-06, + "loss": 0.3609, + "step": 799 + }, + { + "epoch": 1.2059543998492557, + "grad_norm": 0.5457762036159068, + "learning_rate": 8.154130875115978e-06, + "loss": 0.318, + "step": 800 + }, + { + "epoch": 1.2074618428490673, + "grad_norm": 0.5404902176604001, + "learning_rate": 8.12825626463427e-06, + "loss": 0.3323, + "step": 801 + }, + { + "epoch": 1.2089692858488787, + "grad_norm": 0.5722847292063646, + "learning_rate": 8.102394633862743e-06, + "loss": 0.3147, + "step": 802 + }, + { + "epoch": 1.2104767288486904, + "grad_norm": 0.5531842770730636, + "learning_rate": 8.0765461621403e-06, + "loss": 0.331, + "step": 803 + }, + { + "epoch": 1.211984171848502, + "grad_norm": 0.5256780853712785, + "learning_rate": 8.050711028714589e-06, + "loss": 0.3176, + "step": 804 + }, + { + "epoch": 1.2134916148483135, + "grad_norm": 0.6144603881477418, + "learning_rate": 8.02488941274078e-06, + "loss": 0.3383, + "step": 805 + }, + { + "epoch": 1.2149990578481251, + "grad_norm": 
0.571788365434139, + "learning_rate": 7.999081493280283e-06, + "loss": 0.3258, + "step": 806 + }, + { + "epoch": 1.2165065008479368, + "grad_norm": 0.5982762464323738, + "learning_rate": 7.973287449299545e-06, + "loss": 0.3503, + "step": 807 + }, + { + "epoch": 1.2180139438477482, + "grad_norm": 0.5363356894959806, + "learning_rate": 7.947507459668784e-06, + "loss": 0.3436, + "step": 808 + }, + { + "epoch": 1.2195213868475598, + "grad_norm": 0.5730894211276505, + "learning_rate": 7.921741703160758e-06, + "loss": 0.3584, + "step": 809 + }, + { + "epoch": 1.2210288298473715, + "grad_norm": 0.563926690224309, + "learning_rate": 7.895990358449533e-06, + "loss": 0.3291, + "step": 810 + }, + { + "epoch": 1.222536272847183, + "grad_norm": 0.5254920217508706, + "learning_rate": 7.87025360410922e-06, + "loss": 0.316, + "step": 811 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.5313261676986573, + "learning_rate": 7.844531618612772e-06, + "loss": 0.3319, + "step": 812 + }, + { + "epoch": 1.2255511588468062, + "grad_norm": 0.5790168954324271, + "learning_rate": 7.81882458033071e-06, + "loss": 0.3202, + "step": 813 + }, + { + "epoch": 1.2270586018466176, + "grad_norm": 0.5385942852927429, + "learning_rate": 7.79313266752991e-06, + "loss": 0.3259, + "step": 814 + }, + { + "epoch": 1.2285660448464293, + "grad_norm": 0.5551972130449111, + "learning_rate": 7.767456058372362e-06, + "loss": 0.3385, + "step": 815 + }, + { + "epoch": 1.2300734878462407, + "grad_norm": 0.5322043372006761, + "learning_rate": 7.741794930913922e-06, + "loss": 0.3215, + "step": 816 + }, + { + "epoch": 1.2315809308460524, + "grad_norm": 0.5541120887430956, + "learning_rate": 7.7161494631031e-06, + "loss": 0.3428, + "step": 817 + }, + { + "epoch": 1.233088373845864, + "grad_norm": 0.5527885462222231, + "learning_rate": 7.690519832779799e-06, + "loss": 0.3389, + "step": 818 + }, + { + "epoch": 1.2345958168456754, + "grad_norm": 0.5406331170872595, + "learning_rate": 7.664906217674115e-06, + "loss": 0.3112, + "step": 819 + }, + { + "epoch": 1.236103259845487, + "grad_norm": 0.5055150883042695, + "learning_rate": 7.639308795405066e-06, + "loss": 0.3202, + "step": 820 + }, + { + "epoch": 1.2376107028452987, + "grad_norm": 0.5563269801825349, + "learning_rate": 7.613727743479395e-06, + "loss": 0.3571, + "step": 821 + }, + { + "epoch": 1.2391181458451102, + "grad_norm": 0.5792057709615847, + "learning_rate": 7.588163239290316e-06, + "loss": 0.3329, + "step": 822 + }, + { + "epoch": 1.2406255888449218, + "grad_norm": 0.5666249401867434, + "learning_rate": 7.562615460116289e-06, + "loss": 0.351, + "step": 823 + }, + { + "epoch": 1.2421330318447334, + "grad_norm": 0.5265355387938444, + "learning_rate": 7.537084583119802e-06, + "loss": 0.3701, + "step": 824 + }, + { + "epoch": 1.2436404748445449, + "grad_norm": 0.5495841716595921, + "learning_rate": 7.511570785346129e-06, + "loss": 0.329, + "step": 825 + }, + { + "epoch": 1.2451479178443565, + "grad_norm": 0.5587199026990006, + "learning_rate": 7.486074243722109e-06, + "loss": 0.3252, + "step": 826 + }, + { + "epoch": 1.2466553608441682, + "grad_norm": 0.5211341468152613, + "learning_rate": 7.460595135054916e-06, + "loss": 0.3311, + "step": 827 + }, + { + "epoch": 1.2481628038439796, + "grad_norm": 0.5364245497529563, + "learning_rate": 7.435133636030831e-06, + "loss": 0.3208, + "step": 828 + }, + { + "epoch": 1.2496702468437912, + "grad_norm": 0.5314247345107659, + "learning_rate": 7.4096899232140295e-06, + "loss": 0.3317, + "step": 829 + }, + { + "epoch": 1.251177689843603, + 
"grad_norm": 0.5630710895853528, + "learning_rate": 7.384264173045339e-06, + "loss": 0.3351, + "step": 830 + }, + { + "epoch": 1.2526851328434143, + "grad_norm": 0.5197283769421239, + "learning_rate": 7.358856561841021e-06, + "loss": 0.3065, + "step": 831 + }, + { + "epoch": 1.254192575843226, + "grad_norm": 0.5568788382198039, + "learning_rate": 7.333467265791563e-06, + "loss": 0.351, + "step": 832 + }, + { + "epoch": 1.2557000188430374, + "grad_norm": 0.5725220505007355, + "learning_rate": 7.308096460960441e-06, + "loss": 0.3439, + "step": 833 + }, + { + "epoch": 1.257207461842849, + "grad_norm": 0.5304098730159461, + "learning_rate": 7.282744323282895e-06, + "loss": 0.3188, + "step": 834 + }, + { + "epoch": 1.2587149048426607, + "grad_norm": 0.5263594760039901, + "learning_rate": 7.2574110285647244e-06, + "loss": 0.3209, + "step": 835 + }, + { + "epoch": 1.2602223478424723, + "grad_norm": 0.6039158733618325, + "learning_rate": 7.232096752481061e-06, + "loss": 0.3366, + "step": 836 + }, + { + "epoch": 1.2617297908422838, + "grad_norm": 0.5807414247418556, + "learning_rate": 7.206801670575145e-06, + "loss": 0.3446, + "step": 837 + }, + { + "epoch": 1.2632372338420954, + "grad_norm": 0.5398549865816707, + "learning_rate": 7.181525958257116e-06, + "loss": 0.2976, + "step": 838 + }, + { + "epoch": 1.2647446768419068, + "grad_norm": 0.5502842121004295, + "learning_rate": 7.156269790802801e-06, + "loss": 0.3308, + "step": 839 + }, + { + "epoch": 1.2662521198417185, + "grad_norm": 0.5520318040890088, + "learning_rate": 7.131033343352483e-06, + "loss": 0.3347, + "step": 840 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 0.5430821284421434, + "learning_rate": 7.105816790909699e-06, + "loss": 0.3199, + "step": 841 + }, + { + "epoch": 1.2692670058413416, + "grad_norm": 0.5268656785617308, + "learning_rate": 7.080620308340024e-06, + "loss": 0.3368, + "step": 842 + }, + { + "epoch": 1.2707744488411532, + "grad_norm": 0.5488558866283424, + "learning_rate": 7.055444070369852e-06, + "loss": 0.3185, + "step": 843 + }, + { + "epoch": 1.2722818918409646, + "grad_norm": 0.5234636191148432, + "learning_rate": 7.0302882515852025e-06, + "loss": 0.3156, + "step": 844 + }, + { + "epoch": 1.2737893348407763, + "grad_norm": 0.5571922620156962, + "learning_rate": 7.005153026430476e-06, + "loss": 0.3475, + "step": 845 + }, + { + "epoch": 1.275296777840588, + "grad_norm": 0.6043172841328527, + "learning_rate": 6.980038569207291e-06, + "loss": 0.3535, + "step": 846 + }, + { + "epoch": 1.2768042208403996, + "grad_norm": 0.5449462283830545, + "learning_rate": 6.954945054073228e-06, + "loss": 0.3485, + "step": 847 + }, + { + "epoch": 1.278311663840211, + "grad_norm": 0.553125976275942, + "learning_rate": 6.929872655040655e-06, + "loss": 0.3392, + "step": 848 + }, + { + "epoch": 1.2798191068400226, + "grad_norm": 0.5313032640250875, + "learning_rate": 6.904821545975507e-06, + "loss": 0.3533, + "step": 849 + }, + { + "epoch": 1.281326549839834, + "grad_norm": 0.5461530058972931, + "learning_rate": 6.879791900596077e-06, + "loss": 0.3082, + "step": 850 + }, + { + "epoch": 1.2828339928396457, + "grad_norm": 0.5268975792503748, + "learning_rate": 6.854783892471823e-06, + "loss": 0.3507, + "step": 851 + }, + { + "epoch": 1.2843414358394574, + "grad_norm": 0.525335909935522, + "learning_rate": 6.829797695022163e-06, + "loss": 0.3137, + "step": 852 + }, + { + "epoch": 1.285848878839269, + "grad_norm": 0.5439698304073414, + "learning_rate": 6.804833481515256e-06, + "loss": 0.3269, + "step": 853 + }, + { + "epoch": 
1.2873563218390804, + "grad_norm": 0.5426503592650488, + "learning_rate": 6.7798914250668154e-06, + "loss": 0.3255, + "step": 854 + }, + { + "epoch": 1.288863764838892, + "grad_norm": 0.546578985401071, + "learning_rate": 6.7549716986389146e-06, + "loss": 0.3357, + "step": 855 + }, + { + "epoch": 1.2903712078387035, + "grad_norm": 0.5433998763126892, + "learning_rate": 6.730074475038766e-06, + "loss": 0.3316, + "step": 856 + }, + { + "epoch": 1.2918786508385152, + "grad_norm": 0.5364588967630985, + "learning_rate": 6.7051999269175405e-06, + "loss": 0.3305, + "step": 857 + }, + { + "epoch": 1.2933860938383268, + "grad_norm": 0.5658934841388523, + "learning_rate": 6.680348226769162e-06, + "loss": 0.329, + "step": 858 + }, + { + "epoch": 1.2948935368381382, + "grad_norm": 0.5643062239325746, + "learning_rate": 6.655519546929121e-06, + "loss": 0.3297, + "step": 859 + }, + { + "epoch": 1.2964009798379499, + "grad_norm": 0.5371342456598566, + "learning_rate": 6.630714059573267e-06, + "loss": 0.3411, + "step": 860 + }, + { + "epoch": 1.2979084228377613, + "grad_norm": 0.5429869820067992, + "learning_rate": 6.6059319367166165e-06, + "loss": 0.3162, + "step": 861 + }, + { + "epoch": 1.299415865837573, + "grad_norm": 0.6163498341710386, + "learning_rate": 6.581173350212169e-06, + "loss": 0.3346, + "step": 862 + }, + { + "epoch": 1.3009233088373846, + "grad_norm": 0.5249574401357171, + "learning_rate": 6.55643847174971e-06, + "loss": 0.3184, + "step": 863 + }, + { + "epoch": 1.3024307518371963, + "grad_norm": 0.5652427669527782, + "learning_rate": 6.531727472854617e-06, + "loss": 0.3277, + "step": 864 + }, + { + "epoch": 1.3039381948370077, + "grad_norm": 0.5499255875094143, + "learning_rate": 6.507040524886672e-06, + "loss": 0.3099, + "step": 865 + }, + { + "epoch": 1.3054456378368193, + "grad_norm": 0.5395982289283698, + "learning_rate": 6.482377799038882e-06, + "loss": 0.312, + "step": 866 + }, + { + "epoch": 1.3069530808366308, + "grad_norm": 0.5425266392409812, + "learning_rate": 6.45773946633628e-06, + "loss": 0.3288, + "step": 867 + }, + { + "epoch": 1.3084605238364424, + "grad_norm": 0.5289252666187554, + "learning_rate": 6.4331256976347434e-06, + "loss": 0.3143, + "step": 868 + }, + { + "epoch": 1.309967966836254, + "grad_norm": 0.5829209174715098, + "learning_rate": 6.408536663619803e-06, + "loss": 0.3215, + "step": 869 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.5955771972087047, + "learning_rate": 6.383972534805478e-06, + "loss": 0.3232, + "step": 870 + }, + { + "epoch": 1.3129828528358771, + "grad_norm": 0.5434757595303122, + "learning_rate": 6.359433481533074e-06, + "loss": 0.318, + "step": 871 + }, + { + "epoch": 1.3144902958356888, + "grad_norm": 0.5313303734643687, + "learning_rate": 6.3349196739700024e-06, + "loss": 0.3159, + "step": 872 + }, + { + "epoch": 1.3159977388355002, + "grad_norm": 0.5678985167703594, + "learning_rate": 6.310431282108622e-06, + "loss": 0.3757, + "step": 873 + }, + { + "epoch": 1.3175051818353118, + "grad_norm": 0.5648846998829979, + "learning_rate": 6.2859684757650365e-06, + "loss": 0.3493, + "step": 874 + }, + { + "epoch": 1.3190126248351235, + "grad_norm": 0.5488508237006199, + "learning_rate": 6.261531424577923e-06, + "loss": 0.3219, + "step": 875 + }, + { + "epoch": 1.320520067834935, + "grad_norm": 0.5438041497270804, + "learning_rate": 6.2371202980073596e-06, + "loss": 0.3417, + "step": 876 + }, + { + "epoch": 1.3220275108347466, + "grad_norm": 0.5400748419899576, + "learning_rate": 6.212735265333655e-06, + "loss": 0.3025, + "step": 877 
+ }, + { + "epoch": 1.323534953834558, + "grad_norm": 0.5299843425249701, + "learning_rate": 6.188376495656156e-06, + "loss": 0.3374, + "step": 878 + }, + { + "epoch": 1.3250423968343696, + "grad_norm": 0.5236709426443396, + "learning_rate": 6.164044157892102e-06, + "loss": 0.3221, + "step": 879 + }, + { + "epoch": 1.3265498398341813, + "grad_norm": 0.5699051867060005, + "learning_rate": 6.13973842077543e-06, + "loss": 0.3226, + "step": 880 + }, + { + "epoch": 1.328057282833993, + "grad_norm": 0.5655665319377791, + "learning_rate": 6.11545945285561e-06, + "loss": 0.3175, + "step": 881 + }, + { + "epoch": 1.3295647258338044, + "grad_norm": 0.5387118438674878, + "learning_rate": 6.091207422496489e-06, + "loss": 0.3243, + "step": 882 + }, + { + "epoch": 1.331072168833616, + "grad_norm": 0.5744706591584219, + "learning_rate": 6.066982497875109e-06, + "loss": 0.3286, + "step": 883 + }, + { + "epoch": 1.3325796118334274, + "grad_norm": 0.542466249812019, + "learning_rate": 6.042784846980542e-06, + "loss": 0.3225, + "step": 884 + }, + { + "epoch": 1.334087054833239, + "grad_norm": 0.5515488785701044, + "learning_rate": 6.018614637612733e-06, + "loss": 0.3238, + "step": 885 + }, + { + "epoch": 1.3355944978330507, + "grad_norm": 0.5349896204088196, + "learning_rate": 5.99447203738134e-06, + "loss": 0.324, + "step": 886 + }, + { + "epoch": 1.3371019408328624, + "grad_norm": 0.5371367792089301, + "learning_rate": 5.9703572137045495e-06, + "loss": 0.3369, + "step": 887 + }, + { + "epoch": 1.3386093838326738, + "grad_norm": 0.5615358147993731, + "learning_rate": 5.946270333807937e-06, + "loss": 0.3052, + "step": 888 + }, + { + "epoch": 1.3401168268324855, + "grad_norm": 0.5743727933679714, + "learning_rate": 5.922211564723302e-06, + "loss": 0.3455, + "step": 889 + }, + { + "epoch": 1.3416242698322969, + "grad_norm": 0.5336291605723125, + "learning_rate": 5.898181073287504e-06, + "loss": 0.3226, + "step": 890 + }, + { + "epoch": 1.3431317128321085, + "grad_norm": 0.5722163135210774, + "learning_rate": 5.87417902614131e-06, + "loss": 0.3646, + "step": 891 + }, + { + "epoch": 1.3446391558319202, + "grad_norm": 0.5524401803992677, + "learning_rate": 5.850205589728239e-06, + "loss": 0.3016, + "step": 892 + }, + { + "epoch": 1.3461465988317316, + "grad_norm": 0.5311906031167264, + "learning_rate": 5.826260930293417e-06, + "loss": 0.3174, + "step": 893 + }, + { + "epoch": 1.3476540418315432, + "grad_norm": 0.5385817256109608, + "learning_rate": 5.802345213882396e-06, + "loss": 0.3447, + "step": 894 + }, + { + "epoch": 1.349161484831355, + "grad_norm": 0.5443226270708521, + "learning_rate": 5.778458606340037e-06, + "loss": 0.3056, + "step": 895 + }, + { + "epoch": 1.3506689278311663, + "grad_norm": 0.5247659222065347, + "learning_rate": 5.754601273309333e-06, + "loss": 0.3045, + "step": 896 + }, + { + "epoch": 1.352176370830978, + "grad_norm": 0.5555702487315548, + "learning_rate": 5.730773380230276e-06, + "loss": 0.3186, + "step": 897 + }, + { + "epoch": 1.3536838138307896, + "grad_norm": 0.5661524854903914, + "learning_rate": 5.70697509233871e-06, + "loss": 0.3248, + "step": 898 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.5284857763614461, + "learning_rate": 5.683206574665165e-06, + "loss": 0.3018, + "step": 899 + }, + { + "epoch": 1.3566986998304127, + "grad_norm": 0.5740850180912764, + "learning_rate": 5.6594679920337514e-06, + "loss": 0.3529, + "step": 900 + }, + { + "epoch": 1.3582061428302241, + "grad_norm": 0.5299086867590524, + "learning_rate": 5.635759509060969e-06, + "loss": 0.32, + 
"step": 901 + }, + { + "epoch": 1.3597135858300358, + "grad_norm": 0.5330329949433913, + "learning_rate": 5.612081290154607e-06, + "loss": 0.3156, + "step": 902 + }, + { + "epoch": 1.3612210288298474, + "grad_norm": 0.5347611903592508, + "learning_rate": 5.58843349951258e-06, + "loss": 0.3183, + "step": 903 + }, + { + "epoch": 1.362728471829659, + "grad_norm": 0.5409021340662399, + "learning_rate": 5.564816301121792e-06, + "loss": 0.3411, + "step": 904 + }, + { + "epoch": 1.3642359148294705, + "grad_norm": 0.5560565848550149, + "learning_rate": 5.541229858757011e-06, + "loss": 0.3508, + "step": 905 + }, + { + "epoch": 1.3657433578292821, + "grad_norm": 0.5040665273430834, + "learning_rate": 5.517674335979721e-06, + "loss": 0.3038, + "step": 906 + }, + { + "epoch": 1.3672508008290936, + "grad_norm": 0.5520505173652595, + "learning_rate": 5.494149896136998e-06, + "loss": 0.3342, + "step": 907 + }, + { + "epoch": 1.3687582438289052, + "grad_norm": 0.5286100688050495, + "learning_rate": 5.470656702360367e-06, + "loss": 0.3051, + "step": 908 + }, + { + "epoch": 1.3702656868287169, + "grad_norm": 0.5540464877346475, + "learning_rate": 5.447194917564671e-06, + "loss": 0.3327, + "step": 909 + }, + { + "epoch": 1.3717731298285283, + "grad_norm": 0.5401690086723988, + "learning_rate": 5.423764704446954e-06, + "loss": 0.332, + "step": 910 + }, + { + "epoch": 1.37328057282834, + "grad_norm": 0.5440262612621518, + "learning_rate": 5.400366225485326e-06, + "loss": 0.3326, + "step": 911 + }, + { + "epoch": 1.3747880158281516, + "grad_norm": 0.5291318028597245, + "learning_rate": 5.376999642937817e-06, + "loss": 0.3262, + "step": 912 + }, + { + "epoch": 1.376295458827963, + "grad_norm": 0.5361093139503608, + "learning_rate": 5.353665118841296e-06, + "loss": 0.3258, + "step": 913 + }, + { + "epoch": 1.3778029018277747, + "grad_norm": 0.5442991814951846, + "learning_rate": 5.330362815010306e-06, + "loss": 0.3162, + "step": 914 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5384147655921361, + "learning_rate": 5.307092893035951e-06, + "loss": 0.3381, + "step": 915 + }, + { + "epoch": 1.3808177878273977, + "grad_norm": 0.539100490777508, + "learning_rate": 5.2838555142847925e-06, + "loss": 0.3423, + "step": 916 + }, + { + "epoch": 1.3823252308272094, + "grad_norm": 0.5427293981456651, + "learning_rate": 5.260650839897719e-06, + "loss": 0.3217, + "step": 917 + }, + { + "epoch": 1.3838326738270208, + "grad_norm": 0.5726046303707281, + "learning_rate": 5.237479030788817e-06, + "loss": 0.3507, + "step": 918 + }, + { + "epoch": 1.3853401168268324, + "grad_norm": 0.5527176893463295, + "learning_rate": 5.214340247644278e-06, + "loss": 0.331, + "step": 919 + }, + { + "epoch": 1.386847559826644, + "grad_norm": 0.5481340240469819, + "learning_rate": 5.191234650921273e-06, + "loss": 0.318, + "step": 920 + }, + { + "epoch": 1.3883550028264557, + "grad_norm": 0.5543962320620248, + "learning_rate": 5.168162400846835e-06, + "loss": 0.3155, + "step": 921 + }, + { + "epoch": 1.3898624458262672, + "grad_norm": 0.5367203136891187, + "learning_rate": 5.145123657416759e-06, + "loss": 0.3326, + "step": 922 + }, + { + "epoch": 1.3913698888260788, + "grad_norm": 0.5460167764137122, + "learning_rate": 5.122118580394473e-06, + "loss": 0.337, + "step": 923 + }, + { + "epoch": 1.3928773318258902, + "grad_norm": 0.5195547700814616, + "learning_rate": 5.099147329309959e-06, + "loss": 0.326, + "step": 924 + }, + { + "epoch": 1.394384774825702, + "grad_norm": 0.5456994955845843, + "learning_rate": 5.076210063458622e-06, + "loss": 
0.3322, + "step": 925 + }, + { + "epoch": 1.3958922178255135, + "grad_norm": 0.5843461937914468, + "learning_rate": 5.0533069419002e-06, + "loss": 0.339, + "step": 926 + }, + { + "epoch": 1.397399660825325, + "grad_norm": 0.5150644731537505, + "learning_rate": 5.030438123457655e-06, + "loss": 0.2913, + "step": 927 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.5258159746479392, + "learning_rate": 5.007603766716063e-06, + "loss": 0.3107, + "step": 928 + }, + { + "epoch": 1.4004145468249483, + "grad_norm": 0.5748165660930256, + "learning_rate": 4.984804030021533e-06, + "loss": 0.3328, + "step": 929 + }, + { + "epoch": 1.4019219898247597, + "grad_norm": 0.5630833950584739, + "learning_rate": 4.962039071480102e-06, + "loss": 0.3233, + "step": 930 + }, + { + "epoch": 1.4034294328245713, + "grad_norm": 0.537736675931464, + "learning_rate": 4.939309048956622e-06, + "loss": 0.3451, + "step": 931 + }, + { + "epoch": 1.404936875824383, + "grad_norm": 0.5230851918523695, + "learning_rate": 4.9166141200736885e-06, + "loss": 0.3389, + "step": 932 + }, + { + "epoch": 1.4064443188241944, + "grad_norm": 0.5434274109432955, + "learning_rate": 4.89395444221055e-06, + "loss": 0.3189, + "step": 933 + }, + { + "epoch": 1.407951761824006, + "grad_norm": 0.5467058284642171, + "learning_rate": 4.871330172501979e-06, + "loss": 0.3218, + "step": 934 + }, + { + "epoch": 1.4094592048238175, + "grad_norm": 0.5643806859737526, + "learning_rate": 4.848741467837228e-06, + "loss": 0.339, + "step": 935 + }, + { + "epoch": 1.4109666478236291, + "grad_norm": 0.5210588389675654, + "learning_rate": 4.826188484858918e-06, + "loss": 0.2865, + "step": 936 + }, + { + "epoch": 1.4124740908234408, + "grad_norm": 0.5575173474168307, + "learning_rate": 4.803671379961945e-06, + "loss": 0.3448, + "step": 937 + }, + { + "epoch": 1.4139815338232524, + "grad_norm": 0.5553202634668, + "learning_rate": 4.781190309292421e-06, + "loss": 0.318, + "step": 938 + }, + { + "epoch": 1.4154889768230638, + "grad_norm": 0.5277473116633332, + "learning_rate": 4.758745428746569e-06, + "loss": 0.3047, + "step": 939 + }, + { + "epoch": 1.4169964198228755, + "grad_norm": 0.5308118737562897, + "learning_rate": 4.736336893969652e-06, + "loss": 0.3126, + "step": 940 + }, + { + "epoch": 1.418503862822687, + "grad_norm": 0.562016412444855, + "learning_rate": 4.7139648603548925e-06, + "loss": 0.3306, + "step": 941 + }, + { + "epoch": 1.4200113058224986, + "grad_norm": 0.5112809802949265, + "learning_rate": 4.691629483042387e-06, + "loss": 0.2877, + "step": 942 + }, + { + "epoch": 1.4215187488223102, + "grad_norm": 0.555811318693021, + "learning_rate": 4.669330916918043e-06, + "loss": 0.3346, + "step": 943 + }, + { + "epoch": 1.4230261918221216, + "grad_norm": 0.5388925234150407, + "learning_rate": 4.647069316612502e-06, + "loss": 0.3137, + "step": 944 + }, + { + "epoch": 1.4245336348219333, + "grad_norm": 0.5265475799399302, + "learning_rate": 4.624844836500052e-06, + "loss": 0.3162, + "step": 945 + }, + { + "epoch": 1.426041077821745, + "grad_norm": 0.5112485522585755, + "learning_rate": 4.60265763069758e-06, + "loss": 0.2914, + "step": 946 + }, + { + "epoch": 1.4275485208215564, + "grad_norm": 0.5285723749640436, + "learning_rate": 4.580507853063487e-06, + "loss": 0.3098, + "step": 947 + }, + { + "epoch": 1.429055963821368, + "grad_norm": 0.5408160656578395, + "learning_rate": 4.5583956571966295e-06, + "loss": 0.3365, + "step": 948 + }, + { + "epoch": 1.4305634068211797, + "grad_norm": 0.5598936258222863, + "learning_rate": 4.5363211964352524e-06, + 
"loss": 0.3292, + "step": 949 + }, + { + "epoch": 1.432070849820991, + "grad_norm": 0.5180426805197446, + "learning_rate": 4.514284623855915e-06, + "loss": 0.3174, + "step": 950 + }, + { + "epoch": 1.4335782928208027, + "grad_norm": 0.5639401953538313, + "learning_rate": 4.4922860922724466e-06, + "loss": 0.3617, + "step": 951 + }, + { + "epoch": 1.4350857358206142, + "grad_norm": 0.5482846937319309, + "learning_rate": 4.470325754234881e-06, + "loss": 0.3256, + "step": 952 + }, + { + "epoch": 1.4365931788204258, + "grad_norm": 0.530946653125974, + "learning_rate": 4.448403762028391e-06, + "loss": 0.3367, + "step": 953 + }, + { + "epoch": 1.4381006218202375, + "grad_norm": 0.5630491613208096, + "learning_rate": 4.426520267672244e-06, + "loss": 0.33, + "step": 954 + }, + { + "epoch": 1.439608064820049, + "grad_norm": 0.5281029541497921, + "learning_rate": 4.40467542291874e-06, + "loss": 0.3266, + "step": 955 + }, + { + "epoch": 1.4411155078198605, + "grad_norm": 0.5134408808419982, + "learning_rate": 4.382869379252152e-06, + "loss": 0.3002, + "step": 956 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.5379209815628555, + "learning_rate": 4.361102287887698e-06, + "loss": 0.3197, + "step": 957 + }, + { + "epoch": 1.4441303938194836, + "grad_norm": 0.5118973398445678, + "learning_rate": 4.339374299770477e-06, + "loss": 0.316, + "step": 958 + }, + { + "epoch": 1.4456378368192953, + "grad_norm": 0.5340927744773125, + "learning_rate": 4.31768556557441e-06, + "loss": 0.2995, + "step": 959 + }, + { + "epoch": 1.447145279819107, + "grad_norm": 0.5574015013189253, + "learning_rate": 4.296036235701235e-06, + "loss": 0.3214, + "step": 960 + }, + { + "epoch": 1.4486527228189185, + "grad_norm": 0.544283711827625, + "learning_rate": 4.274426460279412e-06, + "loss": 0.309, + "step": 961 + }, + { + "epoch": 1.45016016581873, + "grad_norm": 0.5456093505990249, + "learning_rate": 4.252856389163128e-06, + "loss": 0.3056, + "step": 962 + }, + { + "epoch": 1.4516676088185416, + "grad_norm": 0.5524390480774677, + "learning_rate": 4.231326171931231e-06, + "loss": 0.2988, + "step": 963 + }, + { + "epoch": 1.453175051818353, + "grad_norm": 0.5357243619653109, + "learning_rate": 4.209835957886196e-06, + "loss": 0.3051, + "step": 964 + }, + { + "epoch": 1.4546824948181647, + "grad_norm": 0.5567188365205857, + "learning_rate": 4.188385896053098e-06, + "loss": 0.3211, + "step": 965 + }, + { + "epoch": 1.4561899378179763, + "grad_norm": 0.5571208137663407, + "learning_rate": 4.166976135178575e-06, + "loss": 0.3212, + "step": 966 + }, + { + "epoch": 1.4576973808177878, + "grad_norm": 0.5288681574805124, + "learning_rate": 4.1456068237297964e-06, + "loss": 0.3247, + "step": 967 + }, + { + "epoch": 1.4592048238175994, + "grad_norm": 0.5362640814930834, + "learning_rate": 4.124278109893432e-06, + "loss": 0.3206, + "step": 968 + }, + { + "epoch": 1.4607122668174108, + "grad_norm": 0.5300069434968542, + "learning_rate": 4.10299014157462e-06, + "loss": 0.299, + "step": 969 + }, + { + "epoch": 1.4622197098172225, + "grad_norm": 0.5207197137299924, + "learning_rate": 4.0817430663959536e-06, + "loss": 0.2872, + "step": 970 + }, + { + "epoch": 1.4637271528170341, + "grad_norm": 0.5361880777046366, + "learning_rate": 4.06053703169645e-06, + "loss": 0.3432, + "step": 971 + }, + { + "epoch": 1.4652345958168458, + "grad_norm": 0.5390277129867954, + "learning_rate": 4.039372184530521e-06, + "loss": 0.3121, + "step": 972 + }, + { + "epoch": 1.4667420388166572, + "grad_norm": 0.5098624467494199, + "learning_rate": 
4.0182486716669656e-06, + "loss": 0.3057, + "step": 973 + }, + { + "epoch": 1.4682494818164689, + "grad_norm": 0.5530264319623549, + "learning_rate": 3.9971666395879605e-06, + "loss": 0.316, + "step": 974 + }, + { + "epoch": 1.4697569248162803, + "grad_norm": 0.5648165554049958, + "learning_rate": 3.9761262344880096e-06, + "loss": 0.3456, + "step": 975 + }, + { + "epoch": 1.471264367816092, + "grad_norm": 0.5376597362402104, + "learning_rate": 3.9551276022729644e-06, + "loss": 0.3075, + "step": 976 + }, + { + "epoch": 1.4727718108159036, + "grad_norm": 0.5207214018679573, + "learning_rate": 3.9341708885590034e-06, + "loss": 0.3043, + "step": 977 + }, + { + "epoch": 1.4742792538157152, + "grad_norm": 0.5346717454580582, + "learning_rate": 3.913256238671607e-06, + "loss": 0.3187, + "step": 978 + }, + { + "epoch": 1.4757866968155267, + "grad_norm": 0.5474625757974195, + "learning_rate": 3.89238379764457e-06, + "loss": 0.3341, + "step": 979 + }, + { + "epoch": 1.4772941398153383, + "grad_norm": 0.553265646517597, + "learning_rate": 3.871553710218988e-06, + "loss": 0.3615, + "step": 980 + }, + { + "epoch": 1.4788015828151497, + "grad_norm": 0.5178190237398634, + "learning_rate": 3.850766120842252e-06, + "loss": 0.3087, + "step": 981 + }, + { + "epoch": 1.4803090258149614, + "grad_norm": 0.5676605070232937, + "learning_rate": 3.830021173667048e-06, + "loss": 0.3331, + "step": 982 + }, + { + "epoch": 1.481816468814773, + "grad_norm": 0.5366490741054173, + "learning_rate": 3.809319012550352e-06, + "loss": 0.3134, + "step": 983 + }, + { + "epoch": 1.4833239118145845, + "grad_norm": 0.5237338303143243, + "learning_rate": 3.788659781052444e-06, + "loss": 0.3426, + "step": 984 + }, + { + "epoch": 1.484831354814396, + "grad_norm": 0.5118568891202759, + "learning_rate": 3.7680436224359084e-06, + "loss": 0.3049, + "step": 985 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.5474188971913913, + "learning_rate": 3.747470679664624e-06, + "loss": 0.3177, + "step": 986 + }, + { + "epoch": 1.4878462408140192, + "grad_norm": 0.5623337896836218, + "learning_rate": 3.7269410954028107e-06, + "loss": 0.3268, + "step": 987 + }, + { + "epoch": 1.4893536838138308, + "grad_norm": 0.5320249898828978, + "learning_rate": 3.706455012013994e-06, + "loss": 0.3135, + "step": 988 + }, + { + "epoch": 1.4908611268136425, + "grad_norm": 0.5258630499196119, + "learning_rate": 3.6860125715600513e-06, + "loss": 0.2922, + "step": 989 + }, + { + "epoch": 1.492368569813454, + "grad_norm": 0.5488691155290143, + "learning_rate": 3.665613915800217e-06, + "loss": 0.3093, + "step": 990 + }, + { + "epoch": 1.4938760128132655, + "grad_norm": 0.534561267695021, + "learning_rate": 3.6452591861900886e-06, + "loss": 0.3201, + "step": 991 + }, + { + "epoch": 1.495383455813077, + "grad_norm": 0.5493934402188156, + "learning_rate": 3.6249485238806637e-06, + "loss": 0.3258, + "step": 992 + }, + { + "epoch": 1.4968908988128886, + "grad_norm": 0.5213745241077384, + "learning_rate": 3.6046820697173514e-06, + "loss": 0.3206, + "step": 993 + }, + { + "epoch": 1.4983983418127003, + "grad_norm": 0.5189896345789112, + "learning_rate": 3.5844599642389965e-06, + "loss": 0.3093, + "step": 994 + }, + { + "epoch": 1.499905784812512, + "grad_norm": 0.535438446636319, + "learning_rate": 3.564282347676903e-06, + "loss": 0.3449, + "step": 995 + }, + { + "epoch": 1.5014132278123233, + "grad_norm": 0.5151601964534807, + "learning_rate": 3.54414935995387e-06, + "loss": 0.3002, + "step": 996 + }, + { + "epoch": 1.5029206708121348, + "grad_norm": 
0.5669837505751246, + "learning_rate": 3.524061140683206e-06, + "loss": 0.3367, + "step": 997 + }, + { + "epoch": 1.5044281138119464, + "grad_norm": 0.5376128901605735, + "learning_rate": 3.5040178291677816e-06, + "loss": 0.3362, + "step": 998 + }, + { + "epoch": 1.505935556811758, + "grad_norm": 0.5816182611830706, + "learning_rate": 3.4840195643990383e-06, + "loss": 0.316, + "step": 999 + }, + { + "epoch": 1.5074429998115697, + "grad_norm": 0.5333548248485912, + "learning_rate": 3.464066485056048e-06, + "loss": 0.3223, + "step": 1000 + }, + { + "epoch": 1.5089504428113814, + "grad_norm": 0.5574217586347463, + "learning_rate": 3.444158729504549e-06, + "loss": 0.2994, + "step": 1001 + }, + { + "epoch": 1.5104578858111928, + "grad_norm": 0.5272699065186945, + "learning_rate": 3.4242964357959597e-06, + "loss": 0.3152, + "step": 1002 + }, + { + "epoch": 1.5119653288110042, + "grad_norm": 0.5352889213452704, + "learning_rate": 3.4044797416664564e-06, + "loss": 0.3103, + "step": 1003 + }, + { + "epoch": 1.5134727718108159, + "grad_norm": 0.5602956634920077, + "learning_rate": 3.3847087845359996e-06, + "loss": 0.334, + "step": 1004 + }, + { + "epoch": 1.5149802148106275, + "grad_norm": 0.5402201680847467, + "learning_rate": 3.364983701507376e-06, + "loss": 0.3291, + "step": 1005 + }, + { + "epoch": 1.5164876578104391, + "grad_norm": 0.5526297524617138, + "learning_rate": 3.3453046293652657e-06, + "loss": 0.3232, + "step": 1006 + }, + { + "epoch": 1.5179951008102506, + "grad_norm": 0.5401106392320315, + "learning_rate": 3.3256717045752794e-06, + "loss": 0.3219, + "step": 1007 + }, + { + "epoch": 1.5195025438100622, + "grad_norm": 0.5589978845369276, + "learning_rate": 3.3060850632830167e-06, + "loss": 0.3215, + "step": 1008 + }, + { + "epoch": 1.5210099868098736, + "grad_norm": 0.5628427903081042, + "learning_rate": 3.286544841313126e-06, + "loss": 0.3042, + "step": 1009 + }, + { + "epoch": 1.5225174298096853, + "grad_norm": 0.5291974603976658, + "learning_rate": 3.2670511741683475e-06, + "loss": 0.3039, + "step": 1010 + }, + { + "epoch": 1.524024872809497, + "grad_norm": 0.5307771510625195, + "learning_rate": 3.2476041970285945e-06, + "loss": 0.3225, + "step": 1011 + }, + { + "epoch": 1.5255323158093086, + "grad_norm": 0.5798408138665074, + "learning_rate": 3.2282040447500063e-06, + "loss": 0.3574, + "step": 1012 + }, + { + "epoch": 1.52703975880912, + "grad_norm": 0.5262954379509106, + "learning_rate": 3.208850851863998e-06, + "loss": 0.3074, + "step": 1013 + }, + { + "epoch": 1.5285472018089314, + "grad_norm": 0.5411329822808086, + "learning_rate": 3.189544752576369e-06, + "loss": 0.3291, + "step": 1014 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.512225504454937, + "learning_rate": 3.1702858807663175e-06, + "loss": 0.2967, + "step": 1015 + }, + { + "epoch": 1.5315620878085547, + "grad_norm": 0.5358326980021074, + "learning_rate": 3.151074369985556e-06, + "loss": 0.3281, + "step": 1016 + }, + { + "epoch": 1.5330695308083664, + "grad_norm": 0.5412115741377782, + "learning_rate": 3.131910353457369e-06, + "loss": 0.3321, + "step": 1017 + }, + { + "epoch": 1.534576973808178, + "grad_norm": 0.5355770866583788, + "learning_rate": 3.112793964075681e-06, + "loss": 0.2999, + "step": 1018 + }, + { + "epoch": 1.5360844168079895, + "grad_norm": 0.53044658803981, + "learning_rate": 3.0937253344041507e-06, + "loss": 0.3271, + "step": 1019 + }, + { + "epoch": 1.5375918598078009, + "grad_norm": 0.5274519029189704, + "learning_rate": 3.074704596675242e-06, + "loss": 0.3174, + "step": 1020 + }, + { + 
"epoch": 1.5390993028076125, + "grad_norm": 0.5410440028748313, + "learning_rate": 3.055731882789311e-06, + "loss": 0.3268, + "step": 1021 + }, + { + "epoch": 1.5406067458074242, + "grad_norm": 0.5400373191606019, + "learning_rate": 3.0368073243136874e-06, + "loss": 0.325, + "step": 1022 + }, + { + "epoch": 1.5421141888072358, + "grad_norm": 0.5520146881220487, + "learning_rate": 3.0179310524817707e-06, + "loss": 0.32, + "step": 1023 + }, + { + "epoch": 1.5436216318070473, + "grad_norm": 0.5351759755594372, + "learning_rate": 2.9991031981921026e-06, + "loss": 0.3269, + "step": 1024 + }, + { + "epoch": 1.545129074806859, + "grad_norm": 0.5327689357374978, + "learning_rate": 2.9803238920074784e-06, + "loss": 0.3082, + "step": 1025 + }, + { + "epoch": 1.5466365178066703, + "grad_norm": 0.5326589519994432, + "learning_rate": 2.961593264154038e-06, + "loss": 0.3157, + "step": 1026 + }, + { + "epoch": 1.548143960806482, + "grad_norm": 0.5249678984746798, + "learning_rate": 2.9429114445203423e-06, + "loss": 0.3119, + "step": 1027 + }, + { + "epoch": 1.5496514038062936, + "grad_norm": 0.5606341770532942, + "learning_rate": 2.924278562656514e-06, + "loss": 0.3351, + "step": 1028 + }, + { + "epoch": 1.5511588468061053, + "grad_norm": 0.5156779633424778, + "learning_rate": 2.90569474777329e-06, + "loss": 0.3256, + "step": 1029 + }, + { + "epoch": 1.5526662898059167, + "grad_norm": 0.5374626464876353, + "learning_rate": 2.8871601287411634e-06, + "loss": 0.3303, + "step": 1030 + }, + { + "epoch": 1.5541737328057281, + "grad_norm": 0.5262890780017794, + "learning_rate": 2.8686748340894744e-06, + "loss": 0.3114, + "step": 1031 + }, + { + "epoch": 1.5556811758055398, + "grad_norm": 0.5260995538471516, + "learning_rate": 2.850238992005514e-06, + "loss": 0.2979, + "step": 1032 + }, + { + "epoch": 1.5571886188053514, + "grad_norm": 0.5573768187241204, + "learning_rate": 2.8318527303336465e-06, + "loss": 0.3475, + "step": 1033 + }, + { + "epoch": 1.558696061805163, + "grad_norm": 0.5542071850347167, + "learning_rate": 2.81351617657442e-06, + "loss": 0.3359, + "step": 1034 + }, + { + "epoch": 1.5602035048049747, + "grad_norm": 0.5376949139111594, + "learning_rate": 2.795229457883678e-06, + "loss": 0.3299, + "step": 1035 + }, + { + "epoch": 1.5617109478047861, + "grad_norm": 0.5213690515169962, + "learning_rate": 2.7769927010716814e-06, + "loss": 0.3187, + "step": 1036 + }, + { + "epoch": 1.5632183908045976, + "grad_norm": 0.5282868034684867, + "learning_rate": 2.7588060326022205e-06, + "loss": 0.3124, + "step": 1037 + }, + { + "epoch": 1.5647258338044092, + "grad_norm": 0.5629003594886647, + "learning_rate": 2.740669578591755e-06, + "loss": 0.3453, + "step": 1038 + }, + { + "epoch": 1.5662332768042209, + "grad_norm": 0.5452741440289394, + "learning_rate": 2.7225834648085282e-06, + "loss": 0.3148, + "step": 1039 + }, + { + "epoch": 1.5677407198040325, + "grad_norm": 0.5576848374307647, + "learning_rate": 2.7045478166716843e-06, + "loss": 0.3362, + "step": 1040 + }, + { + "epoch": 1.569248162803844, + "grad_norm": 0.5230478868120295, + "learning_rate": 2.6865627592504295e-06, + "loss": 0.3074, + "step": 1041 + }, + { + "epoch": 1.5707556058036556, + "grad_norm": 0.5476529275243367, + "learning_rate": 2.668628417263137e-06, + "loss": 0.314, + "step": 1042 + }, + { + "epoch": 1.572263048803467, + "grad_norm": 0.5340674210452238, + "learning_rate": 2.6507449150764852e-06, + "loss": 0.3035, + "step": 1043 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.5245422709481129, + "learning_rate": 
2.632912376704607e-06, + "loss": 0.3344, + "step": 1044 + }, + { + "epoch": 1.5752779348030903, + "grad_norm": 0.52661514981572, + "learning_rate": 2.615130925808228e-06, + "loss": 0.3054, + "step": 1045 + }, + { + "epoch": 1.576785377802902, + "grad_norm": 0.5147590149467712, + "learning_rate": 2.597400685693795e-06, + "loss": 0.2879, + "step": 1046 + }, + { + "epoch": 1.5782928208027134, + "grad_norm": 0.5476707768783776, + "learning_rate": 2.5797217793126373e-06, + "loss": 0.3395, + "step": 1047 + }, + { + "epoch": 1.5798002638025248, + "grad_norm": 0.5338841668417198, + "learning_rate": 2.5620943292601074e-06, + "loss": 0.3211, + "step": 1048 + }, + { + "epoch": 1.5813077068023365, + "grad_norm": 0.526860653464564, + "learning_rate": 2.5445184577747305e-06, + "loss": 0.3251, + "step": 1049 + }, + { + "epoch": 1.582815149802148, + "grad_norm": 0.5250152267933532, + "learning_rate": 2.52699428673736e-06, + "loss": 0.3126, + "step": 1050 + }, + { + "epoch": 1.5843225928019598, + "grad_norm": 0.5470636297967526, + "learning_rate": 2.5095219376703183e-06, + "loss": 0.3063, + "step": 1051 + }, + { + "epoch": 1.5858300358017714, + "grad_norm": 0.5581707395933467, + "learning_rate": 2.4921015317365794e-06, + "loss": 0.3624, + "step": 1052 + }, + { + "epoch": 1.5873374788015828, + "grad_norm": 0.5280009933911688, + "learning_rate": 2.4747331897389103e-06, + "loss": 0.3106, + "step": 1053 + }, + { + "epoch": 1.5888449218013942, + "grad_norm": 0.5234869653748981, + "learning_rate": 2.4574170321190305e-06, + "loss": 0.2956, + "step": 1054 + }, + { + "epoch": 1.590352364801206, + "grad_norm": 0.546217705596414, + "learning_rate": 2.440153178956798e-06, + "loss": 0.3215, + "step": 1055 + }, + { + "epoch": 1.5918598078010175, + "grad_norm": 0.5556302525952723, + "learning_rate": 2.42294174996935e-06, + "loss": 0.3204, + "step": 1056 + }, + { + "epoch": 1.5933672508008292, + "grad_norm": 0.5588880844097838, + "learning_rate": 2.40578286451029e-06, + "loss": 0.3282, + "step": 1057 + }, + { + "epoch": 1.5948746938006406, + "grad_norm": 0.5241614280996468, + "learning_rate": 2.38867664156886e-06, + "loss": 0.3255, + "step": 1058 + }, + { + "epoch": 1.5963821368004523, + "grad_norm": 0.5543274849783603, + "learning_rate": 2.3716231997691007e-06, + "loss": 0.3175, + "step": 1059 + }, + { + "epoch": 1.5978895798002637, + "grad_norm": 0.5306578564545272, + "learning_rate": 2.3546226573690444e-06, + "loss": 0.3211, + "step": 1060 + }, + { + "epoch": 1.5993970228000753, + "grad_norm": 0.5401209566379707, + "learning_rate": 2.3376751322599e-06, + "loss": 0.3117, + "step": 1061 + }, + { + "epoch": 1.600904465799887, + "grad_norm": 0.5339229576030943, + "learning_rate": 2.320780741965206e-06, + "loss": 0.3064, + "step": 1062 + }, + { + "epoch": 1.6024119087996986, + "grad_norm": 0.5291570037477905, + "learning_rate": 2.3039396036400463e-06, + "loss": 0.3001, + "step": 1063 + }, + { + "epoch": 1.60391935179951, + "grad_norm": 0.5544131085966325, + "learning_rate": 2.287151834070226e-06, + "loss": 0.3173, + "step": 1064 + }, + { + "epoch": 1.6054267947993215, + "grad_norm": 0.5042273491393638, + "learning_rate": 2.2704175496714552e-06, + "loss": 0.3035, + "step": 1065 + }, + { + "epoch": 1.6069342377991331, + "grad_norm": 0.5164264296676705, + "learning_rate": 2.2537368664885527e-06, + "loss": 0.306, + "step": 1066 + }, + { + "epoch": 1.6084416807989448, + "grad_norm": 0.540939444102417, + "learning_rate": 2.2371099001946385e-06, + "loss": 0.3417, + "step": 1067 + }, + { + "epoch": 1.6099491237987564, + 
"grad_norm": 0.5349172500611197, + "learning_rate": 2.2205367660903267e-06, + "loss": 0.3155, + "step": 1068 + }, + { + "epoch": 1.611456566798568, + "grad_norm": 0.5392150017492342, + "learning_rate": 2.2040175791029305e-06, + "loss": 0.334, + "step": 1069 + }, + { + "epoch": 1.6129640097983795, + "grad_norm": 0.5420224175155496, + "learning_rate": 2.187552453785662e-06, + "loss": 0.2981, + "step": 1070 + }, + { + "epoch": 1.614471452798191, + "grad_norm": 0.5385758816342323, + "learning_rate": 2.1711415043168395e-06, + "loss": 0.3313, + "step": 1071 + }, + { + "epoch": 1.6159788957980026, + "grad_norm": 0.5437131207841849, + "learning_rate": 2.1547848444991025e-06, + "loss": 0.3352, + "step": 1072 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.5395621366503963, + "learning_rate": 2.138482587758605e-06, + "loss": 0.308, + "step": 1073 + }, + { + "epoch": 1.6189937817976259, + "grad_norm": 0.5255773037738669, + "learning_rate": 2.1222348471442477e-06, + "loss": 0.3014, + "step": 1074 + }, + { + "epoch": 1.6205012247974373, + "grad_norm": 0.5226567653631905, + "learning_rate": 2.1060417353268845e-06, + "loss": 0.3143, + "step": 1075 + }, + { + "epoch": 1.622008667797249, + "grad_norm": 0.5449482690998529, + "learning_rate": 2.0899033645985423e-06, + "loss": 0.3091, + "step": 1076 + }, + { + "epoch": 1.6235161107970604, + "grad_norm": 0.5424238529202222, + "learning_rate": 2.073819846871646e-06, + "loss": 0.3185, + "step": 1077 + }, + { + "epoch": 1.625023553796872, + "grad_norm": 0.5311225228497766, + "learning_rate": 2.0577912936782317e-06, + "loss": 0.2983, + "step": 1078 + }, + { + "epoch": 1.6265309967966837, + "grad_norm": 0.5208053420833654, + "learning_rate": 2.041817816169187e-06, + "loss": 0.3295, + "step": 1079 + }, + { + "epoch": 1.6280384397964953, + "grad_norm": 0.5269145283569221, + "learning_rate": 2.025899525113474e-06, + "loss": 0.3026, + "step": 1080 + }, + { + "epoch": 1.6295458827963067, + "grad_norm": 0.5371643301644231, + "learning_rate": 2.010036530897359e-06, + "loss": 0.3196, + "step": 1081 + }, + { + "epoch": 1.6310533257961182, + "grad_norm": 0.5349956576564423, + "learning_rate": 1.9942289435236506e-06, + "loss": 0.3304, + "step": 1082 + }, + { + "epoch": 1.6325607687959298, + "grad_norm": 0.5604150538269126, + "learning_rate": 1.978476872610939e-06, + "loss": 0.3485, + "step": 1083 + }, + { + "epoch": 1.6340682117957415, + "grad_norm": 0.5245207862200475, + "learning_rate": 1.962780427392823e-06, + "loss": 0.2882, + "step": 1084 + }, + { + "epoch": 1.6355756547955531, + "grad_norm": 0.5195585896558206, + "learning_rate": 1.9471397167171714e-06, + "loss": 0.3051, + "step": 1085 + }, + { + "epoch": 1.6370830977953648, + "grad_norm": 0.5147063039454572, + "learning_rate": 1.931554849045355e-06, + "loss": 0.3078, + "step": 1086 + }, + { + "epoch": 1.6385905407951762, + "grad_norm": 0.5303520825987951, + "learning_rate": 1.916025932451493e-06, + "loss": 0.3141, + "step": 1087 + }, + { + "epoch": 1.6400979837949876, + "grad_norm": 0.5154838630662848, + "learning_rate": 1.9005530746217238e-06, + "loss": 0.2971, + "step": 1088 + }, + { + "epoch": 1.6416054267947993, + "grad_norm": 0.5537432078636199, + "learning_rate": 1.8851363828534253e-06, + "loss": 0.3124, + "step": 1089 + }, + { + "epoch": 1.643112869794611, + "grad_norm": 0.5634336334894083, + "learning_rate": 1.869775964054501e-06, + "loss": 0.3271, + "step": 1090 + }, + { + "epoch": 1.6446203127944226, + "grad_norm": 0.5433031560068617, + "learning_rate": 1.8544719247426224e-06, + "loss": 0.3191, + 
"step": 1091 + }, + { + "epoch": 1.646127755794234, + "grad_norm": 0.5357448136347239, + "learning_rate": 1.8392243710444911e-06, + "loss": 0.2982, + "step": 1092 + }, + { + "epoch": 1.6476351987940456, + "grad_norm": 0.5552897165798768, + "learning_rate": 1.8240334086951117e-06, + "loss": 0.3537, + "step": 1093 + }, + { + "epoch": 1.649142641793857, + "grad_norm": 0.5318934621576651, + "learning_rate": 1.8088991430370506e-06, + "loss": 0.3005, + "step": 1094 + }, + { + "epoch": 1.6506500847936687, + "grad_norm": 0.5465559179605479, + "learning_rate": 1.7938216790197071e-06, + "loss": 0.3207, + "step": 1095 + }, + { + "epoch": 1.6521575277934804, + "grad_norm": 0.5641671337079456, + "learning_rate": 1.77880112119859e-06, + "loss": 0.3095, + "step": 1096 + }, + { + "epoch": 1.653664970793292, + "grad_norm": 0.5270236586496325, + "learning_rate": 1.7638375737345804e-06, + "loss": 0.312, + "step": 1097 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.5192997218773957, + "learning_rate": 1.7489311403932274e-06, + "loss": 0.2937, + "step": 1098 + }, + { + "epoch": 1.656679856792915, + "grad_norm": 0.5620286825583494, + "learning_rate": 1.7340819245440166e-06, + "loss": 0.3186, + "step": 1099 + }, + { + "epoch": 1.6581872997927265, + "grad_norm": 0.5445642066374056, + "learning_rate": 1.7192900291596493e-06, + "loss": 0.3222, + "step": 1100 + }, + { + "epoch": 1.6596947427925381, + "grad_norm": 0.5157869374514513, + "learning_rate": 1.7045555568153415e-06, + "loss": 0.306, + "step": 1101 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.5138381277919514, + "learning_rate": 1.6898786096881104e-06, + "loss": 0.2715, + "step": 1102 + }, + { + "epoch": 1.6627096287921614, + "grad_norm": 0.5431676779116132, + "learning_rate": 1.6752592895560493e-06, + "loss": 0.3202, + "step": 1103 + }, + { + "epoch": 1.6642170717919729, + "grad_norm": 0.5396928068758252, + "learning_rate": 1.6606976977976408e-06, + "loss": 0.3122, + "step": 1104 + }, + { + "epoch": 1.6657245147917843, + "grad_norm": 0.5332820215409003, + "learning_rate": 1.6461939353910494e-06, + "loss": 0.3083, + "step": 1105 + }, + { + "epoch": 1.667231957791596, + "grad_norm": 0.5475766503326961, + "learning_rate": 1.631748102913412e-06, + "loss": 0.3166, + "step": 1106 + }, + { + "epoch": 1.6687394007914076, + "grad_norm": 0.5416380330717204, + "learning_rate": 1.6173603005401505e-06, + "loss": 0.3158, + "step": 1107 + }, + { + "epoch": 1.6702468437912192, + "grad_norm": 0.5416213794269614, + "learning_rate": 1.6030306280442764e-06, + "loss": 0.3077, + "step": 1108 + }, + { + "epoch": 1.6717542867910307, + "grad_norm": 0.5153535761957841, + "learning_rate": 1.588759184795694e-06, + "loss": 0.3064, + "step": 1109 + }, + { + "epoch": 1.6732617297908423, + "grad_norm": 0.5315610226872074, + "learning_rate": 1.574546069760514e-06, + "loss": 0.3241, + "step": 1110 + }, + { + "epoch": 1.6747691727906537, + "grad_norm": 0.5684878986820251, + "learning_rate": 1.5603913815003634e-06, + "loss": 0.3403, + "step": 1111 + }, + { + "epoch": 1.6762766157904654, + "grad_norm": 0.5361918937068931, + "learning_rate": 1.5462952181717117e-06, + "loss": 0.3157, + "step": 1112 + }, + { + "epoch": 1.677784058790277, + "grad_norm": 0.5495570916809654, + "learning_rate": 1.532257677525183e-06, + "loss": 0.3224, + "step": 1113 + }, + { + "epoch": 1.6792915017900887, + "grad_norm": 0.5281943000093583, + "learning_rate": 1.5182788569048689e-06, + "loss": 0.3209, + "step": 1114 + }, + { + "epoch": 1.6807989447899, + "grad_norm": 0.5572787989841019, + 
"learning_rate": 1.5043588532476827e-06, + "loss": 0.3663, + "step": 1115 + }, + { + "epoch": 1.6823063877897118, + "grad_norm": 0.5228968415248135, + "learning_rate": 1.49049776308265e-06, + "loss": 0.2889, + "step": 1116 + }, + { + "epoch": 1.6838138307895232, + "grad_norm": 0.5220477714238411, + "learning_rate": 1.476695682530268e-06, + "loss": 0.3031, + "step": 1117 + }, + { + "epoch": 1.6853212737893348, + "grad_norm": 0.5351071420566498, + "learning_rate": 1.4629527073018267e-06, + "loss": 0.3308, + "step": 1118 + }, + { + "epoch": 1.6868287167891465, + "grad_norm": 0.5396208371722178, + "learning_rate": 1.449268932698743e-06, + "loss": 0.2971, + "step": 1119 + }, + { + "epoch": 1.6883361597889581, + "grad_norm": 0.5416569763589452, + "learning_rate": 1.4356444536119085e-06, + "loss": 0.3024, + "step": 1120 + }, + { + "epoch": 1.6898436027887695, + "grad_norm": 0.5370439069377987, + "learning_rate": 1.422079364521024e-06, + "loss": 0.3169, + "step": 1121 + }, + { + "epoch": 1.691351045788581, + "grad_norm": 0.5450238281058462, + "learning_rate": 1.4085737594939497e-06, + "loss": 0.333, + "step": 1122 + }, + { + "epoch": 1.6928584887883926, + "grad_norm": 0.5168406644621856, + "learning_rate": 1.3951277321860468e-06, + "loss": 0.3006, + "step": 1123 + }, + { + "epoch": 1.6943659317882043, + "grad_norm": 0.5087028192552481, + "learning_rate": 1.381741375839537e-06, + "loss": 0.2664, + "step": 1124 + }, + { + "epoch": 1.695873374788016, + "grad_norm": 0.5165999383002566, + "learning_rate": 1.3684147832828409e-06, + "loss": 0.281, + "step": 1125 + }, + { + "epoch": 1.6973808177878273, + "grad_norm": 0.5150348541332692, + "learning_rate": 1.355148046929956e-06, + "loss": 0.307, + "step": 1126 + }, + { + "epoch": 1.698888260787639, + "grad_norm": 0.5168141041103775, + "learning_rate": 1.3419412587797908e-06, + "loss": 0.293, + "step": 1127 + }, + { + "epoch": 1.7003957037874504, + "grad_norm": 0.5133973098786774, + "learning_rate": 1.3287945104155487e-06, + "loss": 0.3015, + "step": 1128 + }, + { + "epoch": 1.701903146787262, + "grad_norm": 0.5513676691974454, + "learning_rate": 1.3157078930040856e-06, + "loss": 0.3179, + "step": 1129 + }, + { + "epoch": 1.7034105897870737, + "grad_norm": 0.546503387891844, + "learning_rate": 1.3026814972952674e-06, + "loss": 0.3043, + "step": 1130 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.5133928571679112, + "learning_rate": 1.2897154136213542e-06, + "loss": 0.2938, + "step": 1131 + }, + { + "epoch": 1.7064254757866968, + "grad_norm": 0.5441429881648664, + "learning_rate": 1.2768097318963701e-06, + "loss": 0.2978, + "step": 1132 + }, + { + "epoch": 1.7079329187865084, + "grad_norm": 0.5430821498300733, + "learning_rate": 1.2639645416154744e-06, + "loss": 0.3204, + "step": 1133 + }, + { + "epoch": 1.7094403617863199, + "grad_norm": 0.5536392631462185, + "learning_rate": 1.2511799318543493e-06, + "loss": 0.3196, + "step": 1134 + }, + { + "epoch": 1.7109478047861315, + "grad_norm": 0.5403946840340975, + "learning_rate": 1.2384559912685768e-06, + "loss": 0.3156, + "step": 1135 + }, + { + "epoch": 1.7124552477859432, + "grad_norm": 0.5360872549447121, + "learning_rate": 1.2257928080930236e-06, + "loss": 0.3275, + "step": 1136 + }, + { + "epoch": 1.7139626907857548, + "grad_norm": 0.5482125645711281, + "learning_rate": 1.2131904701412345e-06, + "loss": 0.3041, + "step": 1137 + }, + { + "epoch": 1.7154701337855662, + "grad_norm": 0.5497152879102077, + "learning_rate": 1.2006490648048118e-06, + "loss": 0.3148, + "step": 1138 + }, + { + "epoch": 
1.7169775767853777, + "grad_norm": 0.5141035384965538, + "learning_rate": 1.1881686790528279e-06, + "loss": 0.3068, + "step": 1139 + }, + { + "epoch": 1.7184850197851893, + "grad_norm": 0.529010142033819, + "learning_rate": 1.1757493994312052e-06, + "loss": 0.3088, + "step": 1140 + }, + { + "epoch": 1.719992462785001, + "grad_norm": 0.5495066854090749, + "learning_rate": 1.1633913120621188e-06, + "loss": 0.3236, + "step": 1141 + }, + { + "epoch": 1.7214999057848126, + "grad_norm": 0.5492447937245492, + "learning_rate": 1.151094502643414e-06, + "loss": 0.3308, + "step": 1142 + }, + { + "epoch": 1.723007348784624, + "grad_norm": 0.5562630282965828, + "learning_rate": 1.1388590564479895e-06, + "loss": 0.299, + "step": 1143 + }, + { + "epoch": 1.7245147917844357, + "grad_norm": 0.5231751271166386, + "learning_rate": 1.1266850583232224e-06, + "loss": 0.3053, + "step": 1144 + }, + { + "epoch": 1.726022234784247, + "grad_norm": 0.5201419013982067, + "learning_rate": 1.1145725926903772e-06, + "loss": 0.3023, + "step": 1145 + }, + { + "epoch": 1.7275296777840587, + "grad_norm": 0.511856980805046, + "learning_rate": 1.1025217435440116e-06, + "loss": 0.2867, + "step": 1146 + }, + { + "epoch": 1.7290371207838704, + "grad_norm": 0.565983997374927, + "learning_rate": 1.0905325944514034e-06, + "loss": 0.3232, + "step": 1147 + }, + { + "epoch": 1.730544563783682, + "grad_norm": 0.5363045072212188, + "learning_rate": 1.078605228551971e-06, + "loss": 0.3182, + "step": 1148 + }, + { + "epoch": 1.7320520067834935, + "grad_norm": 0.5329886787330583, + "learning_rate": 1.0667397285566893e-06, + "loss": 0.3061, + "step": 1149 + }, + { + "epoch": 1.7335594497833051, + "grad_norm": 0.5397864180847504, + "learning_rate": 1.0549361767475241e-06, + "loss": 0.2873, + "step": 1150 + }, + { + "epoch": 1.7350668927831165, + "grad_norm": 0.5436027401118747, + "learning_rate": 1.0431946549768567e-06, + "loss": 0.3213, + "step": 1151 + }, + { + "epoch": 1.7365743357829282, + "grad_norm": 0.5304426225729307, + "learning_rate": 1.0315152446669142e-06, + "loss": 0.295, + "step": 1152 + }, + { + "epoch": 1.7380817787827398, + "grad_norm": 0.5119724476906113, + "learning_rate": 1.019898026809214e-06, + "loss": 0.3009, + "step": 1153 + }, + { + "epoch": 1.7395892217825515, + "grad_norm": 0.5399752438286287, + "learning_rate": 1.0083430819639962e-06, + "loss": 0.3097, + "step": 1154 + }, + { + "epoch": 1.741096664782363, + "grad_norm": 0.5329124149971953, + "learning_rate": 9.968504902596566e-07, + "loss": 0.3094, + "step": 1155 + }, + { + "epoch": 1.7426041077821743, + "grad_norm": 0.5265575018375785, + "learning_rate": 9.85420331392214e-07, + "loss": 0.3001, + "step": 1156 + }, + { + "epoch": 1.744111550781986, + "grad_norm": 0.5415027063140824, + "learning_rate": 9.74052684624731e-07, + "loss": 0.3052, + "step": 1157 + }, + { + "epoch": 1.7456189937817976, + "grad_norm": 0.5273083269054069, + "learning_rate": 9.62747628786782e-07, + "loss": 0.2918, + "step": 1158 + }, + { + "epoch": 1.7471264367816093, + "grad_norm": 0.5467675396074031, + "learning_rate": 9.515052422739035e-07, + "loss": 0.3013, + "step": 1159 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.5012647001758278, + "learning_rate": 9.403256030470386e-07, + "loss": 0.2922, + "step": 1160 + }, + { + "epoch": 1.7501413227812324, + "grad_norm": 0.5473662670334606, + "learning_rate": 9.292087886320166e-07, + "loss": 0.3275, + "step": 1161 + }, + { + "epoch": 1.7516487657810438, + "grad_norm": 0.5119711645632107, + "learning_rate": 9.181548761189996e-07, + "loss": 
0.2997, + "step": 1162 + }, + { + "epoch": 1.7531562087808554, + "grad_norm": 0.5579153837729429, + "learning_rate": 9.071639421619527e-07, + "loss": 0.3373, + "step": 1163 + }, + { + "epoch": 1.754663651780667, + "grad_norm": 0.5369066551498206, + "learning_rate": 8.962360629781164e-07, + "loss": 0.3013, + "step": 1164 + }, + { + "epoch": 1.7561710947804787, + "grad_norm": 0.5299407293801213, + "learning_rate": 8.853713143474685e-07, + "loss": 0.2977, + "step": 1165 + }, + { + "epoch": 1.7576785377802902, + "grad_norm": 0.5246411563266595, + "learning_rate": 8.745697716122081e-07, + "loss": 0.3192, + "step": 1166 + }, + { + "epoch": 1.7591859807801018, + "grad_norm": 0.5450557723814945, + "learning_rate": 8.638315096762318e-07, + "loss": 0.3075, + "step": 1167 + }, + { + "epoch": 1.7606934237799132, + "grad_norm": 0.5392032704205785, + "learning_rate": 8.531566030046035e-07, + "loss": 0.3231, + "step": 1168 + }, + { + "epoch": 1.7622008667797249, + "grad_norm": 0.5370373229347402, + "learning_rate": 8.425451256230588e-07, + "loss": 0.3012, + "step": 1169 + }, + { + "epoch": 1.7637083097795365, + "grad_norm": 0.5405109358545394, + "learning_rate": 8.319971511174718e-07, + "loss": 0.3165, + "step": 1170 + }, + { + "epoch": 1.7652157527793482, + "grad_norm": 0.528010212076121, + "learning_rate": 8.215127526333499e-07, + "loss": 0.3236, + "step": 1171 + }, + { + "epoch": 1.7667231957791596, + "grad_norm": 0.5226712118154457, + "learning_rate": 8.110920028753355e-07, + "loss": 0.3088, + "step": 1172 + }, + { + "epoch": 1.768230638778971, + "grad_norm": 0.5574504221106463, + "learning_rate": 8.007349741066939e-07, + "loss": 0.3258, + "step": 1173 + }, + { + "epoch": 1.7697380817787827, + "grad_norm": 0.5452638755092976, + "learning_rate": 7.904417381488083e-07, + "loss": 0.3167, + "step": 1174 + }, + { + "epoch": 1.7712455247785943, + "grad_norm": 0.5517680932401637, + "learning_rate": 7.802123663806938e-07, + "loss": 0.3227, + "step": 1175 + }, + { + "epoch": 1.772752967778406, + "grad_norm": 0.5480833770797733, + "learning_rate": 7.700469297384927e-07, + "loss": 0.3307, + "step": 1176 + }, + { + "epoch": 1.7742604107782176, + "grad_norm": 0.5430614726412718, + "learning_rate": 7.599454987149868e-07, + "loss": 0.3463, + "step": 1177 + }, + { + "epoch": 1.775767853778029, + "grad_norm": 0.549255722400039, + "learning_rate": 7.499081433591071e-07, + "loss": 0.3284, + "step": 1178 + }, + { + "epoch": 1.7772752967778405, + "grad_norm": 0.5410909371721678, + "learning_rate": 7.399349332754458e-07, + "loss": 0.315, + "step": 1179 + }, + { + "epoch": 1.778782739777652, + "grad_norm": 0.530772119547457, + "learning_rate": 7.300259376237795e-07, + "loss": 0.3101, + "step": 1180 + }, + { + "epoch": 1.7802901827774638, + "grad_norm": 0.5484763677836378, + "learning_rate": 7.201812251185869e-07, + "loss": 0.3192, + "step": 1181 + }, + { + "epoch": 1.7817976257772754, + "grad_norm": 0.5299690672941552, + "learning_rate": 7.104008640285642e-07, + "loss": 0.3115, + "step": 1182 + }, + { + "epoch": 1.7833050687770868, + "grad_norm": 0.5424348226189657, + "learning_rate": 7.006849221761736e-07, + "loss": 0.3119, + "step": 1183 + }, + { + "epoch": 1.7848125117768985, + "grad_norm": 0.5221149596372863, + "learning_rate": 6.910334669371433e-07, + "loss": 0.3078, + "step": 1184 + }, + { + "epoch": 1.78631995477671, + "grad_norm": 0.5581110749980865, + "learning_rate": 6.814465652400237e-07, + "loss": 0.3364, + "step": 1185 + }, + { + "epoch": 1.7878273977765216, + "grad_norm": 0.5227271898985753, + 
"learning_rate": 6.719242835657147e-07, + "loss": 0.3057, + "step": 1186 + }, + { + "epoch": 1.7893348407763332, + "grad_norm": 0.5541663205023336, + "learning_rate": 6.62466687947001e-07, + "loss": 0.335, + "step": 1187 + }, + { + "epoch": 1.7908422837761448, + "grad_norm": 0.5269336686543489, + "learning_rate": 6.530738439681017e-07, + "loss": 0.3151, + "step": 1188 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 0.5786593133724774, + "learning_rate": 6.437458167642164e-07, + "loss": 0.3366, + "step": 1189 + }, + { + "epoch": 1.7938571697757677, + "grad_norm": 0.5253461874134103, + "learning_rate": 6.344826710210584e-07, + "loss": 0.3127, + "step": 1190 + }, + { + "epoch": 1.7953646127755793, + "grad_norm": 0.5299856089834871, + "learning_rate": 6.252844709744255e-07, + "loss": 0.3135, + "step": 1191 + }, + { + "epoch": 1.796872055775391, + "grad_norm": 0.521456043233, + "learning_rate": 6.161512804097436e-07, + "loss": 0.2977, + "step": 1192 + }, + { + "epoch": 1.7983794987752026, + "grad_norm": 0.5469215546867617, + "learning_rate": 6.070831626616236e-07, + "loss": 0.3255, + "step": 1193 + }, + { + "epoch": 1.7998869417750143, + "grad_norm": 0.5228672904607026, + "learning_rate": 5.980801806134318e-07, + "loss": 0.2866, + "step": 1194 + }, + { + "epoch": 1.8013943847748257, + "grad_norm": 0.5146912409802952, + "learning_rate": 5.891423966968413e-07, + "loss": 0.2881, + "step": 1195 + }, + { + "epoch": 1.8029018277746371, + "grad_norm": 0.5474095770320819, + "learning_rate": 5.80269872891408e-07, + "loss": 0.3036, + "step": 1196 + }, + { + "epoch": 1.8044092707744488, + "grad_norm": 0.5298374503272721, + "learning_rate": 5.714626707241411e-07, + "loss": 0.3032, + "step": 1197 + }, + { + "epoch": 1.8059167137742604, + "grad_norm": 0.5577685295614593, + "learning_rate": 5.627208512690641e-07, + "loss": 0.3136, + "step": 1198 + }, + { + "epoch": 1.807424156774072, + "grad_norm": 0.5641313668289314, + "learning_rate": 5.5404447514681e-07, + "loss": 0.3057, + "step": 1199 + }, + { + "epoch": 1.8089315997738835, + "grad_norm": 0.5295651592847044, + "learning_rate": 5.45433602524188e-07, + "loss": 0.292, + "step": 1200 + }, + { + "epoch": 1.8104390427736952, + "grad_norm": 0.5164790354946905, + "learning_rate": 5.368882931137675e-07, + "loss": 0.3171, + "step": 1201 + }, + { + "epoch": 1.8119464857735066, + "grad_norm": 0.5440394178727653, + "learning_rate": 5.284086061734672e-07, + "loss": 0.3389, + "step": 1202 + }, + { + "epoch": 1.8134539287733182, + "grad_norm": 0.5379140129646219, + "learning_rate": 5.199946005061462e-07, + "loss": 0.3191, + "step": 1203 + }, + { + "epoch": 1.8149613717731299, + "grad_norm": 0.5586596394798488, + "learning_rate": 5.116463344591893e-07, + "loss": 0.3297, + "step": 1204 + }, + { + "epoch": 1.8164688147729415, + "grad_norm": 0.5460139307968361, + "learning_rate": 5.033638659241102e-07, + "loss": 0.3179, + "step": 1205 + }, + { + "epoch": 1.817976257772753, + "grad_norm": 0.5099561737975997, + "learning_rate": 4.951472523361401e-07, + "loss": 0.2881, + "step": 1206 + }, + { + "epoch": 1.8194837007725644, + "grad_norm": 0.5127038476010487, + "learning_rate": 4.869965506738416e-07, + "loss": 0.301, + "step": 1207 + }, + { + "epoch": 1.820991143772376, + "grad_norm": 0.5160382306170839, + "learning_rate": 4.789118174587071e-07, + "loss": 0.2951, + "step": 1208 + }, + { + "epoch": 1.8224985867721877, + "grad_norm": 0.5368235750006268, + "learning_rate": 4.7089310875475856e-07, + "loss": 0.3244, + "step": 1209 + }, + { + "epoch": 1.8240060297719993, + 
"grad_norm": 0.5486310091200752, + "learning_rate": 4.6294048016817917e-07, + "loss": 0.3357, + "step": 1210 + }, + { + "epoch": 1.825513472771811, + "grad_norm": 0.5531427959479509, + "learning_rate": 4.550539868469106e-07, + "loss": 0.3404, + "step": 1211 + }, + { + "epoch": 1.8270209157716224, + "grad_norm": 0.5715387730686571, + "learning_rate": 4.4723368348027375e-07, + "loss": 0.3172, + "step": 1212 + }, + { + "epoch": 1.8285283587714338, + "grad_norm": 0.5208566706980682, + "learning_rate": 4.394796242985933e-07, + "loss": 0.3334, + "step": 1213 + }, + { + "epoch": 1.8300358017712455, + "grad_norm": 0.5088911051543478, + "learning_rate": 4.317918630728235e-07, + "loss": 0.3022, + "step": 1214 + }, + { + "epoch": 1.8315432447710571, + "grad_norm": 0.5459330178466746, + "learning_rate": 4.241704531141633e-07, + "loss": 0.3192, + "step": 1215 + }, + { + "epoch": 1.8330506877708688, + "grad_norm": 0.5256076032695434, + "learning_rate": 4.166154472737061e-07, + "loss": 0.2962, + "step": 1216 + }, + { + "epoch": 1.8345581307706802, + "grad_norm": 0.525111212488327, + "learning_rate": 4.091268979420537e-07, + "loss": 0.3015, + "step": 1217 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.5131054923837834, + "learning_rate": 4.0170485704896453e-07, + "loss": 0.2984, + "step": 1218 + }, + { + "epoch": 1.8375730167703033, + "grad_norm": 0.5222731798701179, + "learning_rate": 3.943493760629924e-07, + "loss": 0.3007, + "step": 1219 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.526864271558084, + "learning_rate": 3.8706050599112363e-07, + "loss": 0.3265, + "step": 1220 + }, + { + "epoch": 1.8405879027699266, + "grad_norm": 0.5429845817805197, + "learning_rate": 3.798382973784298e-07, + "loss": 0.3126, + "step": 1221 + }, + { + "epoch": 1.8420953457697382, + "grad_norm": 0.5203043666384023, + "learning_rate": 3.7268280030771655e-07, + "loss": 0.3005, + "step": 1222 + }, + { + "epoch": 1.8436027887695496, + "grad_norm": 0.5553304676785623, + "learning_rate": 3.655940643991718e-07, + "loss": 0.3033, + "step": 1223 + }, + { + "epoch": 1.845110231769361, + "grad_norm": 0.5453329521030166, + "learning_rate": 3.585721388100283e-07, + "loss": 0.3197, + "step": 1224 + }, + { + "epoch": 1.8466176747691727, + "grad_norm": 0.5358448339719871, + "learning_rate": 3.516170722342127e-07, + "loss": 0.3123, + "step": 1225 + }, + { + "epoch": 1.8481251177689844, + "grad_norm": 0.5255670215605667, + "learning_rate": 3.4472891290201927e-07, + "loss": 0.3052, + "step": 1226 + }, + { + "epoch": 1.849632560768796, + "grad_norm": 0.5558001652650641, + "learning_rate": 3.3790770857976995e-07, + "loss": 0.326, + "step": 1227 + }, + { + "epoch": 1.8511400037686077, + "grad_norm": 0.5383925013665675, + "learning_rate": 3.3115350656948043e-07, + "loss": 0.3074, + "step": 1228 + }, + { + "epoch": 1.852647446768419, + "grad_norm": 0.537705868540796, + "learning_rate": 3.2446635370853686e-07, + "loss": 0.3304, + "step": 1229 + }, + { + "epoch": 1.8541548897682305, + "grad_norm": 0.5382781367285551, + "learning_rate": 3.1784629636937404e-07, + "loss": 0.2883, + "step": 1230 + }, + { + "epoch": 1.8556623327680422, + "grad_norm": 0.5198656289929648, + "learning_rate": 3.1129338045914004e-07, + "loss": 0.3067, + "step": 1231 + }, + { + "epoch": 1.8571697757678538, + "grad_norm": 0.5472844326917599, + "learning_rate": 3.0480765141939316e-07, + "loss": 0.2992, + "step": 1232 + }, + { + "epoch": 1.8586772187676655, + "grad_norm": 0.5507075256404199, + "learning_rate": 2.9838915422578e-07, + "loss": 0.3179, + "step": 
1233 + }, + { + "epoch": 1.8601846617674769, + "grad_norm": 0.5510217424809797, + "learning_rate": 2.920379333877221e-07, + "loss": 0.2994, + "step": 1234 + }, + { + "epoch": 1.8616921047672885, + "grad_norm": 0.5400374711049234, + "learning_rate": 2.8575403294811123e-07, + "loss": 0.32, + "step": 1235 + }, + { + "epoch": 1.8631995477671, + "grad_norm": 0.5378450793044806, + "learning_rate": 2.795374964830022e-07, + "loss": 0.2982, + "step": 1236 + }, + { + "epoch": 1.8647069907669116, + "grad_norm": 0.5537869321901812, + "learning_rate": 2.733883671013082e-07, + "loss": 0.3215, + "step": 1237 + }, + { + "epoch": 1.8662144337667232, + "grad_norm": 0.5214658307160525, + "learning_rate": 2.673066874445096e-07, + "loss": 0.2967, + "step": 1238 + }, + { + "epoch": 1.867721876766535, + "grad_norm": 0.5396057264477051, + "learning_rate": 2.612924996863453e-07, + "loss": 0.3323, + "step": 1239 + }, + { + "epoch": 1.8692293197663463, + "grad_norm": 0.5029335509841266, + "learning_rate": 2.5534584553253526e-07, + "loss": 0.2874, + "step": 1240 + }, + { + "epoch": 1.8707367627661577, + "grad_norm": 0.5258804019291271, + "learning_rate": 2.494667662204797e-07, + "loss": 0.2899, + "step": 1241 + }, + { + "epoch": 1.8722442057659694, + "grad_norm": 0.5248481914254555, + "learning_rate": 2.436553025189758e-07, + "loss": 0.3024, + "step": 1242 + }, + { + "epoch": 1.873751648765781, + "grad_norm": 0.5561247405618174, + "learning_rate": 2.3791149472794373e-07, + "loss": 0.3224, + "step": 1243 + }, + { + "epoch": 1.8752590917655927, + "grad_norm": 0.5386355445772364, + "learning_rate": 2.3223538267813317e-07, + "loss": 0.3252, + "step": 1244 + }, + { + "epoch": 1.8767665347654043, + "grad_norm": 0.5387316814949316, + "learning_rate": 2.2662700573085505e-07, + "loss": 0.3188, + "step": 1245 + }, + { + "epoch": 1.8782739777652158, + "grad_norm": 0.5140491567851894, + "learning_rate": 2.2108640277771153e-07, + "loss": 0.3087, + "step": 1246 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.5578574961077984, + "learning_rate": 2.156136122403174e-07, + "loss": 0.3339, + "step": 1247 + }, + { + "epoch": 1.8812888637648388, + "grad_norm": 0.531923059262347, + "learning_rate": 2.1020867207004026e-07, + "loss": 0.302, + "step": 1248 + }, + { + "epoch": 1.8827963067646505, + "grad_norm": 0.5199091046599704, + "learning_rate": 2.048716197477374e-07, + "loss": 0.3, + "step": 1249 + }, + { + "epoch": 1.8843037497644621, + "grad_norm": 0.5292062225255757, + "learning_rate": 1.996024922834905e-07, + "loss": 0.3136, + "step": 1250 + }, + { + "epoch": 1.8858111927642736, + "grad_norm": 0.5116718173644801, + "learning_rate": 1.9440132621635687e-07, + "loss": 0.3022, + "step": 1251 + }, + { + "epoch": 1.8873186357640852, + "grad_norm": 0.5311851666913588, + "learning_rate": 1.8926815761410867e-07, + "loss": 0.3119, + "step": 1252 + }, + { + "epoch": 1.8888260787638966, + "grad_norm": 0.5164961460809835, + "learning_rate": 1.8420302207298623e-07, + "loss": 0.3366, + "step": 1253 + }, + { + "epoch": 1.8903335217637083, + "grad_norm": 0.504315215516738, + "learning_rate": 1.792059547174507e-07, + "loss": 0.2975, + "step": 1254 + }, + { + "epoch": 1.89184096476352, + "grad_norm": 0.5345352832708335, + "learning_rate": 1.7427699019994415e-07, + "loss": 0.3236, + "step": 1255 + }, + { + "epoch": 1.8933484077633316, + "grad_norm": 0.5381742517110331, + "learning_rate": 1.6941616270063854e-07, + "loss": 0.3279, + "step": 1256 + }, + { + "epoch": 1.894855850763143, + "grad_norm": 0.5375566393713683, + "learning_rate": 
1.6462350592721498e-07, + "loss": 0.3362, + "step": 1257 + }, + { + "epoch": 1.8963632937629544, + "grad_norm": 0.5285514266127366, + "learning_rate": 1.5989905311461274e-07, + "loss": 0.3204, + "step": 1258 + }, + { + "epoch": 1.897870736762766, + "grad_norm": 0.5414536639413304, + "learning_rate": 1.5524283702481158e-07, + "loss": 0.3335, + "step": 1259 + }, + { + "epoch": 1.8993781797625777, + "grad_norm": 0.512538356667461, + "learning_rate": 1.5065488994659983e-07, + "loss": 0.3053, + "step": 1260 + }, + { + "epoch": 1.9008856227623894, + "grad_norm": 0.5324212249495981, + "learning_rate": 1.461352436953478e-07, + "loss": 0.3072, + "step": 1261 + }, + { + "epoch": 1.902393065762201, + "grad_norm": 0.5455697348576503, + "learning_rate": 1.4168392961279254e-07, + "loss": 0.3316, + "step": 1262 + }, + { + "epoch": 1.9039005087620124, + "grad_norm": 0.5466375519251029, + "learning_rate": 1.3730097856681668e-07, + "loss": 0.3226, + "step": 1263 + }, + { + "epoch": 1.9054079517618239, + "grad_norm": 0.5312632713929628, + "learning_rate": 1.329864209512377e-07, + "loss": 0.315, + "step": 1264 + }, + { + "epoch": 1.9069153947616355, + "grad_norm": 0.5425648068314173, + "learning_rate": 1.2874028668559247e-07, + "loss": 0.3235, + "step": 1265 + }, + { + "epoch": 1.9084228377614472, + "grad_norm": 0.5312642091039448, + "learning_rate": 1.245626052149318e-07, + "loss": 0.3203, + "step": 1266 + }, + { + "epoch": 1.9099302807612588, + "grad_norm": 0.532495465640754, + "learning_rate": 1.2045340550961958e-07, + "loss": 0.3155, + "step": 1267 + }, + { + "epoch": 1.9114377237610702, + "grad_norm": 0.5246778980321247, + "learning_rate": 1.164127160651285e-07, + "loss": 0.2926, + "step": 1268 + }, + { + "epoch": 1.9129451667608819, + "grad_norm": 0.5339514500193528, + "learning_rate": 1.1244056490184008e-07, + "loss": 0.3029, + "step": 1269 + }, + { + "epoch": 1.9144526097606933, + "grad_norm": 0.520828858822998, + "learning_rate": 1.0853697956485942e-07, + "loss": 0.3065, + "step": 1270 + }, + { + "epoch": 1.915960052760505, + "grad_norm": 0.520817868672033, + "learning_rate": 1.0470198712381086e-07, + "loss": 0.307, + "step": 1271 + }, + { + "epoch": 1.9174674957603166, + "grad_norm": 0.516414932582989, + "learning_rate": 1.009356141726614e-07, + "loss": 0.3101, + "step": 1272 + }, + { + "epoch": 1.9189749387601283, + "grad_norm": 0.549210829131398, + "learning_rate": 9.723788682953539e-08, + "loss": 0.3562, + "step": 1273 + }, + { + "epoch": 1.9204823817599397, + "grad_norm": 0.5457067373758283, + "learning_rate": 9.360883073652238e-08, + "loss": 0.3179, + "step": 1274 + }, + { + "epoch": 1.921989824759751, + "grad_norm": 0.5418508804321499, + "learning_rate": 9.004847105951509e-08, + "loss": 0.3159, + "step": 1275 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.5366441286826634, + "learning_rate": 8.655683248802282e-08, + "loss": 0.2996, + "step": 1276 + }, + { + "epoch": 1.9250047107593744, + "grad_norm": 0.5442333602669928, + "learning_rate": 8.313393923500613e-08, + "loss": 0.3088, + "step": 1277 + }, + { + "epoch": 1.926512153759186, + "grad_norm": 0.5391901662166373, + "learning_rate": 7.977981503670795e-08, + "loss": 0.3061, + "step": 1278 + }, + { + "epoch": 1.9280195967589977, + "grad_norm": 0.5435340810409717, + "learning_rate": 7.64944831524872e-08, + "loss": 0.3285, + "step": 1279 + }, + { + "epoch": 1.9295270397588091, + "grad_norm": 0.521864945549257, + "learning_rate": 7.327796636465767e-08, + "loss": 0.3076, + "step": 1280 + }, + { + "epoch": 1.9310344827586206, + 
"grad_norm": 0.5514257916288331, + "learning_rate": 7.01302869783338e-08, + "loss": 0.3094, + "step": 1281 + }, + { + "epoch": 1.9325419257584322, + "grad_norm": 0.5219787357249853, + "learning_rate": 6.705146682127184e-08, + "loss": 0.2995, + "step": 1282 + }, + { + "epoch": 1.9340493687582438, + "grad_norm": 0.5276229371813537, + "learning_rate": 6.404152724371892e-08, + "loss": 0.3091, + "step": 1283 + }, + { + "epoch": 1.9355568117580555, + "grad_norm": 0.5314226178684127, + "learning_rate": 6.110048911826871e-08, + "loss": 0.3158, + "step": 1284 + }, + { + "epoch": 1.937064254757867, + "grad_norm": 0.5502887577785275, + "learning_rate": 5.82283728397115e-08, + "loss": 0.3215, + "step": 1285 + }, + { + "epoch": 1.9385716977576786, + "grad_norm": 0.5486357521857442, + "learning_rate": 5.542519832489546e-08, + "loss": 0.3386, + "step": 1286 + }, + { + "epoch": 1.94007914075749, + "grad_norm": 0.5249846416693436, + "learning_rate": 5.269098501259007e-08, + "loss": 0.3097, + "step": 1287 + }, + { + "epoch": 1.9415865837573016, + "grad_norm": 0.5254444548298214, + "learning_rate": 5.002575186334735e-08, + "loss": 0.3357, + "step": 1288 + }, + { + "epoch": 1.9430940267571133, + "grad_norm": 0.5333212942639225, + "learning_rate": 4.742951735937418e-08, + "loss": 0.3051, + "step": 1289 + }, + { + "epoch": 1.944601469756925, + "grad_norm": 0.551663577132892, + "learning_rate": 4.490229950440239e-08, + "loss": 0.3004, + "step": 1290 + }, + { + "epoch": 1.9461089127567364, + "grad_norm": 0.5255799475848183, + "learning_rate": 4.2444115823562226e-08, + "loss": 0.2978, + "step": 1291 + }, + { + "epoch": 1.9476163557565478, + "grad_norm": 0.5122243774948546, + "learning_rate": 4.005498336326463e-08, + "loss": 0.2904, + "step": 1292 + }, + { + "epoch": 1.9491237987563594, + "grad_norm": 0.5387853567764707, + "learning_rate": 3.773491869108137e-08, + "loss": 0.3186, + "step": 1293 + }, + { + "epoch": 1.950631241756171, + "grad_norm": 0.5188343179014033, + "learning_rate": 3.548393789562732e-08, + "loss": 0.3009, + "step": 1294 + }, + { + "epoch": 1.9521386847559827, + "grad_norm": 0.5377374320626865, + "learning_rate": 3.3302056586453916e-08, + "loss": 0.3054, + "step": 1295 + }, + { + "epoch": 1.9536461277557944, + "grad_norm": 0.535561804417277, + "learning_rate": 3.118928989393699e-08, + "loss": 0.296, + "step": 1296 + }, + { + "epoch": 1.9551535707556058, + "grad_norm": 0.5298311028053398, + "learning_rate": 2.9145652469174666e-08, + "loss": 0.3049, + "step": 1297 + }, + { + "epoch": 1.9566610137554172, + "grad_norm": 0.5269215357647239, + "learning_rate": 2.7171158483882963e-08, + "loss": 0.2986, + "step": 1298 + }, + { + "epoch": 1.9581684567552289, + "grad_norm": 0.5270798678914951, + "learning_rate": 2.5265821630298116e-08, + "loss": 0.3214, + "step": 1299 + }, + { + "epoch": 1.9596758997550405, + "grad_norm": 0.5488284780441306, + "learning_rate": 2.3429655121085525e-08, + "loss": 0.3293, + "step": 1300 + }, + { + "epoch": 1.9611833427548522, + "grad_norm": 0.5281296618472574, + "learning_rate": 2.1662671689242076e-08, + "loss": 0.3269, + "step": 1301 + }, + { + "epoch": 1.9626907857546636, + "grad_norm": 0.5477047385786338, + "learning_rate": 1.996488358801174e-08, + "loss": 0.3116, + "step": 1302 + }, + { + "epoch": 1.9641982287544753, + "grad_norm": 0.548270877454329, + "learning_rate": 1.8336302590798992e-08, + "loss": 0.3415, + "step": 1303 + }, + { + "epoch": 1.9657056717542867, + "grad_norm": 0.5385366961987965, + "learning_rate": 1.677693999109109e-08, + "loss": 0.3036, + "step": 1304 
+ }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.5125316134927453, + "learning_rate": 1.5286806602372583e-08, + "loss": 0.2899, + "step": 1305 + }, + { + "epoch": 1.96872055775391, + "grad_norm": 0.5211226615097172, + "learning_rate": 1.3865912758054267e-08, + "loss": 0.3025, + "step": 1306 + }, + { + "epoch": 1.9702280007537216, + "grad_norm": 0.5265304987884217, + "learning_rate": 1.2514268311405452e-08, + "loss": 0.3005, + "step": 1307 + }, + { + "epoch": 1.971735443753533, + "grad_norm": 0.5410147654111483, + "learning_rate": 1.1231882635477364e-08, + "loss": 0.3119, + "step": 1308 + }, + { + "epoch": 1.9732428867533447, + "grad_norm": 0.5170988890501786, + "learning_rate": 1.0018764623045407e-08, + "loss": 0.2958, + "step": 1309 + }, + { + "epoch": 1.9747503297531561, + "grad_norm": 0.5184171784095163, + "learning_rate": 8.874922686541442e-09, + "loss": 0.2924, + "step": 1310 + }, + { + "epoch": 1.9762577727529678, + "grad_norm": 0.5140179548472411, + "learning_rate": 7.800364758002721e-09, + "loss": 0.2935, + "step": 1311 + }, + { + "epoch": 1.9777652157527794, + "grad_norm": 0.5486141459025122, + "learning_rate": 6.795098289008595e-09, + "loss": 0.315, + "step": 1312 + }, + { + "epoch": 1.979272658752591, + "grad_norm": 0.5311267803536656, + "learning_rate": 5.859130250636113e-09, + "loss": 0.3115, + "step": 1313 + }, + { + "epoch": 1.9807801017524025, + "grad_norm": 0.5127976418049099, + "learning_rate": 4.992467133406731e-09, + "loss": 0.2853, + "step": 1314 + }, + { + "epoch": 1.982287544752214, + "grad_norm": 0.5328293684979241, + "learning_rate": 4.195114947244117e-09, + "loss": 0.3117, + "step": 1315 + }, + { + "epoch": 1.9837949877520256, + "grad_norm": 0.5211491118658048, + "learning_rate": 3.4670792214297476e-09, + "loss": 0.3049, + "step": 1316 + }, + { + "epoch": 1.9853024307518372, + "grad_norm": 0.532080710582646, + "learning_rate": 2.808365004569602e-09, + "loss": 0.3075, + "step": 1317 + }, + { + "epoch": 1.9868098737516489, + "grad_norm": 0.530526147652671, + "learning_rate": 2.2189768645519693e-09, + "loss": 0.3158, + "step": 1318 + }, + { + "epoch": 1.9883173167514603, + "grad_norm": 0.5335931657065038, + "learning_rate": 1.6989188885219165e-09, + "loss": 0.319, + "step": 1319 + }, + { + "epoch": 1.989824759751272, + "grad_norm": 0.527218281586083, + "learning_rate": 1.2481946828502011e-09, + "loss": 0.2986, + "step": 1320 + }, + { + "epoch": 1.9913322027510834, + "grad_norm": 0.5444631806162264, + "learning_rate": 8.668073731088467e-10, + "loss": 0.3414, + "step": 1321 + }, + { + "epoch": 1.992839645750895, + "grad_norm": 0.5458452249259766, + "learning_rate": 5.547596040489378e-10, + "loss": 0.3312, + "step": 1322 + }, + { + "epoch": 1.9943470887507067, + "grad_norm": 0.5337875506880636, + "learning_rate": 3.1205353958285724e-10, + "loss": 0.3065, + "step": 1323 + }, + { + "epoch": 1.9958545317505183, + "grad_norm": 0.5477429410153635, + "learning_rate": 1.3869086276985243e-10, + "loss": 0.308, + "step": 1324 + }, + { + "epoch": 1.9973619747503297, + "grad_norm": 0.5355633680169556, + "learning_rate": 3.467277580271322e-11, + "loss": 0.3114, + "step": 1325 + }, + { + "epoch": 1.9988694177501414, + "grad_norm": 0.5487135118890082, + "learning_rate": 0.0, + "loss": 0.3367, + "step": 1326 + }, + { + "epoch": 1.9988694177501414, + "step": 1326, + "total_flos": 5.576345153511096e+17, + "train_loss": 0.3973805017061363, + "train_runtime": 5664.6789, + "train_samples_per_second": 29.975, + "train_steps_per_second": 0.234 + } + ], + "logging_steps": 1, + 
"max_steps": 1326, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.576345153511096e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}