{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.7289930164451452, "eval_steps": 555, "global_step": 6105, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004505519261094841, "grad_norm": 99.63095972753177, "learning_rate": 2.8571428571428573e-06, "loss": 1.146, "step": 1 }, { "epoch": 0.0004505519261094841, "eval_loss": 1.1064108610153198, "eval_runtime": 24.3891, "eval_samples_per_second": 11.44, "eval_steps_per_second": 0.492, "step": 1 }, { "epoch": 0.0009011038522189682, "grad_norm": 95.08439228116274, "learning_rate": 5.7142857142857145e-06, "loss": 1.0828, "step": 2 }, { "epoch": 0.0013516557783284523, "grad_norm": 47.652821936811655, "learning_rate": 8.571428571428571e-06, "loss": 1.0279, "step": 3 }, { "epoch": 0.0018022077044379365, "grad_norm": 9.093043843691751, "learning_rate": 1.1428571428571429e-05, "loss": 0.8974, "step": 4 }, { "epoch": 0.0022527596305474207, "grad_norm": 26.41300983782548, "learning_rate": 1.4285714285714287e-05, "loss": 0.9852, "step": 5 }, { "epoch": 0.0027033115566569045, "grad_norm": 4.6162529969653425, "learning_rate": 1.7142857142857142e-05, "loss": 0.9132, "step": 6 }, { "epoch": 0.0031538634827663887, "grad_norm": 5.46013721363312, "learning_rate": 2e-05, "loss": 0.8526, "step": 7 }, { "epoch": 0.003604415408875873, "grad_norm": 2.9041572607151553, "learning_rate": 1.9999998884096988e-05, "loss": 0.8701, "step": 8 }, { "epoch": 0.004054967334985357, "grad_norm": 4.508418955407787, "learning_rate": 1.999999553638819e-05, "loss": 0.824, "step": 9 }, { "epoch": 0.004505519261094841, "grad_norm": 3.454097719677071, "learning_rate": 1.999998995687436e-05, "loss": 0.8096, "step": 10 }, { "epoch": 0.004956071187204325, "grad_norm": 1.9760023367230475, "learning_rate": 1.999998214555674e-05, "loss": 0.8101, "step": 11 }, { "epoch": 0.005406623113313809, "grad_norm": 2.2623400225393233, "learning_rate": 1.9999972102437076e-05, "loss": 0.8088, "step": 12 }, { "epoch": 0.005857175039423294, "grad_norm": 1.7292132487590668, "learning_rate": 1.999995982751761e-05, "loss": 0.7863, "step": 13 }, { "epoch": 0.0063077269655327775, "grad_norm": 1.377481724995843, "learning_rate": 1.9999945320801072e-05, "loss": 0.7839, "step": 14 }, { "epoch": 0.006758278891642262, "grad_norm": 1.533311014469047, "learning_rate": 1.9999928582290714e-05, "loss": 0.7871, "step": 15 }, { "epoch": 0.007208830817751746, "grad_norm": 1.299836262802147, "learning_rate": 1.9999909611990262e-05, "loss": 0.8091, "step": 16 }, { "epoch": 0.00765938274386123, "grad_norm": 1.5166484111852367, "learning_rate": 1.9999888409903948e-05, "loss": 0.802, "step": 17 }, { "epoch": 0.008109934669970714, "grad_norm": 1.1913368971230844, "learning_rate": 1.9999864976036515e-05, "loss": 0.7765, "step": 18 }, { "epoch": 0.008560486596080198, "grad_norm": 1.0588845107784532, "learning_rate": 1.9999839310393185e-05, "loss": 0.7872, "step": 19 }, { "epoch": 0.009011038522189683, "grad_norm": 1.2486700221805187, "learning_rate": 1.9999811412979687e-05, "loss": 0.7979, "step": 20 }, { "epoch": 0.009461590448299166, "grad_norm": 0.98286942320374, "learning_rate": 1.9999781283802247e-05, "loss": 0.7794, "step": 21 }, { "epoch": 0.00991214237440865, "grad_norm": 1.025471907745087, "learning_rate": 1.9999748922867592e-05, "loss": 0.7831, "step": 22 }, { "epoch": 0.010362694300518135, "grad_norm": 1.057369650221414, "learning_rate": 1.999971433018294e-05, "loss": 0.7957, "step": 23 }, { "epoch": 0.010813246226627618, "grad_norm": 0.9539412516302141, "learning_rate": 1.9999677505756015e-05, "loss": 0.7721, "step": 24 }, { "epoch": 0.011263798152737103, "grad_norm": 0.8598328537315673, "learning_rate": 1.9999638449595034e-05, "loss": 0.7333, "step": 25 }, { "epoch": 0.011714350078846587, "grad_norm": 0.937410972709224, "learning_rate": 1.9999597161708713e-05, "loss": 0.7574, "step": 26 }, { "epoch": 0.012164902004956072, "grad_norm": 0.8522185663054327, "learning_rate": 1.9999553642106267e-05, "loss": 0.7789, "step": 27 }, { "epoch": 0.012615453931065555, "grad_norm": 0.9189468351436945, "learning_rate": 1.9999507890797408e-05, "loss": 0.7673, "step": 28 }, { "epoch": 0.01306600585717504, "grad_norm": 0.8889565436370142, "learning_rate": 1.999945990779235e-05, "loss": 0.7696, "step": 29 }, { "epoch": 0.013516557783284524, "grad_norm": 0.920543035177071, "learning_rate": 1.99994096931018e-05, "loss": 0.7684, "step": 30 }, { "epoch": 0.013967109709394007, "grad_norm": 0.936396514152432, "learning_rate": 1.9999357246736967e-05, "loss": 0.7756, "step": 31 }, { "epoch": 0.014417661635503492, "grad_norm": 0.8325949562275448, "learning_rate": 1.9999302568709548e-05, "loss": 0.725, "step": 32 }, { "epoch": 0.014868213561612977, "grad_norm": 0.9447755143297217, "learning_rate": 1.9999245659031755e-05, "loss": 0.7852, "step": 33 }, { "epoch": 0.01531876548772246, "grad_norm": 0.885996706505372, "learning_rate": 1.999918651771629e-05, "loss": 0.7872, "step": 34 }, { "epoch": 0.015769317413831946, "grad_norm": 0.8743230303179347, "learning_rate": 1.999912514477634e-05, "loss": 0.7357, "step": 35 }, { "epoch": 0.016219869339941427, "grad_norm": 1.0664191458904753, "learning_rate": 1.9999061540225616e-05, "loss": 0.8013, "step": 36 }, { "epoch": 0.01667042126605091, "grad_norm": 0.8393595579050336, "learning_rate": 1.9998995704078305e-05, "loss": 0.7697, "step": 37 }, { "epoch": 0.017120973192160396, "grad_norm": 0.870811267399356, "learning_rate": 1.9998927636349107e-05, "loss": 0.7278, "step": 38 }, { "epoch": 0.01757152511826988, "grad_norm": 0.8257957398744968, "learning_rate": 1.9998857337053205e-05, "loss": 0.748, "step": 39 }, { "epoch": 0.018022077044379366, "grad_norm": 0.8008831577712419, "learning_rate": 1.9998784806206297e-05, "loss": 0.7481, "step": 40 }, { "epoch": 0.01847262897048885, "grad_norm": 0.766467044007061, "learning_rate": 1.999871004382456e-05, "loss": 0.7501, "step": 41 }, { "epoch": 0.01892318089659833, "grad_norm": 0.8061050864652226, "learning_rate": 1.9998633049924693e-05, "loss": 0.7752, "step": 42 }, { "epoch": 0.019373732822707816, "grad_norm": 0.7845645971693079, "learning_rate": 1.9998553824523867e-05, "loss": 0.7487, "step": 43 }, { "epoch": 0.0198242847488173, "grad_norm": 0.9175475318384826, "learning_rate": 1.9998472367639772e-05, "loss": 0.7635, "step": 44 }, { "epoch": 0.020274836674926786, "grad_norm": 1.0743516995599138, "learning_rate": 1.9998388679290583e-05, "loss": 0.7349, "step": 45 }, { "epoch": 0.02072538860103627, "grad_norm": 0.748092903906611, "learning_rate": 1.9998302759494982e-05, "loss": 0.7411, "step": 46 }, { "epoch": 0.021175940527145755, "grad_norm": 0.8788045410383496, "learning_rate": 1.9998214608272136e-05, "loss": 0.7751, "step": 47 }, { "epoch": 0.021626492453255236, "grad_norm": 0.7761754504639647, "learning_rate": 1.999812422564173e-05, "loss": 0.7533, "step": 48 }, { "epoch": 0.02207704437936472, "grad_norm": 0.855064043688775, "learning_rate": 1.999803161162393e-05, "loss": 0.7449, "step": 49 }, { "epoch": 0.022527596305474205, "grad_norm": 0.9536908928613824, "learning_rate": 1.9997936766239406e-05, "loss": 0.7643, "step": 50 }, { "epoch": 0.02297814823158369, "grad_norm": 0.7515595686422573, "learning_rate": 1.9997839689509322e-05, "loss": 0.7283, "step": 51 }, { "epoch": 0.023428700157693175, "grad_norm": 0.9784329854352253, "learning_rate": 1.9997740381455348e-05, "loss": 0.77, "step": 52 }, { "epoch": 0.02387925208380266, "grad_norm": 0.8340040843271409, "learning_rate": 1.9997638842099648e-05, "loss": 0.7535, "step": 53 }, { "epoch": 0.024329804009912144, "grad_norm": 0.8462056021512305, "learning_rate": 1.9997535071464883e-05, "loss": 0.7465, "step": 54 }, { "epoch": 0.024780355936021625, "grad_norm": 0.8589942624081021, "learning_rate": 1.999742906957421e-05, "loss": 0.7365, "step": 55 }, { "epoch": 0.02523090786213111, "grad_norm": 0.9851953558074718, "learning_rate": 1.999732083645129e-05, "loss": 0.7448, "step": 56 }, { "epoch": 0.025681459788240595, "grad_norm": 0.9636786947779777, "learning_rate": 1.9997210372120276e-05, "loss": 0.7783, "step": 57 }, { "epoch": 0.02613201171435008, "grad_norm": 0.8360297492014297, "learning_rate": 1.9997097676605823e-05, "loss": 0.7781, "step": 58 }, { "epoch": 0.026582563640459564, "grad_norm": 0.9408740305960915, "learning_rate": 1.9996982749933084e-05, "loss": 0.7182, "step": 59 }, { "epoch": 0.02703311556656905, "grad_norm": 0.7921588196043418, "learning_rate": 1.9996865592127702e-05, "loss": 0.7725, "step": 60 }, { "epoch": 0.02748366749267853, "grad_norm": 0.846642612374121, "learning_rate": 1.9996746203215832e-05, "loss": 0.7273, "step": 61 }, { "epoch": 0.027934219418788014, "grad_norm": 0.8716978681946517, "learning_rate": 1.9996624583224112e-05, "loss": 0.7825, "step": 62 }, { "epoch": 0.0283847713448975, "grad_norm": 0.8581180784812269, "learning_rate": 1.9996500732179695e-05, "loss": 0.7697, "step": 63 }, { "epoch": 0.028835323271006984, "grad_norm": 0.9559495160688986, "learning_rate": 1.9996374650110214e-05, "loss": 0.7252, "step": 64 }, { "epoch": 0.02928587519711647, "grad_norm": 0.8255579015796423, "learning_rate": 1.999624633704381e-05, "loss": 0.7673, "step": 65 }, { "epoch": 0.029736427123225953, "grad_norm": 0.9724136093892255, "learning_rate": 1.9996115793009122e-05, "loss": 0.7265, "step": 66 }, { "epoch": 0.030186979049335434, "grad_norm": 0.9720973711086457, "learning_rate": 1.999598301803528e-05, "loss": 0.6969, "step": 67 }, { "epoch": 0.03063753097544492, "grad_norm": 0.8244742727703804, "learning_rate": 1.999584801215192e-05, "loss": 0.7615, "step": 68 }, { "epoch": 0.031088082901554404, "grad_norm": 0.9348220350637594, "learning_rate": 1.9995710775389176e-05, "loss": 0.7441, "step": 69 }, { "epoch": 0.03153863482766389, "grad_norm": 0.7965961783981419, "learning_rate": 1.999557130777767e-05, "loss": 0.7489, "step": 70 }, { "epoch": 0.03198918675377337, "grad_norm": 0.842675412329229, "learning_rate": 1.9995429609348537e-05, "loss": 0.7322, "step": 71 }, { "epoch": 0.032439738679882854, "grad_norm": 0.9260020435165747, "learning_rate": 1.9995285680133393e-05, "loss": 0.7567, "step": 72 }, { "epoch": 0.03289029060599234, "grad_norm": 0.8492858061784027, "learning_rate": 1.9995139520164364e-05, "loss": 0.7615, "step": 73 }, { "epoch": 0.03334084253210182, "grad_norm": 0.931079111933786, "learning_rate": 1.9994991129474068e-05, "loss": 0.7318, "step": 74 }, { "epoch": 0.03379139445821131, "grad_norm": 0.747497413698367, "learning_rate": 1.9994840508095627e-05, "loss": 0.7224, "step": 75 }, { "epoch": 0.03424194638432079, "grad_norm": 0.9184603014545504, "learning_rate": 1.9994687656062655e-05, "loss": 0.738, "step": 76 }, { "epoch": 0.034692498310430274, "grad_norm": 0.7901844760990023, "learning_rate": 1.999453257340926e-05, "loss": 0.7365, "step": 77 }, { "epoch": 0.03514305023653976, "grad_norm": 0.8433883092220861, "learning_rate": 1.9994375260170064e-05, "loss": 0.7093, "step": 78 }, { "epoch": 0.03559360216264924, "grad_norm": 0.7520695459713753, "learning_rate": 1.9994215716380165e-05, "loss": 0.7457, "step": 79 }, { "epoch": 0.03604415408875873, "grad_norm": 0.8568874887552957, "learning_rate": 1.999405394207518e-05, "loss": 0.7134, "step": 80 }, { "epoch": 0.03649470601486821, "grad_norm": 0.8372897747467463, "learning_rate": 1.9993889937291208e-05, "loss": 0.7317, "step": 81 }, { "epoch": 0.0369452579409777, "grad_norm": 0.7875525665015404, "learning_rate": 1.9993723702064852e-05, "loss": 0.7417, "step": 82 }, { "epoch": 0.03739580986708718, "grad_norm": 0.8457437036234493, "learning_rate": 1.9993555236433216e-05, "loss": 0.7784, "step": 83 }, { "epoch": 0.03784636179319666, "grad_norm": 0.9790460885925418, "learning_rate": 1.9993384540433892e-05, "loss": 0.7837, "step": 84 }, { "epoch": 0.03829691371930615, "grad_norm": 0.7661628572937788, "learning_rate": 1.9993211614104982e-05, "loss": 0.7502, "step": 85 }, { "epoch": 0.03874746564541563, "grad_norm": 0.7509186227058579, "learning_rate": 1.9993036457485078e-05, "loss": 0.7206, "step": 86 }, { "epoch": 0.03919801757152512, "grad_norm": 0.8541613279061934, "learning_rate": 1.9992859070613275e-05, "loss": 0.7516, "step": 87 }, { "epoch": 0.0396485694976346, "grad_norm": 0.6542332433733011, "learning_rate": 1.9992679453529153e-05, "loss": 0.6866, "step": 88 }, { "epoch": 0.04009912142374409, "grad_norm": 0.7640095940020493, "learning_rate": 1.9992497606272808e-05, "loss": 0.7404, "step": 89 }, { "epoch": 0.04054967334985357, "grad_norm": 0.7170466362657578, "learning_rate": 1.999231352888482e-05, "loss": 0.7286, "step": 90 }, { "epoch": 0.04100022527596305, "grad_norm": 0.7627014641254132, "learning_rate": 1.9992127221406276e-05, "loss": 0.6912, "step": 91 }, { "epoch": 0.04145077720207254, "grad_norm": 0.8712430402883448, "learning_rate": 1.9991938683878746e-05, "loss": 0.7286, "step": 92 }, { "epoch": 0.04190132912818202, "grad_norm": 0.7238903051640995, "learning_rate": 1.999174791634432e-05, "loss": 0.7355, "step": 93 }, { "epoch": 0.04235188105429151, "grad_norm": 0.7727844390194998, "learning_rate": 1.9991554918845567e-05, "loss": 0.7119, "step": 94 }, { "epoch": 0.04280243298040099, "grad_norm": 0.7936948654544004, "learning_rate": 1.9991359691425566e-05, "loss": 0.7211, "step": 95 }, { "epoch": 0.04325298490651047, "grad_norm": 0.7789941485084827, "learning_rate": 1.9991162234127878e-05, "loss": 0.7446, "step": 96 }, { "epoch": 0.04370353683261996, "grad_norm": 0.8320231103193262, "learning_rate": 1.9990962546996583e-05, "loss": 0.7466, "step": 97 }, { "epoch": 0.04415408875872944, "grad_norm": 0.7524092739539326, "learning_rate": 1.9990760630076238e-05, "loss": 0.7368, "step": 98 }, { "epoch": 0.04460464068483893, "grad_norm": 0.8112713830054827, "learning_rate": 1.9990556483411912e-05, "loss": 0.7399, "step": 99 }, { "epoch": 0.04505519261094841, "grad_norm": 0.6830385220418604, "learning_rate": 1.9990350107049166e-05, "loss": 0.7006, "step": 100 }, { "epoch": 0.0455057445370579, "grad_norm": 0.8079704492319753, "learning_rate": 1.999014150103406e-05, "loss": 0.7546, "step": 101 }, { "epoch": 0.04595629646316738, "grad_norm": 0.728520850336044, "learning_rate": 1.9989930665413148e-05, "loss": 0.7352, "step": 102 }, { "epoch": 0.04640684838927686, "grad_norm": 0.80138567974725, "learning_rate": 1.9989717600233485e-05, "loss": 0.7504, "step": 103 }, { "epoch": 0.04685740031538635, "grad_norm": 0.7400081652604458, "learning_rate": 1.9989502305542624e-05, "loss": 0.7131, "step": 104 }, { "epoch": 0.04730795224149583, "grad_norm": 0.7961638807738207, "learning_rate": 1.9989284781388617e-05, "loss": 0.757, "step": 105 }, { "epoch": 0.04775850416760532, "grad_norm": 0.8398331565262936, "learning_rate": 1.9989065027820003e-05, "loss": 0.7219, "step": 106 }, { "epoch": 0.0482090560937148, "grad_norm": 0.6991491774951051, "learning_rate": 1.998884304488584e-05, "loss": 0.7564, "step": 107 }, { "epoch": 0.04865960801982429, "grad_norm": 0.7595588865199265, "learning_rate": 1.9988618832635657e-05, "loss": 0.731, "step": 108 }, { "epoch": 0.04911015994593377, "grad_norm": 0.703155493213542, "learning_rate": 1.99883923911195e-05, "loss": 0.7508, "step": 109 }, { "epoch": 0.04956071187204325, "grad_norm": 0.712289600356415, "learning_rate": 1.998816372038791e-05, "loss": 0.7647, "step": 110 }, { "epoch": 0.05001126379815274, "grad_norm": 0.7389717263037616, "learning_rate": 1.998793282049191e-05, "loss": 0.7684, "step": 111 }, { "epoch": 0.05046181572426222, "grad_norm": 0.6884439425981158, "learning_rate": 1.998769969148305e-05, "loss": 0.7434, "step": 112 }, { "epoch": 0.05091236765037171, "grad_norm": 0.6991203047814216, "learning_rate": 1.9987464333413343e-05, "loss": 0.7343, "step": 113 }, { "epoch": 0.05136291957648119, "grad_norm": 0.7594587881472421, "learning_rate": 1.9987226746335328e-05, "loss": 0.7479, "step": 114 }, { "epoch": 0.05181347150259067, "grad_norm": 0.6985040266030997, "learning_rate": 1.998698693030202e-05, "loss": 0.7297, "step": 115 }, { "epoch": 0.05226402342870016, "grad_norm": 0.7089025991666758, "learning_rate": 1.998674488536695e-05, "loss": 0.7356, "step": 116 }, { "epoch": 0.05271457535480964, "grad_norm": 0.7336187588866805, "learning_rate": 1.9986500611584133e-05, "loss": 0.7157, "step": 117 }, { "epoch": 0.05316512728091913, "grad_norm": 0.7250745073781437, "learning_rate": 1.9986254109008088e-05, "loss": 0.7476, "step": 118 }, { "epoch": 0.05361567920702861, "grad_norm": 0.7630027453643728, "learning_rate": 1.9986005377693826e-05, "loss": 0.7376, "step": 119 }, { "epoch": 0.0540662311331381, "grad_norm": 0.6874300857399823, "learning_rate": 1.9985754417696864e-05, "loss": 0.7551, "step": 120 }, { "epoch": 0.05451678305924758, "grad_norm": 0.7612728153629468, "learning_rate": 1.9985501229073213e-05, "loss": 0.7164, "step": 121 }, { "epoch": 0.05496733498535706, "grad_norm": 0.8445209335416375, "learning_rate": 1.9985245811879372e-05, "loss": 0.7248, "step": 122 }, { "epoch": 0.05541788691146655, "grad_norm": 0.6950570895969769, "learning_rate": 1.998498816617235e-05, "loss": 0.7193, "step": 123 }, { "epoch": 0.05586843883757603, "grad_norm": 0.7637425205183362, "learning_rate": 1.9984728292009647e-05, "loss": 0.7177, "step": 124 }, { "epoch": 0.05631899076368552, "grad_norm": 0.730811809765136, "learning_rate": 1.9984466189449264e-05, "loss": 0.7184, "step": 125 }, { "epoch": 0.056769542689795, "grad_norm": 0.7214019806334142, "learning_rate": 1.9984201858549695e-05, "loss": 0.6885, "step": 126 }, { "epoch": 0.057220094615904486, "grad_norm": 0.7191896128935257, "learning_rate": 1.9983935299369934e-05, "loss": 0.7121, "step": 127 }, { "epoch": 0.05767064654201397, "grad_norm": 0.8368111259162899, "learning_rate": 1.998366651196947e-05, "loss": 0.7768, "step": 128 }, { "epoch": 0.05812119846812345, "grad_norm": 0.7650282763279617, "learning_rate": 1.99833954964083e-05, "loss": 0.6942, "step": 129 }, { "epoch": 0.05857175039423294, "grad_norm": 0.7600492402289759, "learning_rate": 1.99831222527469e-05, "loss": 0.6943, "step": 130 }, { "epoch": 0.05902230232034242, "grad_norm": 0.7714352096749771, "learning_rate": 1.9982846781046255e-05, "loss": 0.7218, "step": 131 }, { "epoch": 0.059472854246451906, "grad_norm": 0.6911578076249117, "learning_rate": 1.9982569081367844e-05, "loss": 0.7309, "step": 132 }, { "epoch": 0.05992340617256139, "grad_norm": 0.7295155176535983, "learning_rate": 1.9982289153773648e-05, "loss": 0.6911, "step": 133 }, { "epoch": 0.06037395809867087, "grad_norm": 0.7943443079066267, "learning_rate": 1.9982006998326136e-05, "loss": 0.7523, "step": 134 }, { "epoch": 0.06082451002478036, "grad_norm": 0.7593578313342246, "learning_rate": 1.9981722615088285e-05, "loss": 0.7467, "step": 135 }, { "epoch": 0.06127506195088984, "grad_norm": 0.7957909437883874, "learning_rate": 1.9981436004123563e-05, "loss": 0.7553, "step": 136 }, { "epoch": 0.061725613876999326, "grad_norm": 0.7607996815700375, "learning_rate": 1.998114716549593e-05, "loss": 0.7144, "step": 137 }, { "epoch": 0.06217616580310881, "grad_norm": 0.7952536918607739, "learning_rate": 1.9980856099269856e-05, "loss": 0.7222, "step": 138 }, { "epoch": 0.0626267177292183, "grad_norm": 0.6840761612806879, "learning_rate": 1.99805628055103e-05, "loss": 0.7177, "step": 139 }, { "epoch": 0.06307726965532778, "grad_norm": 0.7458788692927119, "learning_rate": 1.9980267284282718e-05, "loss": 0.7332, "step": 140 }, { "epoch": 0.06352782158143726, "grad_norm": 0.7081233639383879, "learning_rate": 1.9979969535653065e-05, "loss": 0.7582, "step": 141 }, { "epoch": 0.06397837350754675, "grad_norm": 0.6957031314358039, "learning_rate": 1.997966955968779e-05, "loss": 0.7333, "step": 142 }, { "epoch": 0.06442892543365623, "grad_norm": 0.7356045007588866, "learning_rate": 1.997936735645385e-05, "loss": 0.6886, "step": 143 }, { "epoch": 0.06487947735976571, "grad_norm": 0.6658747342291623, "learning_rate": 1.997906292601868e-05, "loss": 0.7137, "step": 144 }, { "epoch": 0.0653300292858752, "grad_norm": 0.6945906966033781, "learning_rate": 1.9978756268450232e-05, "loss": 0.728, "step": 145 }, { "epoch": 0.06578058121198468, "grad_norm": 0.6580449591521758, "learning_rate": 1.9978447383816944e-05, "loss": 0.6963, "step": 146 }, { "epoch": 0.06623113313809416, "grad_norm": 0.7464337356438161, "learning_rate": 1.9978136272187745e-05, "loss": 0.7187, "step": 147 }, { "epoch": 0.06668168506420365, "grad_norm": 0.7168991399814145, "learning_rate": 1.9977822933632082e-05, "loss": 0.71, "step": 148 }, { "epoch": 0.06713223699031313, "grad_norm": 0.6753845531163639, "learning_rate": 1.9977507368219878e-05, "loss": 0.746, "step": 149 }, { "epoch": 0.06758278891642262, "grad_norm": 0.734951910824145, "learning_rate": 1.9977189576021563e-05, "loss": 0.716, "step": 150 }, { "epoch": 0.0680333408425321, "grad_norm": 0.6488103253266532, "learning_rate": 1.997686955710806e-05, "loss": 0.7461, "step": 151 }, { "epoch": 0.06848389276864159, "grad_norm": 0.6832266334288452, "learning_rate": 1.9976547311550796e-05, "loss": 0.7339, "step": 152 }, { "epoch": 0.06893444469475107, "grad_norm": 0.7271813368817847, "learning_rate": 1.9976222839421682e-05, "loss": 0.7369, "step": 153 }, { "epoch": 0.06938499662086055, "grad_norm": 0.7118291895294389, "learning_rate": 1.9975896140793144e-05, "loss": 0.7427, "step": 154 }, { "epoch": 0.06983554854697004, "grad_norm": 0.7526686686021379, "learning_rate": 1.9975567215738088e-05, "loss": 0.6978, "step": 155 }, { "epoch": 0.07028610047307952, "grad_norm": 0.7366622246148017, "learning_rate": 1.9975236064329923e-05, "loss": 0.717, "step": 156 }, { "epoch": 0.07073665239918901, "grad_norm": 0.8158551715690735, "learning_rate": 1.997490268664256e-05, "loss": 0.713, "step": 157 }, { "epoch": 0.07118720432529849, "grad_norm": 0.6925966743083742, "learning_rate": 1.99745670827504e-05, "loss": 0.6886, "step": 158 }, { "epoch": 0.07163775625140797, "grad_norm": 0.7135896944162324, "learning_rate": 1.9974229252728345e-05, "loss": 0.7083, "step": 159 }, { "epoch": 0.07208830817751746, "grad_norm": 0.7443906042594065, "learning_rate": 1.9973889196651786e-05, "loss": 0.7338, "step": 160 }, { "epoch": 0.07253886010362694, "grad_norm": 0.7350065096394042, "learning_rate": 1.9973546914596622e-05, "loss": 0.7163, "step": 161 }, { "epoch": 0.07298941202973643, "grad_norm": 0.763472977867096, "learning_rate": 1.9973202406639247e-05, "loss": 0.7107, "step": 162 }, { "epoch": 0.07343996395584591, "grad_norm": 0.698701951575692, "learning_rate": 1.9972855672856543e-05, "loss": 0.7401, "step": 163 }, { "epoch": 0.0738905158819554, "grad_norm": 0.7233167857854929, "learning_rate": 1.9972506713325895e-05, "loss": 0.7239, "step": 164 }, { "epoch": 0.07434106780806488, "grad_norm": 0.7094750318256714, "learning_rate": 1.9972155528125187e-05, "loss": 0.7274, "step": 165 }, { "epoch": 0.07479161973417436, "grad_norm": 0.7358284135596944, "learning_rate": 1.9971802117332796e-05, "loss": 0.7415, "step": 166 }, { "epoch": 0.07524217166028385, "grad_norm": 0.73853986929211, "learning_rate": 1.997144648102759e-05, "loss": 0.7139, "step": 167 }, { "epoch": 0.07569272358639333, "grad_norm": 0.8367657584529081, "learning_rate": 1.997108861928895e-05, "loss": 0.7431, "step": 168 }, { "epoch": 0.07614327551250281, "grad_norm": 0.8125121528834115, "learning_rate": 1.997072853219674e-05, "loss": 0.692, "step": 169 }, { "epoch": 0.0765938274386123, "grad_norm": 0.7296622680384187, "learning_rate": 1.997036621983132e-05, "loss": 0.713, "step": 170 }, { "epoch": 0.07704437936472179, "grad_norm": 0.7968366085166915, "learning_rate": 1.9970001682273553e-05, "loss": 0.7133, "step": 171 }, { "epoch": 0.07749493129083126, "grad_norm": 0.709607577804229, "learning_rate": 1.99696349196048e-05, "loss": 0.6983, "step": 172 }, { "epoch": 0.07794548321694075, "grad_norm": 0.8814252436815059, "learning_rate": 1.9969265931906915e-05, "loss": 0.7035, "step": 173 }, { "epoch": 0.07839603514305024, "grad_norm": 0.7552679399609616, "learning_rate": 1.9968894719262247e-05, "loss": 0.7104, "step": 174 }, { "epoch": 0.07884658706915972, "grad_norm": 0.8176648948471469, "learning_rate": 1.9968521281753642e-05, "loss": 0.699, "step": 175 }, { "epoch": 0.0792971389952692, "grad_norm": 0.8168592061749825, "learning_rate": 1.9968145619464453e-05, "loss": 0.6933, "step": 176 }, { "epoch": 0.07974769092137869, "grad_norm": 0.7864685934008501, "learning_rate": 1.9967767732478506e-05, "loss": 0.6927, "step": 177 }, { "epoch": 0.08019824284748818, "grad_norm": 0.7708490197996007, "learning_rate": 1.9967387620880147e-05, "loss": 0.7124, "step": 178 }, { "epoch": 0.08064879477359765, "grad_norm": 0.8737040792925655, "learning_rate": 1.9967005284754212e-05, "loss": 0.7167, "step": 179 }, { "epoch": 0.08109934669970714, "grad_norm": 0.8118567999474013, "learning_rate": 1.9966620724186022e-05, "loss": 0.7252, "step": 180 }, { "epoch": 0.08154989862581663, "grad_norm": 0.8590681872552048, "learning_rate": 1.9966233939261414e-05, "loss": 0.7362, "step": 181 }, { "epoch": 0.0820004505519261, "grad_norm": 0.7178708891459026, "learning_rate": 1.99658449300667e-05, "loss": 0.7335, "step": 182 }, { "epoch": 0.08245100247803559, "grad_norm": 0.9063180422055138, "learning_rate": 1.9965453696688704e-05, "loss": 0.7158, "step": 183 }, { "epoch": 0.08290155440414508, "grad_norm": 0.6962085935797148, "learning_rate": 1.996506023921475e-05, "loss": 0.6816, "step": 184 }, { "epoch": 0.08335210633025456, "grad_norm": 0.7235629420040247, "learning_rate": 1.996466455773264e-05, "loss": 0.7264, "step": 185 }, { "epoch": 0.08380265825636404, "grad_norm": 0.7067964282581642, "learning_rate": 1.996426665233068e-05, "loss": 0.6516, "step": 186 }, { "epoch": 0.08425321018247353, "grad_norm": 0.7431444851383832, "learning_rate": 1.9963866523097683e-05, "loss": 0.7201, "step": 187 }, { "epoch": 0.08470376210858302, "grad_norm": 0.7054019519296653, "learning_rate": 1.9963464170122947e-05, "loss": 0.708, "step": 188 }, { "epoch": 0.0851543140346925, "grad_norm": 0.6919614859230734, "learning_rate": 1.996305959349627e-05, "loss": 0.7451, "step": 189 }, { "epoch": 0.08560486596080198, "grad_norm": 0.6804576898778831, "learning_rate": 1.9962652793307944e-05, "loss": 0.7179, "step": 190 }, { "epoch": 0.08605541788691147, "grad_norm": 0.7673912510968406, "learning_rate": 1.996224376964876e-05, "loss": 0.7413, "step": 191 }, { "epoch": 0.08650596981302094, "grad_norm": 0.7592942696820764, "learning_rate": 1.9961832522610004e-05, "loss": 0.7217, "step": 192 }, { "epoch": 0.08695652173913043, "grad_norm": 0.667041963355119, "learning_rate": 1.9961419052283458e-05, "loss": 0.6959, "step": 193 }, { "epoch": 0.08740707366523992, "grad_norm": 0.6912435112177164, "learning_rate": 1.99610033587614e-05, "loss": 0.6939, "step": 194 }, { "epoch": 0.08785762559134941, "grad_norm": 0.6527078582242967, "learning_rate": 1.9960585442136608e-05, "loss": 0.7529, "step": 195 }, { "epoch": 0.08830817751745888, "grad_norm": 0.737337600113603, "learning_rate": 1.996016530250235e-05, "loss": 0.7093, "step": 196 }, { "epoch": 0.08875872944356837, "grad_norm": 0.7059062018639081, "learning_rate": 1.9959742939952393e-05, "loss": 0.6846, "step": 197 }, { "epoch": 0.08920928136967786, "grad_norm": 0.703157492759099, "learning_rate": 1.9959318354581e-05, "loss": 0.7298, "step": 198 }, { "epoch": 0.08965983329578733, "grad_norm": 0.6921367716246344, "learning_rate": 1.995889154648293e-05, "loss": 0.6908, "step": 199 }, { "epoch": 0.09011038522189682, "grad_norm": 0.6699678252018181, "learning_rate": 1.995846251575344e-05, "loss": 0.7028, "step": 200 }, { "epoch": 0.09056093714800631, "grad_norm": 0.6773542092191404, "learning_rate": 1.995803126248828e-05, "loss": 0.7126, "step": 201 }, { "epoch": 0.0910114890741158, "grad_norm": 0.7559921094794061, "learning_rate": 1.99575977867837e-05, "loss": 0.7494, "step": 202 }, { "epoch": 0.09146204100022527, "grad_norm": 0.6342393071875867, "learning_rate": 1.995716208873644e-05, "loss": 0.6896, "step": 203 }, { "epoch": 0.09191259292633476, "grad_norm": 0.7093011677801856, "learning_rate": 1.995672416844374e-05, "loss": 0.727, "step": 204 }, { "epoch": 0.09236314485244425, "grad_norm": 0.6443896879294535, "learning_rate": 1.9956284026003336e-05, "loss": 0.7487, "step": 205 }, { "epoch": 0.09281369677855372, "grad_norm": 0.707453200448554, "learning_rate": 1.995584166151346e-05, "loss": 0.7386, "step": 206 }, { "epoch": 0.09326424870466321, "grad_norm": 0.6664041277541776, "learning_rate": 1.995539707507284e-05, "loss": 0.7191, "step": 207 }, { "epoch": 0.0937148006307727, "grad_norm": 0.7490277261983995, "learning_rate": 1.9954950266780692e-05, "loss": 0.743, "step": 208 }, { "epoch": 0.09416535255688219, "grad_norm": 0.6668804092713876, "learning_rate": 1.9954501236736743e-05, "loss": 0.6946, "step": 209 }, { "epoch": 0.09461590448299166, "grad_norm": 0.7393686521496333, "learning_rate": 1.9954049985041208e-05, "loss": 0.7062, "step": 210 }, { "epoch": 0.09506645640910115, "grad_norm": 0.7781105434080061, "learning_rate": 1.995359651179479e-05, "loss": 0.7249, "step": 211 }, { "epoch": 0.09551700833521064, "grad_norm": 0.6786603967651151, "learning_rate": 1.99531408170987e-05, "loss": 0.6968, "step": 212 }, { "epoch": 0.09596756026132011, "grad_norm": 0.7460292858780141, "learning_rate": 1.9952682901054646e-05, "loss": 0.7419, "step": 213 }, { "epoch": 0.0964181121874296, "grad_norm": 0.6922806030630897, "learning_rate": 1.9952222763764816e-05, "loss": 0.7321, "step": 214 }, { "epoch": 0.09686866411353909, "grad_norm": 0.6647664762982298, "learning_rate": 1.995176040533191e-05, "loss": 0.7116, "step": 215 }, { "epoch": 0.09731921603964858, "grad_norm": 0.7930066523740955, "learning_rate": 1.9951295825859115e-05, "loss": 0.7201, "step": 216 }, { "epoch": 0.09776976796575805, "grad_norm": 0.6469690710794159, "learning_rate": 1.9950829025450116e-05, "loss": 0.7232, "step": 217 }, { "epoch": 0.09822031989186754, "grad_norm": 0.7674721267209388, "learning_rate": 1.9950360004209095e-05, "loss": 0.7307, "step": 218 }, { "epoch": 0.09867087181797703, "grad_norm": 0.6640672052303578, "learning_rate": 1.994988876224073e-05, "loss": 0.7257, "step": 219 }, { "epoch": 0.0991214237440865, "grad_norm": 0.7529170370194731, "learning_rate": 1.9949415299650184e-05, "loss": 0.7678, "step": 220 }, { "epoch": 0.09957197567019599, "grad_norm": 0.7135838116763162, "learning_rate": 1.994893961654314e-05, "loss": 0.7242, "step": 221 }, { "epoch": 0.10002252759630548, "grad_norm": 0.6903963912224623, "learning_rate": 1.994846171302575e-05, "loss": 0.7377, "step": 222 }, { "epoch": 0.10047307952241495, "grad_norm": 0.7762033371843914, "learning_rate": 1.9947981589204674e-05, "loss": 0.7151, "step": 223 }, { "epoch": 0.10092363144852444, "grad_norm": 0.6360336194336229, "learning_rate": 1.994749924518707e-05, "loss": 0.6852, "step": 224 }, { "epoch": 0.10137418337463393, "grad_norm": 0.816163660747766, "learning_rate": 1.9947014681080584e-05, "loss": 0.7306, "step": 225 }, { "epoch": 0.10182473530074342, "grad_norm": 0.6895138192365512, "learning_rate": 1.9946527896993363e-05, "loss": 0.6959, "step": 226 }, { "epoch": 0.10227528722685289, "grad_norm": 0.723096089187139, "learning_rate": 1.9946038893034045e-05, "loss": 0.725, "step": 227 }, { "epoch": 0.10272583915296238, "grad_norm": 0.7247438459380336, "learning_rate": 1.9945547669311773e-05, "loss": 0.7285, "step": 228 }, { "epoch": 0.10317639107907187, "grad_norm": 0.6618585765894427, "learning_rate": 1.9945054225936177e-05, "loss": 0.6897, "step": 229 }, { "epoch": 0.10362694300518134, "grad_norm": 0.6993840161959908, "learning_rate": 1.9944558563017376e-05, "loss": 0.6971, "step": 230 }, { "epoch": 0.10407749493129083, "grad_norm": 0.6960382928526576, "learning_rate": 1.9944060680666e-05, "loss": 0.6981, "step": 231 }, { "epoch": 0.10452804685740032, "grad_norm": 0.6975182785094366, "learning_rate": 1.994356057899317e-05, "loss": 0.7291, "step": 232 }, { "epoch": 0.1049785987835098, "grad_norm": 0.6996541091749989, "learning_rate": 1.994305825811049e-05, "loss": 0.7236, "step": 233 }, { "epoch": 0.10542915070961928, "grad_norm": 0.6943847607669932, "learning_rate": 1.994255371813007e-05, "loss": 0.7372, "step": 234 }, { "epoch": 0.10587970263572877, "grad_norm": 0.748646087194202, "learning_rate": 1.9942046959164516e-05, "loss": 0.7178, "step": 235 }, { "epoch": 0.10633025456183826, "grad_norm": 0.6876638312639068, "learning_rate": 1.9941537981326928e-05, "loss": 0.7346, "step": 236 }, { "epoch": 0.10678080648794773, "grad_norm": 0.7899797333628644, "learning_rate": 1.9941026784730898e-05, "loss": 0.7152, "step": 237 }, { "epoch": 0.10723135841405722, "grad_norm": 0.7148641816075103, "learning_rate": 1.9940513369490516e-05, "loss": 0.728, "step": 238 }, { "epoch": 0.1076819103401667, "grad_norm": 0.7122553023133636, "learning_rate": 1.9939997735720364e-05, "loss": 0.7147, "step": 239 }, { "epoch": 0.1081324622662762, "grad_norm": 0.8008999116403491, "learning_rate": 1.9939479883535524e-05, "loss": 0.7161, "step": 240 }, { "epoch": 0.10858301419238567, "grad_norm": 0.7016088951025189, "learning_rate": 1.993895981305157e-05, "loss": 0.7059, "step": 241 }, { "epoch": 0.10903356611849516, "grad_norm": 0.6729728282926628, "learning_rate": 1.9938437524384572e-05, "loss": 0.7012, "step": 242 }, { "epoch": 0.10948411804460464, "grad_norm": 0.8393629895242376, "learning_rate": 1.9937913017651092e-05, "loss": 0.7211, "step": 243 }, { "epoch": 0.10993466997071412, "grad_norm": 0.7811135395340536, "learning_rate": 1.9937386292968193e-05, "loss": 0.7274, "step": 244 }, { "epoch": 0.11038522189682361, "grad_norm": 0.7498845472747651, "learning_rate": 1.993685735045343e-05, "loss": 0.7097, "step": 245 }, { "epoch": 0.1108357738229331, "grad_norm": 0.7170689094059387, "learning_rate": 1.993632619022485e-05, "loss": 0.6993, "step": 246 }, { "epoch": 0.11128632574904258, "grad_norm": 0.6862851268182426, "learning_rate": 1.9935792812400997e-05, "loss": 0.7114, "step": 247 }, { "epoch": 0.11173687767515206, "grad_norm": 0.7028345169194179, "learning_rate": 1.9935257217100915e-05, "loss": 0.7029, "step": 248 }, { "epoch": 0.11218742960126155, "grad_norm": 0.6439290402158404, "learning_rate": 1.9934719404444137e-05, "loss": 0.6913, "step": 249 }, { "epoch": 0.11263798152737103, "grad_norm": 0.6853829771816389, "learning_rate": 1.9934179374550687e-05, "loss": 0.7225, "step": 250 }, { "epoch": 0.11308853345348051, "grad_norm": 0.699728204926948, "learning_rate": 1.9933637127541098e-05, "loss": 0.7412, "step": 251 }, { "epoch": 0.11353908537959, "grad_norm": 0.6902586205720096, "learning_rate": 1.9933092663536384e-05, "loss": 0.7506, "step": 252 }, { "epoch": 0.11398963730569948, "grad_norm": 0.7338678858758704, "learning_rate": 1.993254598265806e-05, "loss": 0.7228, "step": 253 }, { "epoch": 0.11444018923180897, "grad_norm": 0.7533096089834321, "learning_rate": 1.9931997085028132e-05, "loss": 0.6884, "step": 254 }, { "epoch": 0.11489074115791845, "grad_norm": 0.710169430487659, "learning_rate": 1.9931445970769103e-05, "loss": 0.7103, "step": 255 }, { "epoch": 0.11534129308402793, "grad_norm": 0.8367898434114598, "learning_rate": 1.9930892640003977e-05, "loss": 0.69, "step": 256 }, { "epoch": 0.11579184501013742, "grad_norm": 0.6806709151372053, "learning_rate": 1.9930337092856243e-05, "loss": 0.6966, "step": 257 }, { "epoch": 0.1162423969362469, "grad_norm": 0.8160198585968004, "learning_rate": 1.9929779329449886e-05, "loss": 0.7204, "step": 258 }, { "epoch": 0.11669294886235639, "grad_norm": 0.7234718524448546, "learning_rate": 1.9929219349909393e-05, "loss": 0.6909, "step": 259 }, { "epoch": 0.11714350078846587, "grad_norm": 0.6829409164576856, "learning_rate": 1.9928657154359734e-05, "loss": 0.7421, "step": 260 }, { "epoch": 0.11759405271457536, "grad_norm": 0.7237710522202636, "learning_rate": 1.992809274292639e-05, "loss": 0.7008, "step": 261 }, { "epoch": 0.11804460464068484, "grad_norm": 0.696236727288323, "learning_rate": 1.9927526115735315e-05, "loss": 0.6805, "step": 262 }, { "epoch": 0.11849515656679432, "grad_norm": 0.6985369863925316, "learning_rate": 1.9926957272912975e-05, "loss": 0.7231, "step": 263 }, { "epoch": 0.11894570849290381, "grad_norm": 0.7227977133592497, "learning_rate": 1.992638621458633e-05, "loss": 0.7204, "step": 264 }, { "epoch": 0.11939626041901329, "grad_norm": 0.6790491742897864, "learning_rate": 1.992581294088282e-05, "loss": 0.6996, "step": 265 }, { "epoch": 0.11984681234512277, "grad_norm": 0.7867382981390532, "learning_rate": 1.9925237451930392e-05, "loss": 0.7175, "step": 266 }, { "epoch": 0.12029736427123226, "grad_norm": 0.6772757822101494, "learning_rate": 1.9924659747857485e-05, "loss": 0.7586, "step": 267 }, { "epoch": 0.12074791619734174, "grad_norm": 0.7411060672148889, "learning_rate": 1.992407982879303e-05, "loss": 0.7399, "step": 268 }, { "epoch": 0.12119846812345123, "grad_norm": 0.6677271662335825, "learning_rate": 1.9923497694866455e-05, "loss": 0.6997, "step": 269 }, { "epoch": 0.12164902004956071, "grad_norm": 0.7168017734971441, "learning_rate": 1.992291334620768e-05, "loss": 0.7033, "step": 270 }, { "epoch": 0.1220995719756702, "grad_norm": 0.6460257247717686, "learning_rate": 1.992232678294712e-05, "loss": 0.7047, "step": 271 }, { "epoch": 0.12255012390177968, "grad_norm": 0.6588604514967725, "learning_rate": 1.9921738005215687e-05, "loss": 0.687, "step": 272 }, { "epoch": 0.12300067582788916, "grad_norm": 0.709098418141694, "learning_rate": 1.9921147013144782e-05, "loss": 0.6863, "step": 273 }, { "epoch": 0.12345122775399865, "grad_norm": 0.6851681104144931, "learning_rate": 1.99205538068663e-05, "loss": 0.7278, "step": 274 }, { "epoch": 0.12390177968010813, "grad_norm": 0.7482621960384899, "learning_rate": 1.9919958386512644e-05, "loss": 0.7305, "step": 275 }, { "epoch": 0.12435233160621761, "grad_norm": 0.6901055965035382, "learning_rate": 1.9919360752216687e-05, "loss": 0.6858, "step": 276 }, { "epoch": 0.1248028835323271, "grad_norm": 0.6736008970502404, "learning_rate": 1.9918760904111818e-05, "loss": 0.702, "step": 277 }, { "epoch": 0.1252534354584366, "grad_norm": 0.7050804426368975, "learning_rate": 1.991815884233191e-05, "loss": 0.7111, "step": 278 }, { "epoch": 0.12570398738454608, "grad_norm": 0.7534999663667773, "learning_rate": 1.991755456701133e-05, "loss": 0.731, "step": 279 }, { "epoch": 0.12615453931065557, "grad_norm": 0.7211386527458089, "learning_rate": 1.991694807828494e-05, "loss": 0.6459, "step": 280 }, { "epoch": 0.12660509123676503, "grad_norm": 0.7081071795971652, "learning_rate": 1.99163393762881e-05, "loss": 0.7215, "step": 281 }, { "epoch": 0.12705564316287452, "grad_norm": 0.7271778789119016, "learning_rate": 1.991572846115666e-05, "loss": 0.714, "step": 282 }, { "epoch": 0.127506195088984, "grad_norm": 0.7391925634582901, "learning_rate": 1.9915115333026954e-05, "loss": 0.6984, "step": 283 }, { "epoch": 0.1279567470150935, "grad_norm": 0.6960685341467058, "learning_rate": 1.9914499992035838e-05, "loss": 0.6922, "step": 284 }, { "epoch": 0.12840729894120298, "grad_norm": 0.7234711880987357, "learning_rate": 1.9913882438320627e-05, "loss": 0.6957, "step": 285 }, { "epoch": 0.12885785086731247, "grad_norm": 0.7146551329445472, "learning_rate": 1.9913262672019163e-05, "loss": 0.7026, "step": 286 }, { "epoch": 0.12930840279342193, "grad_norm": 0.7596053142960724, "learning_rate": 1.9912640693269754e-05, "loss": 0.7478, "step": 287 }, { "epoch": 0.12975895471953142, "grad_norm": 0.6921595455282982, "learning_rate": 1.9912016502211217e-05, "loss": 0.7105, "step": 288 }, { "epoch": 0.1302095066456409, "grad_norm": 0.8249366907371112, "learning_rate": 1.991139009898286e-05, "loss": 0.7168, "step": 289 }, { "epoch": 0.1306600585717504, "grad_norm": 0.6861412674746409, "learning_rate": 1.9910761483724486e-05, "loss": 0.7073, "step": 290 }, { "epoch": 0.13111061049785988, "grad_norm": 0.7502133701130201, "learning_rate": 1.9910130656576386e-05, "loss": 0.7153, "step": 291 }, { "epoch": 0.13156116242396937, "grad_norm": 0.7315317266784714, "learning_rate": 1.990949761767935e-05, "loss": 0.6963, "step": 292 }, { "epoch": 0.13201171435007886, "grad_norm": 0.6973686407334695, "learning_rate": 1.990886236717466e-05, "loss": 0.6826, "step": 293 }, { "epoch": 0.13246226627618832, "grad_norm": 0.7291158824954145, "learning_rate": 1.990822490520409e-05, "loss": 0.7096, "step": 294 }, { "epoch": 0.1329128182022978, "grad_norm": 0.7513920127233695, "learning_rate": 1.990758523190991e-05, "loss": 0.7024, "step": 295 }, { "epoch": 0.1333633701284073, "grad_norm": 0.6588775778081005, "learning_rate": 1.990694334743489e-05, "loss": 0.7141, "step": 296 }, { "epoch": 0.13381392205451678, "grad_norm": 0.8078323488655571, "learning_rate": 1.9906299251922273e-05, "loss": 0.7006, "step": 297 }, { "epoch": 0.13426447398062627, "grad_norm": 0.6338419302537458, "learning_rate": 1.9905652945515815e-05, "loss": 0.6954, "step": 298 }, { "epoch": 0.13471502590673576, "grad_norm": 0.7666075468561837, "learning_rate": 1.9905004428359765e-05, "loss": 0.685, "step": 299 }, { "epoch": 0.13516557783284525, "grad_norm": 0.6665614469361268, "learning_rate": 1.9904353700598847e-05, "loss": 0.7256, "step": 300 }, { "epoch": 0.1356161297589547, "grad_norm": 0.7639472229358025, "learning_rate": 1.9903700762378303e-05, "loss": 0.7531, "step": 301 }, { "epoch": 0.1360666816850642, "grad_norm": 0.6493355989653458, "learning_rate": 1.9903045613843844e-05, "loss": 0.6676, "step": 302 }, { "epoch": 0.13651723361117368, "grad_norm": 0.8537640696545801, "learning_rate": 1.9902388255141697e-05, "loss": 0.6848, "step": 303 }, { "epoch": 0.13696778553728317, "grad_norm": 0.6553627937756286, "learning_rate": 1.990172868641857e-05, "loss": 0.7052, "step": 304 }, { "epoch": 0.13741833746339266, "grad_norm": 0.8413496898742955, "learning_rate": 1.990106690782166e-05, "loss": 0.7425, "step": 305 }, { "epoch": 0.13786888938950215, "grad_norm": 0.6528667264822138, "learning_rate": 1.990040291949867e-05, "loss": 0.662, "step": 306 }, { "epoch": 0.13831944131561164, "grad_norm": 0.9162273262149863, "learning_rate": 1.9899736721597787e-05, "loss": 0.735, "step": 307 }, { "epoch": 0.1387699932417211, "grad_norm": 0.6696622044689865, "learning_rate": 1.989906831426769e-05, "loss": 0.7034, "step": 308 }, { "epoch": 0.13922054516783058, "grad_norm": 0.8300060716750726, "learning_rate": 1.9898397697657556e-05, "loss": 0.6935, "step": 309 }, { "epoch": 0.13967109709394007, "grad_norm": 0.6404123247640772, "learning_rate": 1.9897724871917057e-05, "loss": 0.6802, "step": 310 }, { "epoch": 0.14012164902004956, "grad_norm": 0.9154595649313165, "learning_rate": 1.989704983719635e-05, "loss": 0.6781, "step": 311 }, { "epoch": 0.14057220094615905, "grad_norm": 0.6763523428410992, "learning_rate": 1.9896372593646095e-05, "loss": 0.6864, "step": 312 }, { "epoch": 0.14102275287226854, "grad_norm": 0.8379795064972306, "learning_rate": 1.9895693141417433e-05, "loss": 0.7018, "step": 313 }, { "epoch": 0.14147330479837802, "grad_norm": 0.7136049761312958, "learning_rate": 1.9895011480662014e-05, "loss": 0.6991, "step": 314 }, { "epoch": 0.14192385672448748, "grad_norm": 0.7706059634994661, "learning_rate": 1.989432761153196e-05, "loss": 0.6927, "step": 315 }, { "epoch": 0.14237440865059697, "grad_norm": 0.7331654612614981, "learning_rate": 1.9893641534179906e-05, "loss": 0.7432, "step": 316 }, { "epoch": 0.14282496057670646, "grad_norm": 0.70603299565265, "learning_rate": 1.989295324875897e-05, "loss": 0.7266, "step": 317 }, { "epoch": 0.14327551250281595, "grad_norm": 0.7052935955829427, "learning_rate": 1.9892262755422758e-05, "loss": 0.6925, "step": 318 }, { "epoch": 0.14372606442892544, "grad_norm": 0.6917362813644842, "learning_rate": 1.9891570054325382e-05, "loss": 0.6915, "step": 319 }, { "epoch": 0.14417661635503493, "grad_norm": 0.7098375834493801, "learning_rate": 1.9890875145621435e-05, "loss": 0.6819, "step": 320 }, { "epoch": 0.1446271682811444, "grad_norm": 0.6679144317099803, "learning_rate": 1.989017802946601e-05, "loss": 0.7082, "step": 321 }, { "epoch": 0.14507772020725387, "grad_norm": 0.7093786111077883, "learning_rate": 1.9889478706014687e-05, "loss": 0.6631, "step": 322 }, { "epoch": 0.14552827213336336, "grad_norm": 0.7093310630791116, "learning_rate": 1.988877717542354e-05, "loss": 0.7023, "step": 323 }, { "epoch": 0.14597882405947285, "grad_norm": 0.6977616923654537, "learning_rate": 1.9888073437849145e-05, "loss": 0.6873, "step": 324 }, { "epoch": 0.14642937598558234, "grad_norm": 0.7325562605765275, "learning_rate": 1.9887367493448556e-05, "loss": 0.7088, "step": 325 }, { "epoch": 0.14687992791169183, "grad_norm": 0.6614985974844839, "learning_rate": 1.9886659342379325e-05, "loss": 0.7318, "step": 326 }, { "epoch": 0.14733047983780131, "grad_norm": 0.734704555648059, "learning_rate": 1.9885948984799502e-05, "loss": 0.7224, "step": 327 }, { "epoch": 0.1477810317639108, "grad_norm": 0.6878403329123667, "learning_rate": 1.988523642086762e-05, "loss": 0.6713, "step": 328 }, { "epoch": 0.14823158369002026, "grad_norm": 0.682913069767994, "learning_rate": 1.9884521650742718e-05, "loss": 0.6776, "step": 329 }, { "epoch": 0.14868213561612975, "grad_norm": 0.6615598976254641, "learning_rate": 1.9883804674584312e-05, "loss": 0.7069, "step": 330 }, { "epoch": 0.14913268754223924, "grad_norm": 0.6642198056324168, "learning_rate": 1.9883085492552415e-05, "loss": 0.698, "step": 331 }, { "epoch": 0.14958323946834873, "grad_norm": 0.650201940018281, "learning_rate": 1.9882364104807536e-05, "loss": 0.7107, "step": 332 }, { "epoch": 0.15003379139445822, "grad_norm": 0.6815266939530114, "learning_rate": 1.9881640511510683e-05, "loss": 0.6964, "step": 333 }, { "epoch": 0.1504843433205677, "grad_norm": 0.669891250625391, "learning_rate": 1.9880914712823338e-05, "loss": 0.6609, "step": 334 }, { "epoch": 0.1509348952466772, "grad_norm": 0.6400231443506212, "learning_rate": 1.9880186708907493e-05, "loss": 0.7027, "step": 335 }, { "epoch": 0.15138544717278665, "grad_norm": 0.7278525246472274, "learning_rate": 1.9879456499925616e-05, "loss": 0.7363, "step": 336 }, { "epoch": 0.15183599909889614, "grad_norm": 0.6351521693190316, "learning_rate": 1.987872408604068e-05, "loss": 0.7038, "step": 337 }, { "epoch": 0.15228655102500563, "grad_norm": 0.7362857818934413, "learning_rate": 1.9877989467416147e-05, "loss": 0.6916, "step": 338 }, { "epoch": 0.15273710295111512, "grad_norm": 0.6536512145356158, "learning_rate": 1.9877252644215965e-05, "loss": 0.7361, "step": 339 }, { "epoch": 0.1531876548772246, "grad_norm": 0.6705605666668448, "learning_rate": 1.9876513616604584e-05, "loss": 0.7082, "step": 340 }, { "epoch": 0.1536382068033341, "grad_norm": 0.6262654893936253, "learning_rate": 1.987577238474694e-05, "loss": 0.681, "step": 341 }, { "epoch": 0.15408875872944358, "grad_norm": 0.6366732003027316, "learning_rate": 1.9875028948808457e-05, "loss": 0.7154, "step": 342 }, { "epoch": 0.15453931065555304, "grad_norm": 0.6545148676673426, "learning_rate": 1.9874283308955058e-05, "loss": 0.7309, "step": 343 }, { "epoch": 0.15498986258166253, "grad_norm": 0.6646722217527633, "learning_rate": 1.9873535465353157e-05, "loss": 0.7114, "step": 344 }, { "epoch": 0.15544041450777202, "grad_norm": 0.6697159564707355, "learning_rate": 1.9872785418169656e-05, "loss": 0.6817, "step": 345 }, { "epoch": 0.1558909664338815, "grad_norm": 0.6652000082565307, "learning_rate": 1.9872033167571952e-05, "loss": 0.6871, "step": 346 }, { "epoch": 0.156341518359991, "grad_norm": 0.7061541346074837, "learning_rate": 1.9871278713727932e-05, "loss": 0.7467, "step": 347 }, { "epoch": 0.15679207028610048, "grad_norm": 0.6439806613867652, "learning_rate": 1.987052205680598e-05, "loss": 0.6823, "step": 348 }, { "epoch": 0.15724262221220997, "grad_norm": 0.6298980308946882, "learning_rate": 1.9869763196974957e-05, "loss": 0.6837, "step": 349 }, { "epoch": 0.15769317413831943, "grad_norm": 0.7143417112133001, "learning_rate": 1.9869002134404235e-05, "loss": 0.7296, "step": 350 }, { "epoch": 0.15814372606442892, "grad_norm": 0.644069860553988, "learning_rate": 1.9868238869263664e-05, "loss": 0.7225, "step": 351 }, { "epoch": 0.1585942779905384, "grad_norm": 0.7010886762512717, "learning_rate": 1.9867473401723595e-05, "loss": 0.702, "step": 352 }, { "epoch": 0.1590448299166479, "grad_norm": 0.667861933422268, "learning_rate": 1.986670573195486e-05, "loss": 0.7527, "step": 353 }, { "epoch": 0.15949538184275738, "grad_norm": 0.6951972684146229, "learning_rate": 1.986593586012879e-05, "loss": 0.698, "step": 354 }, { "epoch": 0.15994593376886687, "grad_norm": 0.6544373528972657, "learning_rate": 1.9865163786417202e-05, "loss": 0.7105, "step": 355 }, { "epoch": 0.16039648569497636, "grad_norm": 0.6433928430686771, "learning_rate": 1.9864389510992414e-05, "loss": 0.6795, "step": 356 }, { "epoch": 0.16084703762108582, "grad_norm": 0.7107411108532492, "learning_rate": 1.9863613034027224e-05, "loss": 0.7117, "step": 357 }, { "epoch": 0.1612975895471953, "grad_norm": 0.6624087322441761, "learning_rate": 1.9862834355694934e-05, "loss": 0.7047, "step": 358 }, { "epoch": 0.1617481414733048, "grad_norm": 0.7004243083935349, "learning_rate": 1.9862053476169318e-05, "loss": 0.7139, "step": 359 }, { "epoch": 0.16219869339941428, "grad_norm": 0.6510377858538963, "learning_rate": 1.9861270395624665e-05, "loss": 0.7233, "step": 360 }, { "epoch": 0.16264924532552377, "grad_norm": 0.6799679834839087, "learning_rate": 1.986048511423574e-05, "loss": 0.7012, "step": 361 }, { "epoch": 0.16309979725163326, "grad_norm": 0.6284062035271012, "learning_rate": 1.9859697632177796e-05, "loss": 0.696, "step": 362 }, { "epoch": 0.16355034917774275, "grad_norm": 0.6408467124947135, "learning_rate": 1.985890794962659e-05, "loss": 0.693, "step": 363 }, { "epoch": 0.1640009011038522, "grad_norm": 0.7377058513093833, "learning_rate": 1.9858116066758362e-05, "loss": 0.7006, "step": 364 }, { "epoch": 0.1644514530299617, "grad_norm": 0.6424972510753616, "learning_rate": 1.985732198374985e-05, "loss": 0.6782, "step": 365 }, { "epoch": 0.16490200495607119, "grad_norm": 0.7083687145195159, "learning_rate": 1.985652570077827e-05, "loss": 0.7006, "step": 366 }, { "epoch": 0.16535255688218067, "grad_norm": 0.8097650461114795, "learning_rate": 1.985572721802134e-05, "loss": 0.7036, "step": 367 }, { "epoch": 0.16580310880829016, "grad_norm": 0.6173985956805347, "learning_rate": 1.985492653565727e-05, "loss": 0.7486, "step": 368 }, { "epoch": 0.16625366073439965, "grad_norm": 0.7686189072584315, "learning_rate": 1.985412365386475e-05, "loss": 0.717, "step": 369 }, { "epoch": 0.1667042126605091, "grad_norm": 0.6539079598823289, "learning_rate": 1.985331857282297e-05, "loss": 0.7028, "step": 370 }, { "epoch": 0.1671547645866186, "grad_norm": 0.7475243004113215, "learning_rate": 1.985251129271161e-05, "loss": 0.7351, "step": 371 }, { "epoch": 0.1676053165127281, "grad_norm": 0.6840007977472856, "learning_rate": 1.9851701813710838e-05, "loss": 0.7218, "step": 372 }, { "epoch": 0.16805586843883757, "grad_norm": 0.7209523218706684, "learning_rate": 1.9850890136001314e-05, "loss": 0.7476, "step": 373 }, { "epoch": 0.16850642036494706, "grad_norm": 0.6856732198932168, "learning_rate": 1.9850076259764187e-05, "loss": 0.7275, "step": 374 }, { "epoch": 0.16895697229105655, "grad_norm": 0.6333474909579239, "learning_rate": 1.9849260185181105e-05, "loss": 0.7035, "step": 375 }, { "epoch": 0.16940752421716604, "grad_norm": 0.6988768838158922, "learning_rate": 1.9848441912434194e-05, "loss": 0.7147, "step": 376 }, { "epoch": 0.1698580761432755, "grad_norm": 0.6246906905015376, "learning_rate": 1.9847621441706076e-05, "loss": 0.7052, "step": 377 }, { "epoch": 0.170308628069385, "grad_norm": 0.6623621315685918, "learning_rate": 1.9846798773179866e-05, "loss": 0.719, "step": 378 }, { "epoch": 0.17075917999549448, "grad_norm": 0.615932361478822, "learning_rate": 1.984597390703917e-05, "loss": 0.707, "step": 379 }, { "epoch": 0.17120973192160396, "grad_norm": 0.6484161605093487, "learning_rate": 1.984514684346808e-05, "loss": 0.7263, "step": 380 }, { "epoch": 0.17166028384771345, "grad_norm": 0.6476791704288226, "learning_rate": 1.9844317582651178e-05, "loss": 0.687, "step": 381 }, { "epoch": 0.17211083577382294, "grad_norm": 0.6672880468257143, "learning_rate": 1.9843486124773546e-05, "loss": 0.7695, "step": 382 }, { "epoch": 0.17256138769993243, "grad_norm": 0.6435063885632898, "learning_rate": 1.984265247002074e-05, "loss": 0.702, "step": 383 }, { "epoch": 0.1730119396260419, "grad_norm": 0.6434195730693051, "learning_rate": 1.9841816618578824e-05, "loss": 0.6822, "step": 384 }, { "epoch": 0.17346249155215138, "grad_norm": 0.6223572532377251, "learning_rate": 1.9840978570634338e-05, "loss": 0.7118, "step": 385 }, { "epoch": 0.17391304347826086, "grad_norm": 0.636909765498188, "learning_rate": 1.984013832637432e-05, "loss": 0.7137, "step": 386 }, { "epoch": 0.17436359540437035, "grad_norm": 0.6089804486899985, "learning_rate": 1.98392958859863e-05, "loss": 0.7041, "step": 387 }, { "epoch": 0.17481414733047984, "grad_norm": 0.680714632795083, "learning_rate": 1.9838451249658285e-05, "loss": 0.7254, "step": 388 }, { "epoch": 0.17526469925658933, "grad_norm": 0.6652354081900853, "learning_rate": 1.9837604417578792e-05, "loss": 0.7186, "step": 389 }, { "epoch": 0.17571525118269882, "grad_norm": 0.6843688341564094, "learning_rate": 1.983675538993681e-05, "loss": 0.7367, "step": 390 }, { "epoch": 0.17616580310880828, "grad_norm": 0.6699775863352361, "learning_rate": 1.983590416692183e-05, "loss": 0.6912, "step": 391 }, { "epoch": 0.17661635503491777, "grad_norm": 0.6619185216194325, "learning_rate": 1.9835050748723826e-05, "loss": 0.6901, "step": 392 }, { "epoch": 0.17706690696102725, "grad_norm": 0.7002835182706632, "learning_rate": 1.9834195135533264e-05, "loss": 0.72, "step": 393 }, { "epoch": 0.17751745888713674, "grad_norm": 0.6746465059996118, "learning_rate": 1.98333373275411e-05, "loss": 0.7169, "step": 394 }, { "epoch": 0.17796801081324623, "grad_norm": 0.6553884719475779, "learning_rate": 1.9832477324938787e-05, "loss": 0.7306, "step": 395 }, { "epoch": 0.17841856273935572, "grad_norm": 0.7189859692024404, "learning_rate": 1.983161512791825e-05, "loss": 0.6835, "step": 396 }, { "epoch": 0.1788691146654652, "grad_norm": 0.6342920696571585, "learning_rate": 1.9830750736671923e-05, "loss": 0.6813, "step": 397 }, { "epoch": 0.17931966659157467, "grad_norm": 0.7484980684907374, "learning_rate": 1.9829884151392717e-05, "loss": 0.706, "step": 398 }, { "epoch": 0.17977021851768415, "grad_norm": 0.6839161946636343, "learning_rate": 1.982901537227404e-05, "loss": 0.6787, "step": 399 }, { "epoch": 0.18022077044379364, "grad_norm": 0.6219111105154371, "learning_rate": 1.9828144399509782e-05, "loss": 0.7333, "step": 400 }, { "epoch": 0.18067132236990313, "grad_norm": 0.6891918163058227, "learning_rate": 1.982727123329433e-05, "loss": 0.6928, "step": 401 }, { "epoch": 0.18112187429601262, "grad_norm": 0.6354470115317563, "learning_rate": 1.982639587382256e-05, "loss": 0.7051, "step": 402 }, { "epoch": 0.1815724262221221, "grad_norm": 0.7193897776093219, "learning_rate": 1.9825518321289837e-05, "loss": 0.6758, "step": 403 }, { "epoch": 0.1820229781482316, "grad_norm": 0.6213760772676645, "learning_rate": 1.9824638575892003e-05, "loss": 0.666, "step": 404 }, { "epoch": 0.18247353007434106, "grad_norm": 0.6884414923435983, "learning_rate": 1.9823756637825412e-05, "loss": 0.7033, "step": 405 }, { "epoch": 0.18292408200045054, "grad_norm": 0.6720513000789211, "learning_rate": 1.982287250728689e-05, "loss": 0.693, "step": 406 }, { "epoch": 0.18337463392656003, "grad_norm": 0.6224031604365896, "learning_rate": 1.9821986184473757e-05, "loss": 0.7014, "step": 407 }, { "epoch": 0.18382518585266952, "grad_norm": 0.6486850556410484, "learning_rate": 1.982109766958382e-05, "loss": 0.6868, "step": 408 }, { "epoch": 0.184275737778779, "grad_norm": 0.6259700961451686, "learning_rate": 1.9820206962815392e-05, "loss": 0.6988, "step": 409 }, { "epoch": 0.1847262897048885, "grad_norm": 0.6446090604239773, "learning_rate": 1.9819314064367247e-05, "loss": 0.7002, "step": 410 }, { "epoch": 0.18517684163099798, "grad_norm": 0.6316086842549307, "learning_rate": 1.981841897443867e-05, "loss": 0.7268, "step": 411 }, { "epoch": 0.18562739355710745, "grad_norm": 0.6796805684820945, "learning_rate": 1.981752169322942e-05, "loss": 0.7225, "step": 412 }, { "epoch": 0.18607794548321693, "grad_norm": 0.611187593531429, "learning_rate": 1.9816622220939762e-05, "loss": 0.704, "step": 413 }, { "epoch": 0.18652849740932642, "grad_norm": 0.6747612993073943, "learning_rate": 1.9815720557770436e-05, "loss": 0.6382, "step": 414 }, { "epoch": 0.1869790493354359, "grad_norm": 0.6803044768768016, "learning_rate": 1.9814816703922678e-05, "loss": 0.6905, "step": 415 }, { "epoch": 0.1874296012615454, "grad_norm": 0.6465986988411903, "learning_rate": 1.9813910659598208e-05, "loss": 0.7223, "step": 416 }, { "epoch": 0.18788015318765489, "grad_norm": 0.6406841977174035, "learning_rate": 1.981300242499924e-05, "loss": 0.6867, "step": 417 }, { "epoch": 0.18833070511376437, "grad_norm": 0.6152196730504224, "learning_rate": 1.9812092000328474e-05, "loss": 0.6631, "step": 418 }, { "epoch": 0.18878125703987383, "grad_norm": 0.6873856092573581, "learning_rate": 1.9811179385789098e-05, "loss": 0.6626, "step": 419 }, { "epoch": 0.18923180896598332, "grad_norm": 0.624909052725759, "learning_rate": 1.981026458158479e-05, "loss": 0.6826, "step": 420 }, { "epoch": 0.1896823608920928, "grad_norm": 0.6572531603640082, "learning_rate": 1.9809347587919716e-05, "loss": 0.7184, "step": 421 }, { "epoch": 0.1901329128182023, "grad_norm": 0.6932216412763768, "learning_rate": 1.9808428404998532e-05, "loss": 0.7419, "step": 422 }, { "epoch": 0.1905834647443118, "grad_norm": 0.7219427394394876, "learning_rate": 1.980750703302638e-05, "loss": 0.7029, "step": 423 }, { "epoch": 0.19103401667042127, "grad_norm": 0.6447783352377131, "learning_rate": 1.98065834722089e-05, "loss": 0.7137, "step": 424 }, { "epoch": 0.19148456859653076, "grad_norm": 0.6848881028793202, "learning_rate": 1.9805657722752202e-05, "loss": 0.7175, "step": 425 }, { "epoch": 0.19193512052264022, "grad_norm": 0.7418492516796019, "learning_rate": 1.98047297848629e-05, "loss": 0.7119, "step": 426 }, { "epoch": 0.1923856724487497, "grad_norm": 0.7081459145030108, "learning_rate": 1.9803799658748096e-05, "loss": 0.6917, "step": 427 }, { "epoch": 0.1928362243748592, "grad_norm": 0.7016494345461589, "learning_rate": 1.980286734461537e-05, "loss": 0.7025, "step": 428 }, { "epoch": 0.1932867763009687, "grad_norm": 0.7779329507974021, "learning_rate": 1.9801932842672796e-05, "loss": 0.6746, "step": 429 }, { "epoch": 0.19373732822707818, "grad_norm": 0.7064784637796473, "learning_rate": 1.9800996153128942e-05, "loss": 0.6866, "step": 430 }, { "epoch": 0.19418788015318766, "grad_norm": 0.7523225669188083, "learning_rate": 1.9800057276192856e-05, "loss": 0.6223, "step": 431 }, { "epoch": 0.19463843207929715, "grad_norm": 0.7338675159561382, "learning_rate": 1.9799116212074077e-05, "loss": 0.6736, "step": 432 }, { "epoch": 0.1950889840054066, "grad_norm": 0.7679677605645412, "learning_rate": 1.9798172960982636e-05, "loss": 0.6845, "step": 433 }, { "epoch": 0.1955395359315161, "grad_norm": 0.6737427848022348, "learning_rate": 1.979722752312904e-05, "loss": 0.6932, "step": 434 }, { "epoch": 0.1959900878576256, "grad_norm": 0.7932945071330098, "learning_rate": 1.97962798987243e-05, "loss": 0.6851, "step": 435 }, { "epoch": 0.19644063978373508, "grad_norm": 0.7782072943349446, "learning_rate": 1.9795330087979905e-05, "loss": 0.6912, "step": 436 }, { "epoch": 0.19689119170984457, "grad_norm": 0.6439183705315292, "learning_rate": 1.9794378091107834e-05, "loss": 0.7335, "step": 437 }, { "epoch": 0.19734174363595405, "grad_norm": 0.7653725288479553, "learning_rate": 1.9793423908320554e-05, "loss": 0.6889, "step": 438 }, { "epoch": 0.19779229556206354, "grad_norm": 0.6671903289405967, "learning_rate": 1.9792467539831018e-05, "loss": 0.7246, "step": 439 }, { "epoch": 0.198242847488173, "grad_norm": 0.7963795727565973, "learning_rate": 1.9791508985852676e-05, "loss": 0.6839, "step": 440 }, { "epoch": 0.1986933994142825, "grad_norm": 0.643977929667029, "learning_rate": 1.9790548246599447e-05, "loss": 0.6875, "step": 441 }, { "epoch": 0.19914395134039198, "grad_norm": 0.7907889136722032, "learning_rate": 1.978958532228576e-05, "loss": 0.6737, "step": 442 }, { "epoch": 0.19959450326650147, "grad_norm": 0.741312633832084, "learning_rate": 1.978862021312652e-05, "loss": 0.7048, "step": 443 }, { "epoch": 0.20004505519261095, "grad_norm": 0.7446935305334224, "learning_rate": 1.978765291933712e-05, "loss": 0.7141, "step": 444 }, { "epoch": 0.20049560711872044, "grad_norm": 0.7416116056530043, "learning_rate": 1.9786683441133432e-05, "loss": 0.7161, "step": 445 }, { "epoch": 0.2009461590448299, "grad_norm": 0.744836201162258, "learning_rate": 1.978571177873183e-05, "loss": 0.6774, "step": 446 }, { "epoch": 0.2013967109709394, "grad_norm": 0.7283474418712145, "learning_rate": 1.978473793234918e-05, "loss": 0.7658, "step": 447 }, { "epoch": 0.20184726289704888, "grad_norm": 0.7404943948017539, "learning_rate": 1.9783761902202814e-05, "loss": 0.6829, "step": 448 }, { "epoch": 0.20229781482315837, "grad_norm": 0.7000213016959208, "learning_rate": 1.978278368851057e-05, "loss": 0.7135, "step": 449 }, { "epoch": 0.20274836674926786, "grad_norm": 0.7401298027164153, "learning_rate": 1.978180329149076e-05, "loss": 0.7046, "step": 450 }, { "epoch": 0.20319891867537734, "grad_norm": 0.8033252772553693, "learning_rate": 1.9780820711362193e-05, "loss": 0.6701, "step": 451 }, { "epoch": 0.20364947060148683, "grad_norm": 0.7296781148217782, "learning_rate": 1.977983594834416e-05, "loss": 0.7122, "step": 452 }, { "epoch": 0.2041000225275963, "grad_norm": 0.890050486930849, "learning_rate": 1.9778849002656446e-05, "loss": 0.7078, "step": 453 }, { "epoch": 0.20455057445370578, "grad_norm": 0.6557915832543912, "learning_rate": 1.9777859874519312e-05, "loss": 0.7173, "step": 454 }, { "epoch": 0.20500112637981527, "grad_norm": 0.7293000194788269, "learning_rate": 1.9776868564153517e-05, "loss": 0.6762, "step": 455 }, { "epoch": 0.20545167830592476, "grad_norm": 0.6899414010169248, "learning_rate": 1.97758750717803e-05, "loss": 0.7373, "step": 456 }, { "epoch": 0.20590223023203424, "grad_norm": 0.785152966588635, "learning_rate": 1.9774879397621387e-05, "loss": 0.6979, "step": 457 }, { "epoch": 0.20635278215814373, "grad_norm": 0.6942115876249008, "learning_rate": 1.9773881541898996e-05, "loss": 0.7006, "step": 458 }, { "epoch": 0.20680333408425322, "grad_norm": 0.771631059867679, "learning_rate": 1.977288150483583e-05, "loss": 0.689, "step": 459 }, { "epoch": 0.20725388601036268, "grad_norm": 0.67810353370607, "learning_rate": 1.9771879286655074e-05, "loss": 0.6848, "step": 460 }, { "epoch": 0.20770443793647217, "grad_norm": 0.6805558995005658, "learning_rate": 1.977087488758041e-05, "loss": 0.674, "step": 461 }, { "epoch": 0.20815498986258166, "grad_norm": 0.6737971262681217, "learning_rate": 1.9769868307835996e-05, "loss": 0.6993, "step": 462 }, { "epoch": 0.20860554178869115, "grad_norm": 0.7861225556485847, "learning_rate": 1.9768859547646476e-05, "loss": 0.6913, "step": 463 }, { "epoch": 0.20905609371480063, "grad_norm": 0.687246651473198, "learning_rate": 1.9767848607236997e-05, "loss": 0.6995, "step": 464 }, { "epoch": 0.20950664564091012, "grad_norm": 0.7270690202448106, "learning_rate": 1.9766835486833173e-05, "loss": 0.6976, "step": 465 }, { "epoch": 0.2099571975670196, "grad_norm": 0.6965289941548458, "learning_rate": 1.9765820186661116e-05, "loss": 0.6786, "step": 466 }, { "epoch": 0.21040774949312907, "grad_norm": 0.6405702362717197, "learning_rate": 1.9764802706947423e-05, "loss": 0.7064, "step": 467 }, { "epoch": 0.21085830141923856, "grad_norm": 0.6924031596822716, "learning_rate": 1.976378304791917e-05, "loss": 0.6608, "step": 468 }, { "epoch": 0.21130885334534805, "grad_norm": 0.6652840354683414, "learning_rate": 1.976276120980393e-05, "loss": 0.7163, "step": 469 }, { "epoch": 0.21175940527145753, "grad_norm": 0.6750592531164503, "learning_rate": 1.9761737192829753e-05, "loss": 0.6531, "step": 470 }, { "epoch": 0.21220995719756702, "grad_norm": 0.6318513997840024, "learning_rate": 1.9760710997225187e-05, "loss": 0.6613, "step": 471 }, { "epoch": 0.2126605091236765, "grad_norm": 0.6505840741971444, "learning_rate": 1.975968262321925e-05, "loss": 0.7079, "step": 472 }, { "epoch": 0.213111061049786, "grad_norm": 0.6132161108784739, "learning_rate": 1.9758652071041463e-05, "loss": 0.7192, "step": 473 }, { "epoch": 0.21356161297589546, "grad_norm": 0.673645234713038, "learning_rate": 1.9757619340921824e-05, "loss": 0.7267, "step": 474 }, { "epoch": 0.21401216490200495, "grad_norm": 0.6094059114650869, "learning_rate": 1.9756584433090812e-05, "loss": 0.6626, "step": 475 }, { "epoch": 0.21446271682811444, "grad_norm": 0.6729948303800201, "learning_rate": 1.9755547347779405e-05, "loss": 0.705, "step": 476 }, { "epoch": 0.21491326875422392, "grad_norm": 0.6356930893610828, "learning_rate": 1.9754508085219057e-05, "loss": 0.7134, "step": 477 }, { "epoch": 0.2153638206803334, "grad_norm": 0.6234074147505199, "learning_rate": 1.9753466645641713e-05, "loss": 0.7059, "step": 478 }, { "epoch": 0.2158143726064429, "grad_norm": 0.6096140141872605, "learning_rate": 1.97524230292798e-05, "loss": 0.6451, "step": 479 }, { "epoch": 0.2162649245325524, "grad_norm": 0.6423300579721407, "learning_rate": 1.9751377236366237e-05, "loss": 0.7235, "step": 480 }, { "epoch": 0.21671547645866185, "grad_norm": 0.6274356872080136, "learning_rate": 1.9750329267134417e-05, "loss": 0.6702, "step": 481 }, { "epoch": 0.21716602838477134, "grad_norm": 0.6793839970569187, "learning_rate": 1.9749279121818235e-05, "loss": 0.6903, "step": 482 }, { "epoch": 0.21761658031088082, "grad_norm": 0.6548444035858052, "learning_rate": 1.9748226800652062e-05, "loss": 0.6879, "step": 483 }, { "epoch": 0.2180671322369903, "grad_norm": 0.6371871803149554, "learning_rate": 1.974717230387075e-05, "loss": 0.673, "step": 484 }, { "epoch": 0.2185176841630998, "grad_norm": 0.6408377405518155, "learning_rate": 1.9746115631709645e-05, "loss": 0.6765, "step": 485 }, { "epoch": 0.2189682360892093, "grad_norm": 0.6188332748955431, "learning_rate": 1.974505678440458e-05, "loss": 0.695, "step": 486 }, { "epoch": 0.21941878801531878, "grad_norm": 0.6034423778919993, "learning_rate": 1.974399576219186e-05, "loss": 0.6571, "step": 487 }, { "epoch": 0.21986933994142824, "grad_norm": 0.6809070934755834, "learning_rate": 1.9742932565308293e-05, "loss": 0.6766, "step": 488 }, { "epoch": 0.22031989186753773, "grad_norm": 0.6190532899941582, "learning_rate": 1.974186719399116e-05, "loss": 0.6944, "step": 489 }, { "epoch": 0.22077044379364721, "grad_norm": 0.6380838786116135, "learning_rate": 1.9740799648478233e-05, "loss": 0.689, "step": 490 }, { "epoch": 0.2212209957197567, "grad_norm": 0.6103877312353424, "learning_rate": 1.9739729929007762e-05, "loss": 0.7091, "step": 491 }, { "epoch": 0.2216715476458662, "grad_norm": 0.6754666233457889, "learning_rate": 1.9738658035818495e-05, "loss": 0.7087, "step": 492 }, { "epoch": 0.22212209957197568, "grad_norm": 0.7021060030503081, "learning_rate": 1.973758396914966e-05, "loss": 0.6783, "step": 493 }, { "epoch": 0.22257265149808517, "grad_norm": 0.636675626211076, "learning_rate": 1.9736507729240957e-05, "loss": 0.6984, "step": 494 }, { "epoch": 0.22302320342419463, "grad_norm": 0.6809914749170347, "learning_rate": 1.973542931633259e-05, "loss": 0.6685, "step": 495 }, { "epoch": 0.22347375535030412, "grad_norm": 0.6007150605816346, "learning_rate": 1.9734348730665233e-05, "loss": 0.7006, "step": 496 }, { "epoch": 0.2239243072764136, "grad_norm": 0.6356146925107621, "learning_rate": 1.973326597248006e-05, "loss": 0.7139, "step": 497 }, { "epoch": 0.2243748592025231, "grad_norm": 0.6064583503105897, "learning_rate": 1.9732181042018718e-05, "loss": 0.6796, "step": 498 }, { "epoch": 0.22482541112863258, "grad_norm": 0.596129948545187, "learning_rate": 1.973109393952334e-05, "loss": 0.704, "step": 499 }, { "epoch": 0.22527596305474207, "grad_norm": 0.5928536074138244, "learning_rate": 1.973000466523655e-05, "loss": 0.6687, "step": 500 }, { "epoch": 0.22572651498085156, "grad_norm": 0.6555276865325836, "learning_rate": 1.972891321940145e-05, "loss": 0.6914, "step": 501 }, { "epoch": 0.22617706690696102, "grad_norm": 0.5866029820664469, "learning_rate": 1.972781960226163e-05, "loss": 0.7187, "step": 502 }, { "epoch": 0.2266276188330705, "grad_norm": 0.6257854325851996, "learning_rate": 1.9726723814061168e-05, "loss": 0.7037, "step": 503 }, { "epoch": 0.22707817075918, "grad_norm": 0.6270768080573341, "learning_rate": 1.972562585504462e-05, "loss": 0.6944, "step": 504 }, { "epoch": 0.22752872268528948, "grad_norm": 0.6074270021926919, "learning_rate": 1.9724525725457027e-05, "loss": 0.6743, "step": 505 }, { "epoch": 0.22797927461139897, "grad_norm": 0.6424576530842272, "learning_rate": 1.972342342554392e-05, "loss": 0.687, "step": 506 }, { "epoch": 0.22842982653750846, "grad_norm": 0.6073742004755304, "learning_rate": 1.9722318955551307e-05, "loss": 0.6648, "step": 507 }, { "epoch": 0.22888037846361795, "grad_norm": 0.5908828695314912, "learning_rate": 1.972121231572569e-05, "loss": 0.6745, "step": 508 }, { "epoch": 0.2293309303897274, "grad_norm": 0.6341666432762236, "learning_rate": 1.9720103506314042e-05, "loss": 0.7033, "step": 509 }, { "epoch": 0.2297814823158369, "grad_norm": 0.6098067135154692, "learning_rate": 1.9718992527563833e-05, "loss": 0.6678, "step": 510 }, { "epoch": 0.23023203424194638, "grad_norm": 0.6562077715387611, "learning_rate": 1.9717879379723012e-05, "loss": 0.7075, "step": 511 }, { "epoch": 0.23068258616805587, "grad_norm": 0.6467041860715018, "learning_rate": 1.971676406304001e-05, "loss": 0.6707, "step": 512 }, { "epoch": 0.23113313809416536, "grad_norm": 0.624866020413649, "learning_rate": 1.971564657776375e-05, "loss": 0.6852, "step": 513 }, { "epoch": 0.23158369002027485, "grad_norm": 0.7511723992271656, "learning_rate": 1.971452692414362e-05, "loss": 0.689, "step": 514 }, { "epoch": 0.23203424194638433, "grad_norm": 0.6193784760269153, "learning_rate": 1.9713405102429516e-05, "loss": 0.6857, "step": 515 }, { "epoch": 0.2324847938724938, "grad_norm": 0.7098111562771399, "learning_rate": 1.9712281112871805e-05, "loss": 0.7405, "step": 516 }, { "epoch": 0.23293534579860328, "grad_norm": 0.6343834995365052, "learning_rate": 1.9711154955721338e-05, "loss": 0.7056, "step": 517 }, { "epoch": 0.23338589772471277, "grad_norm": 0.6210818547101881, "learning_rate": 1.971002663122945e-05, "loss": 0.6911, "step": 518 }, { "epoch": 0.23383644965082226, "grad_norm": 0.6783325385750062, "learning_rate": 1.9708896139647963e-05, "loss": 0.7136, "step": 519 }, { "epoch": 0.23428700157693175, "grad_norm": 0.6173286600054512, "learning_rate": 1.9707763481229182e-05, "loss": 0.6788, "step": 520 }, { "epoch": 0.23473755350304124, "grad_norm": 0.6183671527598938, "learning_rate": 1.9706628656225896e-05, "loss": 0.6972, "step": 521 }, { "epoch": 0.23518810542915072, "grad_norm": 0.6147722250841676, "learning_rate": 1.9705491664891368e-05, "loss": 0.7147, "step": 522 }, { "epoch": 0.23563865735526018, "grad_norm": 0.6581179457333659, "learning_rate": 1.970435250747936e-05, "loss": 0.6985, "step": 523 }, { "epoch": 0.23608920928136967, "grad_norm": 0.6557853142550467, "learning_rate": 1.9703211184244108e-05, "loss": 0.6906, "step": 524 }, { "epoch": 0.23653976120747916, "grad_norm": 0.609324824232077, "learning_rate": 1.9702067695440333e-05, "loss": 0.6944, "step": 525 }, { "epoch": 0.23699031313358865, "grad_norm": 0.6476922057235207, "learning_rate": 1.9700922041323237e-05, "loss": 0.675, "step": 526 }, { "epoch": 0.23744086505969814, "grad_norm": 0.6125791780290308, "learning_rate": 1.969977422214851e-05, "loss": 0.6866, "step": 527 }, { "epoch": 0.23789141698580762, "grad_norm": 0.6822095343862661, "learning_rate": 1.9698624238172326e-05, "loss": 0.7322, "step": 528 }, { "epoch": 0.23834196891191708, "grad_norm": 0.6555502081048908, "learning_rate": 1.969747208965133e-05, "loss": 0.6965, "step": 529 }, { "epoch": 0.23879252083802657, "grad_norm": 0.611890774376181, "learning_rate": 1.969631777684267e-05, "loss": 0.69, "step": 530 }, { "epoch": 0.23924307276413606, "grad_norm": 0.6355726623078007, "learning_rate": 1.969516130000396e-05, "loss": 0.6974, "step": 531 }, { "epoch": 0.23969362469024555, "grad_norm": 0.5826104753804613, "learning_rate": 1.9694002659393306e-05, "loss": 0.7251, "step": 532 }, { "epoch": 0.24014417661635504, "grad_norm": 0.6144395764376858, "learning_rate": 1.969284185526929e-05, "loss": 0.6993, "step": 533 }, { "epoch": 0.24059472854246453, "grad_norm": 0.6234576003684122, "learning_rate": 1.9691678887890987e-05, "loss": 0.6763, "step": 534 }, { "epoch": 0.241045280468574, "grad_norm": 0.6155748993052974, "learning_rate": 1.9690513757517942e-05, "loss": 0.7093, "step": 535 }, { "epoch": 0.24149583239468347, "grad_norm": 0.6099688054437925, "learning_rate": 1.9689346464410195e-05, "loss": 0.6699, "step": 536 }, { "epoch": 0.24194638432079296, "grad_norm": 0.6315101740475216, "learning_rate": 1.968817700882826e-05, "loss": 0.7014, "step": 537 }, { "epoch": 0.24239693624690245, "grad_norm": 0.6537932824476593, "learning_rate": 1.968700539103314e-05, "loss": 0.7236, "step": 538 }, { "epoch": 0.24284748817301194, "grad_norm": 0.6133930341430358, "learning_rate": 1.9685831611286312e-05, "loss": 0.6821, "step": 539 }, { "epoch": 0.24329804009912143, "grad_norm": 0.6278599886016945, "learning_rate": 1.9684655669849747e-05, "loss": 0.6883, "step": 540 }, { "epoch": 0.24374859202523091, "grad_norm": 0.6428233444805903, "learning_rate": 1.968347756698589e-05, "loss": 0.6594, "step": 541 }, { "epoch": 0.2441991439513404, "grad_norm": 0.6157948111143834, "learning_rate": 1.9682297302957666e-05, "loss": 0.6911, "step": 542 }, { "epoch": 0.24464969587744986, "grad_norm": 0.7336357137224443, "learning_rate": 1.9681114878028494e-05, "loss": 0.6541, "step": 543 }, { "epoch": 0.24510024780355935, "grad_norm": 0.7196140473189447, "learning_rate": 1.9679930292462265e-05, "loss": 0.7174, "step": 544 }, { "epoch": 0.24555079972966884, "grad_norm": 0.5859854702251692, "learning_rate": 1.9678743546523357e-05, "loss": 0.6646, "step": 545 }, { "epoch": 0.24600135165577833, "grad_norm": 0.7631857775864505, "learning_rate": 1.9677554640476625e-05, "loss": 0.6786, "step": 546 }, { "epoch": 0.24645190358188782, "grad_norm": 0.6282306526370739, "learning_rate": 1.9676363574587414e-05, "loss": 0.705, "step": 547 }, { "epoch": 0.2469024555079973, "grad_norm": 0.626084399446757, "learning_rate": 1.9675170349121545e-05, "loss": 0.6816, "step": 548 }, { "epoch": 0.2473530074341068, "grad_norm": 0.6910345602486441, "learning_rate": 1.9673974964345327e-05, "loss": 0.7154, "step": 549 }, { "epoch": 0.24780355936021625, "grad_norm": 0.6084888837144801, "learning_rate": 1.967277742052554e-05, "loss": 0.6564, "step": 550 }, { "epoch": 0.24825411128632574, "grad_norm": 0.6616809208650266, "learning_rate": 1.9671577717929453e-05, "loss": 0.6319, "step": 551 }, { "epoch": 0.24870466321243523, "grad_norm": 0.627238664531889, "learning_rate": 1.9670375856824823e-05, "loss": 0.6946, "step": 552 }, { "epoch": 0.24915521513854472, "grad_norm": 0.6404878896222891, "learning_rate": 1.966917183747987e-05, "loss": 0.6863, "step": 553 }, { "epoch": 0.2496057670646542, "grad_norm": 0.693932089677475, "learning_rate": 1.9667965660163323e-05, "loss": 0.6829, "step": 554 }, { "epoch": 0.2500563189907637, "grad_norm": 0.6413480204624329, "learning_rate": 1.9666757325144366e-05, "loss": 0.6962, "step": 555 }, { "epoch": 0.2500563189907637, "eval_loss": 0.6636229753494263, "eval_runtime": 24.4127, "eval_samples_per_second": 11.428, "eval_steps_per_second": 0.492, "step": 555 }, { "epoch": 0.2505068709168732, "grad_norm": 0.6651691510191432, "learning_rate": 1.9665546832692682e-05, "loss": 0.6917, "step": 556 }, { "epoch": 0.25095742284298267, "grad_norm": 0.6214183703937776, "learning_rate": 1.966433418307843e-05, "loss": 0.7066, "step": 557 }, { "epoch": 0.25140797476909216, "grad_norm": 0.6197154623962617, "learning_rate": 1.966311937657224e-05, "loss": 0.6825, "step": 558 }, { "epoch": 0.25185852669520165, "grad_norm": 0.6298062668464153, "learning_rate": 1.9661902413445246e-05, "loss": 0.6968, "step": 559 }, { "epoch": 0.25230907862131113, "grad_norm": 0.7081723461059983, "learning_rate": 1.9660683293969042e-05, "loss": 0.7033, "step": 560 }, { "epoch": 0.25275963054742057, "grad_norm": 0.5906051698106366, "learning_rate": 1.965946201841572e-05, "loss": 0.7025, "step": 561 }, { "epoch": 0.25321018247353005, "grad_norm": 0.6133025948073277, "learning_rate": 1.9658238587057832e-05, "loss": 0.6876, "step": 562 }, { "epoch": 0.25366073439963954, "grad_norm": 0.6052065544924956, "learning_rate": 1.9657013000168438e-05, "loss": 0.7335, "step": 563 }, { "epoch": 0.25411128632574903, "grad_norm": 0.6065744663930799, "learning_rate": 1.9655785258021053e-05, "loss": 0.705, "step": 564 }, { "epoch": 0.2545618382518585, "grad_norm": 0.6313371792834113, "learning_rate": 1.9654555360889697e-05, "loss": 0.6557, "step": 565 }, { "epoch": 0.255012390177968, "grad_norm": 0.627994597832757, "learning_rate": 1.965332330904885e-05, "loss": 0.6776, "step": 566 }, { "epoch": 0.2554629421040775, "grad_norm": 0.6174168271886391, "learning_rate": 1.9652089102773487e-05, "loss": 0.6968, "step": 567 }, { "epoch": 0.255913494030187, "grad_norm": 0.6084366481306376, "learning_rate": 1.965085274233906e-05, "loss": 0.6637, "step": 568 }, { "epoch": 0.25636404595629647, "grad_norm": 0.6245655088706182, "learning_rate": 1.9649614228021497e-05, "loss": 0.6923, "step": 569 }, { "epoch": 0.25681459788240596, "grad_norm": 0.5722296099501842, "learning_rate": 1.964837356009721e-05, "loss": 0.6904, "step": 570 }, { "epoch": 0.25726514980851545, "grad_norm": 0.629734673935994, "learning_rate": 1.964713073884309e-05, "loss": 0.7067, "step": 571 }, { "epoch": 0.25771570173462494, "grad_norm": 0.6378743025981062, "learning_rate": 1.9645885764536522e-05, "loss": 0.7193, "step": 572 }, { "epoch": 0.2581662536607344, "grad_norm": 0.6163805755071894, "learning_rate": 1.9644638637455348e-05, "loss": 0.6873, "step": 573 }, { "epoch": 0.25861680558684386, "grad_norm": 0.6241701794712717, "learning_rate": 1.9643389357877907e-05, "loss": 0.6827, "step": 574 }, { "epoch": 0.25906735751295334, "grad_norm": 0.714283214759487, "learning_rate": 1.9642137926083013e-05, "loss": 0.7033, "step": 575 }, { "epoch": 0.25951790943906283, "grad_norm": 0.6841069797027297, "learning_rate": 1.9640884342349965e-05, "loss": 0.6991, "step": 576 }, { "epoch": 0.2599684613651723, "grad_norm": 0.6950618795503724, "learning_rate": 1.9639628606958535e-05, "loss": 0.6775, "step": 577 }, { "epoch": 0.2604190132912818, "grad_norm": 0.6340097933853994, "learning_rate": 1.963837072018898e-05, "loss": 0.6756, "step": 578 }, { "epoch": 0.2608695652173913, "grad_norm": 0.6293634793160385, "learning_rate": 1.963711068232203e-05, "loss": 0.6904, "step": 579 }, { "epoch": 0.2613201171435008, "grad_norm": 0.5922596777145966, "learning_rate": 1.9635848493638913e-05, "loss": 0.7095, "step": 580 }, { "epoch": 0.2617706690696103, "grad_norm": 0.581592948114822, "learning_rate": 1.9634584154421316e-05, "loss": 0.6782, "step": 581 }, { "epoch": 0.26222122099571976, "grad_norm": 0.5956449789154409, "learning_rate": 1.9633317664951418e-05, "loss": 0.6994, "step": 582 }, { "epoch": 0.26267177292182925, "grad_norm": 0.6329885287665818, "learning_rate": 1.9632049025511875e-05, "loss": 0.7004, "step": 583 }, { "epoch": 0.26312232484793874, "grad_norm": 0.5811769728331713, "learning_rate": 1.963077823638582e-05, "loss": 0.7171, "step": 584 }, { "epoch": 0.2635728767740482, "grad_norm": 0.6459840406211704, "learning_rate": 1.9629505297856872e-05, "loss": 0.7284, "step": 585 }, { "epoch": 0.2640234287001577, "grad_norm": 0.5922634386478515, "learning_rate": 1.9628230210209124e-05, "loss": 0.6759, "step": 586 }, { "epoch": 0.2644739806262672, "grad_norm": 0.6408016767809648, "learning_rate": 1.962695297372715e-05, "loss": 0.6761, "step": 587 }, { "epoch": 0.26492453255237663, "grad_norm": 0.6123356523033406, "learning_rate": 1.962567358869601e-05, "loss": 0.7291, "step": 588 }, { "epoch": 0.2653750844784861, "grad_norm": 0.6187767261107578, "learning_rate": 1.9624392055401233e-05, "loss": 0.7084, "step": 589 }, { "epoch": 0.2658256364045956, "grad_norm": 0.5836323013779734, "learning_rate": 1.962310837412883e-05, "loss": 0.7061, "step": 590 }, { "epoch": 0.2662761883307051, "grad_norm": 0.5988424821861096, "learning_rate": 1.96218225451653e-05, "loss": 0.6771, "step": 591 }, { "epoch": 0.2667267402568146, "grad_norm": 0.5720489982979462, "learning_rate": 1.962053456879761e-05, "loss": 0.6937, "step": 592 }, { "epoch": 0.2671772921829241, "grad_norm": 0.5814972885257707, "learning_rate": 1.9619244445313214e-05, "loss": 0.7025, "step": 593 }, { "epoch": 0.26762784410903356, "grad_norm": 0.628811081972455, "learning_rate": 1.9617952175000042e-05, "loss": 0.6888, "step": 594 }, { "epoch": 0.26807839603514305, "grad_norm": 0.5976031476691468, "learning_rate": 1.9616657758146503e-05, "loss": 0.6953, "step": 595 }, { "epoch": 0.26852894796125254, "grad_norm": 0.6016446691339726, "learning_rate": 1.961536119504149e-05, "loss": 0.6826, "step": 596 }, { "epoch": 0.26897949988736203, "grad_norm": 0.5910289231260587, "learning_rate": 1.9614062485974364e-05, "loss": 0.6893, "step": 597 }, { "epoch": 0.2694300518134715, "grad_norm": 0.602285611669756, "learning_rate": 1.9612761631234976e-05, "loss": 0.6921, "step": 598 }, { "epoch": 0.269880603739581, "grad_norm": 0.6022231439916831, "learning_rate": 1.961145863111365e-05, "loss": 0.7002, "step": 599 }, { "epoch": 0.2703311556656905, "grad_norm": 0.5601486973260071, "learning_rate": 1.961015348590119e-05, "loss": 0.7178, "step": 600 }, { "epoch": 0.2707817075918, "grad_norm": 0.5784550455754301, "learning_rate": 1.9608846195888878e-05, "loss": 0.6892, "step": 601 }, { "epoch": 0.2712322595179094, "grad_norm": 0.575479261820419, "learning_rate": 1.9607536761368484e-05, "loss": 0.6909, "step": 602 }, { "epoch": 0.2716828114440189, "grad_norm": 0.5746348036493247, "learning_rate": 1.9606225182632237e-05, "loss": 0.7148, "step": 603 }, { "epoch": 0.2721333633701284, "grad_norm": 0.5750633025657715, "learning_rate": 1.9604911459972863e-05, "loss": 0.675, "step": 604 }, { "epoch": 0.2725839152962379, "grad_norm": 0.6192862299114515, "learning_rate": 1.9603595593683556e-05, "loss": 0.7433, "step": 605 }, { "epoch": 0.27303446722234737, "grad_norm": 0.5521231582184766, "learning_rate": 1.9602277584057993e-05, "loss": 0.6663, "step": 606 }, { "epoch": 0.27348501914845685, "grad_norm": 0.5850038733825301, "learning_rate": 1.960095743139033e-05, "loss": 0.6558, "step": 607 }, { "epoch": 0.27393557107456634, "grad_norm": 0.5906076098885785, "learning_rate": 1.9599635135975195e-05, "loss": 0.6771, "step": 608 }, { "epoch": 0.27438612300067583, "grad_norm": 0.6622512830444315, "learning_rate": 1.95983106981077e-05, "loss": 0.681, "step": 609 }, { "epoch": 0.2748366749267853, "grad_norm": 0.5766100220615491, "learning_rate": 1.959698411808344e-05, "loss": 0.7001, "step": 610 }, { "epoch": 0.2752872268528948, "grad_norm": 0.6642510059803942, "learning_rate": 1.9595655396198475e-05, "loss": 0.668, "step": 611 }, { "epoch": 0.2757377787790043, "grad_norm": 0.611807459536595, "learning_rate": 1.9594324532749353e-05, "loss": 0.6867, "step": 612 }, { "epoch": 0.2761883307051138, "grad_norm": 0.6631453753428533, "learning_rate": 1.9592991528033098e-05, "loss": 0.707, "step": 613 }, { "epoch": 0.27663888263122327, "grad_norm": 0.5934471071657059, "learning_rate": 1.9591656382347203e-05, "loss": 0.688, "step": 614 }, { "epoch": 0.27708943455733276, "grad_norm": 0.6773393249089824, "learning_rate": 1.959031909598966e-05, "loss": 0.6973, "step": 615 }, { "epoch": 0.2775399864834422, "grad_norm": 0.6631990839471771, "learning_rate": 1.9588979669258913e-05, "loss": 0.6955, "step": 616 }, { "epoch": 0.2779905384095517, "grad_norm": 0.6380592479253014, "learning_rate": 1.95876381024539e-05, "loss": 0.6764, "step": 617 }, { "epoch": 0.27844109033566117, "grad_norm": 0.6161778390141609, "learning_rate": 1.9586294395874037e-05, "loss": 0.6857, "step": 618 }, { "epoch": 0.27889164226177066, "grad_norm": 0.6052002789957969, "learning_rate": 1.958494854981921e-05, "loss": 0.707, "step": 619 }, { "epoch": 0.27934219418788014, "grad_norm": 0.5811193199247179, "learning_rate": 1.9583600564589784e-05, "loss": 0.6484, "step": 620 }, { "epoch": 0.27979274611398963, "grad_norm": 0.5799866807864105, "learning_rate": 1.9582250440486606e-05, "loss": 0.6943, "step": 621 }, { "epoch": 0.2802432980400991, "grad_norm": 0.6424859711737029, "learning_rate": 1.958089817781099e-05, "loss": 0.6831, "step": 622 }, { "epoch": 0.2806938499662086, "grad_norm": 0.5853035319590337, "learning_rate": 1.957954377686475e-05, "loss": 0.7172, "step": 623 }, { "epoch": 0.2811444018923181, "grad_norm": 0.6664582553859778, "learning_rate": 1.957818723795015e-05, "loss": 0.7037, "step": 624 }, { "epoch": 0.2815949538184276, "grad_norm": 0.589167925963962, "learning_rate": 1.9576828561369946e-05, "loss": 0.6852, "step": 625 }, { "epoch": 0.2820455057445371, "grad_norm": 0.7079216891292703, "learning_rate": 1.9575467747427367e-05, "loss": 0.7145, "step": 626 }, { "epoch": 0.28249605767064656, "grad_norm": 0.5663537962374353, "learning_rate": 1.9574104796426124e-05, "loss": 0.7335, "step": 627 }, { "epoch": 0.28294660959675605, "grad_norm": 0.6610115898594018, "learning_rate": 1.9572739708670396e-05, "loss": 0.6912, "step": 628 }, { "epoch": 0.28339716152286554, "grad_norm": 0.6028933114287262, "learning_rate": 1.9571372484464852e-05, "loss": 0.6738, "step": 629 }, { "epoch": 0.28384771344897497, "grad_norm": 0.6593364201007214, "learning_rate": 1.9570003124114622e-05, "loss": 0.6934, "step": 630 }, { "epoch": 0.28429826537508446, "grad_norm": 0.565214736980431, "learning_rate": 1.9568631627925323e-05, "loss": 0.6898, "step": 631 }, { "epoch": 0.28474881730119395, "grad_norm": 0.6304958724144937, "learning_rate": 1.956725799620305e-05, "loss": 0.7125, "step": 632 }, { "epoch": 0.28519936922730343, "grad_norm": 0.5999664750582296, "learning_rate": 1.9565882229254366e-05, "loss": 0.7019, "step": 633 }, { "epoch": 0.2856499211534129, "grad_norm": 0.6203047933752998, "learning_rate": 1.9564504327386318e-05, "loss": 0.6698, "step": 634 }, { "epoch": 0.2861004730795224, "grad_norm": 0.6492501424663256, "learning_rate": 1.9563124290906427e-05, "loss": 0.6845, "step": 635 }, { "epoch": 0.2865510250056319, "grad_norm": 0.6477851588560527, "learning_rate": 1.956174212012269e-05, "loss": 0.6903, "step": 636 }, { "epoch": 0.2870015769317414, "grad_norm": 0.6720504131710313, "learning_rate": 1.9560357815343577e-05, "loss": 0.6588, "step": 637 }, { "epoch": 0.2874521288578509, "grad_norm": 0.6755987351839554, "learning_rate": 1.9558971376878048e-05, "loss": 0.6688, "step": 638 }, { "epoch": 0.28790268078396036, "grad_norm": 0.6310418433244255, "learning_rate": 1.9557582805035517e-05, "loss": 0.7105, "step": 639 }, { "epoch": 0.28835323271006985, "grad_norm": 0.6935255071395937, "learning_rate": 1.9556192100125893e-05, "loss": 0.6814, "step": 640 }, { "epoch": 0.28880378463617934, "grad_norm": 0.6097941347410006, "learning_rate": 1.9554799262459557e-05, "loss": 0.6393, "step": 641 }, { "epoch": 0.2892543365622888, "grad_norm": 0.6933804312678278, "learning_rate": 1.9553404292347356e-05, "loss": 0.654, "step": 642 }, { "epoch": 0.2897048884883983, "grad_norm": 0.718415302298756, "learning_rate": 1.9552007190100625e-05, "loss": 0.7169, "step": 643 }, { "epoch": 0.29015544041450775, "grad_norm": 0.6273705160077429, "learning_rate": 1.955060795603117e-05, "loss": 0.6676, "step": 644 }, { "epoch": 0.29060599234061724, "grad_norm": 0.6665569665161881, "learning_rate": 1.9549206590451274e-05, "loss": 0.7033, "step": 645 }, { "epoch": 0.2910565442667267, "grad_norm": 0.6274580644406725, "learning_rate": 1.954780309367369e-05, "loss": 0.7029, "step": 646 }, { "epoch": 0.2915070961928362, "grad_norm": 0.6735735738327197, "learning_rate": 1.9546397466011654e-05, "loss": 0.6801, "step": 647 }, { "epoch": 0.2919576481189457, "grad_norm": 0.6081229173360764, "learning_rate": 1.9544989707778877e-05, "loss": 0.6839, "step": 648 }, { "epoch": 0.2924082000450552, "grad_norm": 0.5924397759036083, "learning_rate": 1.9543579819289538e-05, "loss": 0.7117, "step": 649 }, { "epoch": 0.2928587519711647, "grad_norm": 0.6055724717409043, "learning_rate": 1.9542167800858302e-05, "loss": 0.6937, "step": 650 }, { "epoch": 0.29330930389727417, "grad_norm": 0.6418085934286423, "learning_rate": 1.95407536528003e-05, "loss": 0.6918, "step": 651 }, { "epoch": 0.29375985582338365, "grad_norm": 0.6033546068292341, "learning_rate": 1.9539337375431144e-05, "loss": 0.6917, "step": 652 }, { "epoch": 0.29421040774949314, "grad_norm": 0.639666643385695, "learning_rate": 1.9537918969066923e-05, "loss": 0.6585, "step": 653 }, { "epoch": 0.29466095967560263, "grad_norm": 0.6082035173303802, "learning_rate": 1.953649843402419e-05, "loss": 0.6824, "step": 654 }, { "epoch": 0.2951115116017121, "grad_norm": 0.6569327316395324, "learning_rate": 1.9535075770619995e-05, "loss": 0.666, "step": 655 }, { "epoch": 0.2955620635278216, "grad_norm": 0.6197235725475753, "learning_rate": 1.953365097917183e-05, "loss": 0.6687, "step": 656 }, { "epoch": 0.29601261545393104, "grad_norm": 0.666466176351875, "learning_rate": 1.9532224059997693e-05, "loss": 0.6707, "step": 657 }, { "epoch": 0.2964631673800405, "grad_norm": 0.675629307142219, "learning_rate": 1.9530795013416046e-05, "loss": 0.6839, "step": 658 }, { "epoch": 0.29691371930615, "grad_norm": 0.5803816282475424, "learning_rate": 1.9529363839745816e-05, "loss": 0.7197, "step": 659 }, { "epoch": 0.2973642712322595, "grad_norm": 0.6423184246432553, "learning_rate": 1.952793053930642e-05, "loss": 0.7132, "step": 660 }, { "epoch": 0.297814823158369, "grad_norm": 0.568780653154261, "learning_rate": 1.952649511241774e-05, "loss": 0.6781, "step": 661 }, { "epoch": 0.2982653750844785, "grad_norm": 0.5959400546985325, "learning_rate": 1.9525057559400134e-05, "loss": 0.6927, "step": 662 }, { "epoch": 0.29871592701058797, "grad_norm": 0.6063909603800159, "learning_rate": 1.9523617880574443e-05, "loss": 0.6764, "step": 663 }, { "epoch": 0.29916647893669746, "grad_norm": 0.6084662624337358, "learning_rate": 1.9522176076261966e-05, "loss": 0.669, "step": 664 }, { "epoch": 0.29961703086280694, "grad_norm": 0.6269923823108713, "learning_rate": 1.9520732146784493e-05, "loss": 0.6719, "step": 665 }, { "epoch": 0.30006758278891643, "grad_norm": 0.6120038590889803, "learning_rate": 1.9519286092464274e-05, "loss": 0.7057, "step": 666 }, { "epoch": 0.3005181347150259, "grad_norm": 0.6292403700014958, "learning_rate": 1.9517837913624048e-05, "loss": 0.666, "step": 667 }, { "epoch": 0.3009686866411354, "grad_norm": 0.6424511281167654, "learning_rate": 1.9516387610587016e-05, "loss": 0.6489, "step": 668 }, { "epoch": 0.3014192385672449, "grad_norm": 0.6075467038232357, "learning_rate": 1.951493518367686e-05, "loss": 0.6646, "step": 669 }, { "epoch": 0.3018697904933544, "grad_norm": 0.6491992981195934, "learning_rate": 1.951348063321773e-05, "loss": 0.7167, "step": 670 }, { "epoch": 0.3023203424194638, "grad_norm": 0.5984442267224125, "learning_rate": 1.9512023959534256e-05, "loss": 0.6909, "step": 671 }, { "epoch": 0.3027708943455733, "grad_norm": 0.6582154259503985, "learning_rate": 1.9510565162951538e-05, "loss": 0.7119, "step": 672 }, { "epoch": 0.3032214462716828, "grad_norm": 0.5731608821167077, "learning_rate": 1.950910424379515e-05, "loss": 0.6628, "step": 673 }, { "epoch": 0.3036719981977923, "grad_norm": 0.6692966133869258, "learning_rate": 1.950764120239115e-05, "loss": 0.6966, "step": 674 }, { "epoch": 0.30412255012390177, "grad_norm": 0.6135240762077352, "learning_rate": 1.950617603906604e-05, "loss": 0.6692, "step": 675 }, { "epoch": 0.30457310205001126, "grad_norm": 0.5869562932142208, "learning_rate": 1.950470875414684e-05, "loss": 0.7042, "step": 676 }, { "epoch": 0.30502365397612075, "grad_norm": 0.6159045011681168, "learning_rate": 1.9503239347961006e-05, "loss": 0.6713, "step": 677 }, { "epoch": 0.30547420590223023, "grad_norm": 0.5974199902268705, "learning_rate": 1.9501767820836485e-05, "loss": 0.6886, "step": 678 }, { "epoch": 0.3059247578283397, "grad_norm": 0.6021073707941194, "learning_rate": 1.9500294173101687e-05, "loss": 0.6597, "step": 679 }, { "epoch": 0.3063753097544492, "grad_norm": 0.6360534208841446, "learning_rate": 1.949881840508551e-05, "loss": 0.6772, "step": 680 }, { "epoch": 0.3068258616805587, "grad_norm": 0.6633669119836239, "learning_rate": 1.9497340517117314e-05, "loss": 0.7037, "step": 681 }, { "epoch": 0.3072764136066682, "grad_norm": 0.5734643324400857, "learning_rate": 1.9495860509526935e-05, "loss": 0.7014, "step": 682 }, { "epoch": 0.3077269655327777, "grad_norm": 0.6302959860447265, "learning_rate": 1.9494378382644675e-05, "loss": 0.6961, "step": 683 }, { "epoch": 0.30817751745888716, "grad_norm": 0.6229374991194246, "learning_rate": 1.9492894136801328e-05, "loss": 0.7016, "step": 684 }, { "epoch": 0.3086280693849966, "grad_norm": 0.6575887845566094, "learning_rate": 1.949140777232814e-05, "loss": 0.7063, "step": 685 }, { "epoch": 0.3090786213111061, "grad_norm": 0.5776278740193711, "learning_rate": 1.9489919289556844e-05, "loss": 0.6765, "step": 686 }, { "epoch": 0.30952917323721557, "grad_norm": 0.6441907040454057, "learning_rate": 1.948842868881964e-05, "loss": 0.7083, "step": 687 }, { "epoch": 0.30997972516332506, "grad_norm": 0.5772588879271462, "learning_rate": 1.9486935970449196e-05, "loss": 0.6916, "step": 688 }, { "epoch": 0.31043027708943455, "grad_norm": 0.6984200721153881, "learning_rate": 1.948544113477866e-05, "loss": 0.6973, "step": 689 }, { "epoch": 0.31088082901554404, "grad_norm": 0.7029334062831438, "learning_rate": 1.948394418214165e-05, "loss": 0.7149, "step": 690 }, { "epoch": 0.3113313809416535, "grad_norm": 0.6200734704215428, "learning_rate": 1.9482445112872265e-05, "loss": 0.692, "step": 691 }, { "epoch": 0.311781932867763, "grad_norm": 0.7223211195168049, "learning_rate": 1.948094392730506e-05, "loss": 0.7131, "step": 692 }, { "epoch": 0.3122324847938725, "grad_norm": 0.6349232658911147, "learning_rate": 1.947944062577507e-05, "loss": 0.7051, "step": 693 }, { "epoch": 0.312683036719982, "grad_norm": 0.6965900882728192, "learning_rate": 1.9477935208617806e-05, "loss": 0.7031, "step": 694 }, { "epoch": 0.3131335886460915, "grad_norm": 0.6075355837501608, "learning_rate": 1.9476427676169244e-05, "loss": 0.6922, "step": 695 }, { "epoch": 0.31358414057220096, "grad_norm": 0.6320439351155522, "learning_rate": 1.947491802876584e-05, "loss": 0.6907, "step": 696 }, { "epoch": 0.31403469249831045, "grad_norm": 0.5864455843982838, "learning_rate": 1.9473406266744518e-05, "loss": 0.6585, "step": 697 }, { "epoch": 0.31448524442441994, "grad_norm": 0.6568592557483223, "learning_rate": 1.947189239044267e-05, "loss": 0.6762, "step": 698 }, { "epoch": 0.3149357963505294, "grad_norm": 0.5999901736465252, "learning_rate": 1.947037640019817e-05, "loss": 0.7042, "step": 699 }, { "epoch": 0.31538634827663886, "grad_norm": 0.5852539323253255, "learning_rate": 1.946885829634935e-05, "loss": 0.6712, "step": 700 }, { "epoch": 0.31583690020274835, "grad_norm": 0.6203912134522908, "learning_rate": 1.9467338079235026e-05, "loss": 0.6857, "step": 701 }, { "epoch": 0.31628745212885784, "grad_norm": 0.5771811571327317, "learning_rate": 1.9465815749194482e-05, "loss": 0.6728, "step": 702 }, { "epoch": 0.3167380040549673, "grad_norm": 0.6340782830616382, "learning_rate": 1.9464291306567473e-05, "loss": 0.7059, "step": 703 }, { "epoch": 0.3171885559810768, "grad_norm": 0.5959484662756691, "learning_rate": 1.946276475169422e-05, "loss": 0.6753, "step": 704 }, { "epoch": 0.3176391079071863, "grad_norm": 0.6017414567692858, "learning_rate": 1.9461236084915423e-05, "loss": 0.7221, "step": 705 }, { "epoch": 0.3180896598332958, "grad_norm": 0.5865730342501112, "learning_rate": 1.9459705306572255e-05, "loss": 0.6772, "step": 706 }, { "epoch": 0.3185402117594053, "grad_norm": 0.6013901200268561, "learning_rate": 1.9458172417006347e-05, "loss": 0.678, "step": 707 }, { "epoch": 0.31899076368551477, "grad_norm": 0.5778880405676274, "learning_rate": 1.945663741655982e-05, "loss": 0.7018, "step": 708 }, { "epoch": 0.31944131561162425, "grad_norm": 0.5891846469725553, "learning_rate": 1.9455100305575246e-05, "loss": 0.682, "step": 709 }, { "epoch": 0.31989186753773374, "grad_norm": 0.59921578788093, "learning_rate": 1.945356108439569e-05, "loss": 0.6932, "step": 710 }, { "epoch": 0.32034241946384323, "grad_norm": 0.6137638919404782, "learning_rate": 1.9452019753364667e-05, "loss": 0.678, "step": 711 }, { "epoch": 0.3207929713899527, "grad_norm": 0.6104235534726493, "learning_rate": 1.9450476312826178e-05, "loss": 0.6651, "step": 712 }, { "epoch": 0.32124352331606215, "grad_norm": 0.626199308321514, "learning_rate": 1.9448930763124686e-05, "loss": 0.6727, "step": 713 }, { "epoch": 0.32169407524217164, "grad_norm": 0.641656144989011, "learning_rate": 1.9447383104605126e-05, "loss": 0.6762, "step": 714 }, { "epoch": 0.32214462716828113, "grad_norm": 0.6343324597764685, "learning_rate": 1.944583333761291e-05, "loss": 0.6903, "step": 715 }, { "epoch": 0.3225951790943906, "grad_norm": 0.5932553235220821, "learning_rate": 1.9444281462493912e-05, "loss": 0.6714, "step": 716 }, { "epoch": 0.3230457310205001, "grad_norm": 0.6110103267378353, "learning_rate": 1.9442727479594486e-05, "loss": 0.6366, "step": 717 }, { "epoch": 0.3234962829466096, "grad_norm": 0.6253367809625204, "learning_rate": 1.944117138926144e-05, "loss": 0.7013, "step": 718 }, { "epoch": 0.3239468348727191, "grad_norm": 0.6227133407590568, "learning_rate": 1.9439613191842075e-05, "loss": 0.7182, "step": 719 }, { "epoch": 0.32439738679882857, "grad_norm": 0.6105436974145987, "learning_rate": 1.9438052887684144e-05, "loss": 0.7077, "step": 720 }, { "epoch": 0.32484793872493806, "grad_norm": 0.5998460315305841, "learning_rate": 1.9436490477135877e-05, "loss": 0.68, "step": 721 }, { "epoch": 0.32529849065104754, "grad_norm": 0.6141353094116941, "learning_rate": 1.9434925960545978e-05, "loss": 0.7037, "step": 722 }, { "epoch": 0.32574904257715703, "grad_norm": 0.592552584176774, "learning_rate": 1.943335933826361e-05, "loss": 0.686, "step": 723 }, { "epoch": 0.3261995945032665, "grad_norm": 0.6385035854095297, "learning_rate": 1.943179061063842e-05, "loss": 0.6885, "step": 724 }, { "epoch": 0.326650146429376, "grad_norm": 0.6107024764689214, "learning_rate": 1.9430219778020512e-05, "loss": 0.7018, "step": 725 }, { "epoch": 0.3271006983554855, "grad_norm": 0.6655020390295063, "learning_rate": 1.9428646840760466e-05, "loss": 0.7109, "step": 726 }, { "epoch": 0.32755125028159493, "grad_norm": 0.590516744005129, "learning_rate": 1.9427071799209335e-05, "loss": 0.6549, "step": 727 }, { "epoch": 0.3280018022077044, "grad_norm": 0.5779581626573479, "learning_rate": 1.9425494653718632e-05, "loss": 0.6788, "step": 728 }, { "epoch": 0.3284523541338139, "grad_norm": 0.6045046341722422, "learning_rate": 1.942391540464035e-05, "loss": 0.701, "step": 729 }, { "epoch": 0.3289029060599234, "grad_norm": 0.5895635173795504, "learning_rate": 1.9422334052326946e-05, "loss": 0.6816, "step": 730 }, { "epoch": 0.3293534579860329, "grad_norm": 0.5803097860724016, "learning_rate": 1.9420750597131345e-05, "loss": 0.6689, "step": 731 }, { "epoch": 0.32980400991214237, "grad_norm": 0.6380961627353241, "learning_rate": 1.941916503940694e-05, "loss": 0.6683, "step": 732 }, { "epoch": 0.33025456183825186, "grad_norm": 0.6057758129554057, "learning_rate": 1.9417577379507612e-05, "loss": 0.6866, "step": 733 }, { "epoch": 0.33070511376436135, "grad_norm": 0.5855047453006875, "learning_rate": 1.9415987617787678e-05, "loss": 0.673, "step": 734 }, { "epoch": 0.33115566569047084, "grad_norm": 0.6572229682888817, "learning_rate": 1.941439575460195e-05, "loss": 0.6935, "step": 735 }, { "epoch": 0.3316062176165803, "grad_norm": 0.6054822541489334, "learning_rate": 1.9412801790305698e-05, "loss": 0.6577, "step": 736 }, { "epoch": 0.3320567695426898, "grad_norm": 0.6365205148504268, "learning_rate": 1.941120572525467e-05, "loss": 0.6867, "step": 737 }, { "epoch": 0.3325073214687993, "grad_norm": 0.6173266849396387, "learning_rate": 1.9409607559805075e-05, "loss": 0.7084, "step": 738 }, { "epoch": 0.3329578733949088, "grad_norm": 0.6180792226082803, "learning_rate": 1.940800729431359e-05, "loss": 0.6844, "step": 739 }, { "epoch": 0.3334084253210182, "grad_norm": 0.5977958902964415, "learning_rate": 1.940640492913736e-05, "loss": 0.6898, "step": 740 }, { "epoch": 0.3338589772471277, "grad_norm": 0.6076275608526684, "learning_rate": 1.9404800464634005e-05, "loss": 0.7108, "step": 741 }, { "epoch": 0.3343095291732372, "grad_norm": 0.5956082398882077, "learning_rate": 1.9403193901161614e-05, "loss": 0.7243, "step": 742 }, { "epoch": 0.3347600810993467, "grad_norm": 0.5988590759278386, "learning_rate": 1.940158523907874e-05, "loss": 0.6989, "step": 743 }, { "epoch": 0.3352106330254562, "grad_norm": 0.6031984757719688, "learning_rate": 1.9399974478744397e-05, "loss": 0.6622, "step": 744 }, { "epoch": 0.33566118495156566, "grad_norm": 0.6705001103165261, "learning_rate": 1.9398361620518086e-05, "loss": 0.7068, "step": 745 }, { "epoch": 0.33611173687767515, "grad_norm": 0.6103292603763242, "learning_rate": 1.9396746664759757e-05, "loss": 0.7128, "step": 746 }, { "epoch": 0.33656228880378464, "grad_norm": 0.560243893822948, "learning_rate": 1.9395129611829844e-05, "loss": 0.6731, "step": 747 }, { "epoch": 0.3370128407298941, "grad_norm": 0.5793128281526245, "learning_rate": 1.9393510462089237e-05, "loss": 0.6798, "step": 748 }, { "epoch": 0.3374633926560036, "grad_norm": 0.5580935178669801, "learning_rate": 1.93918892158993e-05, "loss": 0.6793, "step": 749 }, { "epoch": 0.3379139445821131, "grad_norm": 0.5677286257986721, "learning_rate": 1.9390265873621868e-05, "loss": 0.6669, "step": 750 }, { "epoch": 0.3383644965082226, "grad_norm": 0.5465951540340994, "learning_rate": 1.938864043561923e-05, "loss": 0.6963, "step": 751 }, { "epoch": 0.3388150484343321, "grad_norm": 0.5714375125994006, "learning_rate": 1.9387012902254165e-05, "loss": 0.6828, "step": 752 }, { "epoch": 0.33926560036044157, "grad_norm": 0.5517471552112093, "learning_rate": 1.938538327388989e-05, "loss": 0.6749, "step": 753 }, { "epoch": 0.339716152286551, "grad_norm": 0.5490399561009669, "learning_rate": 1.9383751550890125e-05, "loss": 0.6821, "step": 754 }, { "epoch": 0.3401667042126605, "grad_norm": 0.5561489943897826, "learning_rate": 1.938211773361903e-05, "loss": 0.6833, "step": 755 }, { "epoch": 0.34061725613877, "grad_norm": 0.549704741962116, "learning_rate": 1.9380481822441236e-05, "loss": 0.6682, "step": 756 }, { "epoch": 0.34106780806487946, "grad_norm": 0.5623849152832139, "learning_rate": 1.9378843817721856e-05, "loss": 0.6876, "step": 757 }, { "epoch": 0.34151835999098895, "grad_norm": 0.5495442168258505, "learning_rate": 1.9377203719826454e-05, "loss": 0.7061, "step": 758 }, { "epoch": 0.34196891191709844, "grad_norm": 0.6211026755367937, "learning_rate": 1.9375561529121073e-05, "loss": 0.6846, "step": 759 }, { "epoch": 0.3424194638432079, "grad_norm": 0.5759327082050807, "learning_rate": 1.937391724597222e-05, "loss": 0.704, "step": 760 }, { "epoch": 0.3428700157693174, "grad_norm": 0.5899871222211905, "learning_rate": 1.9372270870746856e-05, "loss": 0.7011, "step": 761 }, { "epoch": 0.3433205676954269, "grad_norm": 0.5779414190958309, "learning_rate": 1.937062240381243e-05, "loss": 0.7048, "step": 762 }, { "epoch": 0.3437711196215364, "grad_norm": 0.5801659215843299, "learning_rate": 1.9368971845536844e-05, "loss": 0.7247, "step": 763 }, { "epoch": 0.3442216715476459, "grad_norm": 0.5917266162388269, "learning_rate": 1.9367319196288476e-05, "loss": 0.7474, "step": 764 }, { "epoch": 0.34467222347375537, "grad_norm": 0.6237360864255513, "learning_rate": 1.936566445643616e-05, "loss": 0.6912, "step": 765 }, { "epoch": 0.34512277539986486, "grad_norm": 0.6003849284747763, "learning_rate": 1.93640076263492e-05, "loss": 0.672, "step": 766 }, { "epoch": 0.34557332732597434, "grad_norm": 0.6068486614260854, "learning_rate": 1.9362348706397374e-05, "loss": 0.7009, "step": 767 }, { "epoch": 0.3460238792520838, "grad_norm": 0.5675440906569967, "learning_rate": 1.9360687696950913e-05, "loss": 0.6905, "step": 768 }, { "epoch": 0.34647443117819327, "grad_norm": 0.5881021808508624, "learning_rate": 1.9359024598380535e-05, "loss": 0.7271, "step": 769 }, { "epoch": 0.34692498310430275, "grad_norm": 0.6046718964695855, "learning_rate": 1.9357359411057398e-05, "loss": 0.6781, "step": 770 }, { "epoch": 0.34737553503041224, "grad_norm": 0.6050733099201897, "learning_rate": 1.9355692135353144e-05, "loss": 0.6726, "step": 771 }, { "epoch": 0.34782608695652173, "grad_norm": 0.5767477666806948, "learning_rate": 1.935402277163988e-05, "loss": 0.6847, "step": 772 }, { "epoch": 0.3482766388826312, "grad_norm": 0.5336953270830425, "learning_rate": 1.935235132029017e-05, "loss": 0.6663, "step": 773 }, { "epoch": 0.3487271908087407, "grad_norm": 0.6204782534961957, "learning_rate": 1.9350677781677055e-05, "loss": 0.7144, "step": 774 }, { "epoch": 0.3491777427348502, "grad_norm": 0.5948794509263697, "learning_rate": 1.9349002156174037e-05, "loss": 0.6816, "step": 775 }, { "epoch": 0.3496282946609597, "grad_norm": 0.6089574895892252, "learning_rate": 1.9347324444155074e-05, "loss": 0.6931, "step": 776 }, { "epoch": 0.35007884658706917, "grad_norm": 0.6133708978239014, "learning_rate": 1.934564464599461e-05, "loss": 0.6696, "step": 777 }, { "epoch": 0.35052939851317866, "grad_norm": 0.6141300977392192, "learning_rate": 1.9343962762067536e-05, "loss": 0.6595, "step": 778 }, { "epoch": 0.35097995043928815, "grad_norm": 0.5690625039478855, "learning_rate": 1.9342278792749217e-05, "loss": 0.7053, "step": 779 }, { "epoch": 0.35143050236539763, "grad_norm": 0.5970932790928726, "learning_rate": 1.9340592738415487e-05, "loss": 0.6466, "step": 780 }, { "epoch": 0.3518810542915071, "grad_norm": 0.5715307751876099, "learning_rate": 1.933890459944263e-05, "loss": 0.7042, "step": 781 }, { "epoch": 0.35233160621761656, "grad_norm": 0.6111258850635181, "learning_rate": 1.9337214376207417e-05, "loss": 0.6666, "step": 782 }, { "epoch": 0.35278215814372604, "grad_norm": 0.5599217065521995, "learning_rate": 1.9335522069087072e-05, "loss": 0.6963, "step": 783 }, { "epoch": 0.35323271006983553, "grad_norm": 0.6467637783888632, "learning_rate": 1.9333827678459277e-05, "loss": 0.6871, "step": 784 }, { "epoch": 0.353683261995945, "grad_norm": 0.5966006425157029, "learning_rate": 1.9332131204702197e-05, "loss": 0.6959, "step": 785 }, { "epoch": 0.3541338139220545, "grad_norm": 0.5738579097581328, "learning_rate": 1.9330432648194444e-05, "loss": 0.6957, "step": 786 }, { "epoch": 0.354584365848164, "grad_norm": 0.6054583383668011, "learning_rate": 1.9328732009315107e-05, "loss": 0.6884, "step": 787 }, { "epoch": 0.3550349177742735, "grad_norm": 0.5841120841980693, "learning_rate": 1.9327029288443734e-05, "loss": 0.6793, "step": 788 }, { "epoch": 0.35548546970038297, "grad_norm": 0.6180850167221837, "learning_rate": 1.932532448596034e-05, "loss": 0.6895, "step": 789 }, { "epoch": 0.35593602162649246, "grad_norm": 0.62821135941419, "learning_rate": 1.93236176022454e-05, "loss": 0.6551, "step": 790 }, { "epoch": 0.35638657355260195, "grad_norm": 0.6163514778515377, "learning_rate": 1.9321908637679868e-05, "loss": 0.6638, "step": 791 }, { "epoch": 0.35683712547871144, "grad_norm": 0.5983715097137955, "learning_rate": 1.932019759264514e-05, "loss": 0.6778, "step": 792 }, { "epoch": 0.3572876774048209, "grad_norm": 0.5812862507900827, "learning_rate": 1.9318484467523096e-05, "loss": 0.6576, "step": 793 }, { "epoch": 0.3577382293309304, "grad_norm": 0.6061773510570667, "learning_rate": 1.9316769262696067e-05, "loss": 0.7221, "step": 794 }, { "epoch": 0.3581887812570399, "grad_norm": 0.613785391636501, "learning_rate": 1.9315051978546857e-05, "loss": 0.6733, "step": 795 }, { "epoch": 0.35863933318314933, "grad_norm": 0.5983640868936853, "learning_rate": 1.9313332615458725e-05, "loss": 0.6764, "step": 796 }, { "epoch": 0.3590898851092588, "grad_norm": 0.61342257372679, "learning_rate": 1.931161117381541e-05, "loss": 0.6899, "step": 797 }, { "epoch": 0.3595404370353683, "grad_norm": 0.5832131745584712, "learning_rate": 1.9309887654001095e-05, "loss": 0.6765, "step": 798 }, { "epoch": 0.3599909889614778, "grad_norm": 0.5729424401317761, "learning_rate": 1.930816205640044e-05, "loss": 0.7035, "step": 799 }, { "epoch": 0.3604415408875873, "grad_norm": 0.668035812655157, "learning_rate": 1.9306434381398565e-05, "loss": 0.7177, "step": 800 }, { "epoch": 0.3608920928136968, "grad_norm": 0.5799481777199096, "learning_rate": 1.930470462938105e-05, "loss": 0.688, "step": 801 }, { "epoch": 0.36134264473980626, "grad_norm": 0.6251549908418697, "learning_rate": 1.9302972800733945e-05, "loss": 0.633, "step": 802 }, { "epoch": 0.36179319666591575, "grad_norm": 0.6968514197384968, "learning_rate": 1.930123889584376e-05, "loss": 0.6932, "step": 803 }, { "epoch": 0.36224374859202524, "grad_norm": 0.5702919616969513, "learning_rate": 1.9299502915097475e-05, "loss": 0.6707, "step": 804 }, { "epoch": 0.3626943005181347, "grad_norm": 0.6798256264583517, "learning_rate": 1.9297764858882516e-05, "loss": 0.687, "step": 805 }, { "epoch": 0.3631448524442442, "grad_norm": 0.6007364075173034, "learning_rate": 1.9296024727586792e-05, "loss": 0.6804, "step": 806 }, { "epoch": 0.3635954043703537, "grad_norm": 0.6617185103094994, "learning_rate": 1.929428252159866e-05, "loss": 0.6993, "step": 807 }, { "epoch": 0.3640459562964632, "grad_norm": 0.5933590454675143, "learning_rate": 1.929253824130695e-05, "loss": 0.6804, "step": 808 }, { "epoch": 0.3644965082225726, "grad_norm": 0.5999373323685662, "learning_rate": 1.9290791887100954e-05, "loss": 0.6812, "step": 809 }, { "epoch": 0.3649470601486821, "grad_norm": 0.6231813156943945, "learning_rate": 1.928904345937042e-05, "loss": 0.688, "step": 810 }, { "epoch": 0.3653976120747916, "grad_norm": 0.6337761277313362, "learning_rate": 1.928729295850557e-05, "loss": 0.7235, "step": 811 }, { "epoch": 0.3658481640009011, "grad_norm": 0.5730627166865875, "learning_rate": 1.9285540384897073e-05, "loss": 0.7042, "step": 812 }, { "epoch": 0.3662987159270106, "grad_norm": 0.5749999053372887, "learning_rate": 1.9283785738936075e-05, "loss": 0.6853, "step": 813 }, { "epoch": 0.36674926785312006, "grad_norm": 0.6064725663171256, "learning_rate": 1.9282029021014175e-05, "loss": 0.7134, "step": 814 }, { "epoch": 0.36719981977922955, "grad_norm": 0.5897493201912227, "learning_rate": 1.9280270231523443e-05, "loss": 0.6985, "step": 815 }, { "epoch": 0.36765037170533904, "grad_norm": 0.6051081678575063, "learning_rate": 1.92785093708564e-05, "loss": 0.699, "step": 816 }, { "epoch": 0.36810092363144853, "grad_norm": 0.6080358519599344, "learning_rate": 1.9276746439406046e-05, "loss": 0.6979, "step": 817 }, { "epoch": 0.368551475557558, "grad_norm": 0.5618933220176939, "learning_rate": 1.927498143756583e-05, "loss": 0.6597, "step": 818 }, { "epoch": 0.3690020274836675, "grad_norm": 0.6166682831429139, "learning_rate": 1.9273214365729655e-05, "loss": 0.685, "step": 819 }, { "epoch": 0.369452579409777, "grad_norm": 0.5993392413823799, "learning_rate": 1.9271445224291908e-05, "loss": 0.6952, "step": 820 }, { "epoch": 0.3699031313358865, "grad_norm": 0.5468395694157784, "learning_rate": 1.9269674013647427e-05, "loss": 0.6611, "step": 821 }, { "epoch": 0.37035368326199597, "grad_norm": 0.6545418361225267, "learning_rate": 1.9267900734191515e-05, "loss": 0.6821, "step": 822 }, { "epoch": 0.3708042351881054, "grad_norm": 0.6108671230676718, "learning_rate": 1.926612538631992e-05, "loss": 0.7393, "step": 823 }, { "epoch": 0.3712547871142149, "grad_norm": 0.6343269613064096, "learning_rate": 1.9264347970428876e-05, "loss": 0.6873, "step": 824 }, { "epoch": 0.3717053390403244, "grad_norm": 0.5913712767885427, "learning_rate": 1.9262568486915067e-05, "loss": 0.6581, "step": 825 }, { "epoch": 0.37215589096643387, "grad_norm": 0.6402253718597349, "learning_rate": 1.9260786936175635e-05, "loss": 0.7156, "step": 826 }, { "epoch": 0.37260644289254335, "grad_norm": 0.57933585513703, "learning_rate": 1.9259003318608192e-05, "loss": 0.6709, "step": 827 }, { "epoch": 0.37305699481865284, "grad_norm": 0.5894262212463814, "learning_rate": 1.9257217634610805e-05, "loss": 0.6658, "step": 828 }, { "epoch": 0.37350754674476233, "grad_norm": 0.6848386076678163, "learning_rate": 1.9255429884582e-05, "loss": 0.6816, "step": 829 }, { "epoch": 0.3739580986708718, "grad_norm": 0.5700479639240614, "learning_rate": 1.9253640068920778e-05, "loss": 0.6451, "step": 830 }, { "epoch": 0.3744086505969813, "grad_norm": 0.633378991057691, "learning_rate": 1.925184818802658e-05, "loss": 0.6849, "step": 831 }, { "epoch": 0.3748592025230908, "grad_norm": 0.5673044124976636, "learning_rate": 1.925005424229933e-05, "loss": 0.689, "step": 832 }, { "epoch": 0.3753097544492003, "grad_norm": 0.5960688280695011, "learning_rate": 1.924825823213939e-05, "loss": 0.6758, "step": 833 }, { "epoch": 0.37576030637530977, "grad_norm": 0.6306539332097822, "learning_rate": 1.9246460157947603e-05, "loss": 0.6795, "step": 834 }, { "epoch": 0.37621085830141926, "grad_norm": 0.6343423759811232, "learning_rate": 1.9244660020125262e-05, "loss": 0.6782, "step": 835 }, { "epoch": 0.37666141022752875, "grad_norm": 0.6018046246361614, "learning_rate": 1.9242857819074125e-05, "loss": 0.7057, "step": 836 }, { "epoch": 0.3771119621536382, "grad_norm": 0.6111871400462829, "learning_rate": 1.9241053555196405e-05, "loss": 0.6999, "step": 837 }, { "epoch": 0.37756251407974767, "grad_norm": 0.670783807008322, "learning_rate": 1.9239247228894776e-05, "loss": 0.6507, "step": 838 }, { "epoch": 0.37801306600585716, "grad_norm": 0.5806003126492272, "learning_rate": 1.9237438840572383e-05, "loss": 0.7211, "step": 839 }, { "epoch": 0.37846361793196664, "grad_norm": 0.6990367863320271, "learning_rate": 1.923562839063282e-05, "loss": 0.6837, "step": 840 }, { "epoch": 0.37891416985807613, "grad_norm": 0.5859212744690212, "learning_rate": 1.9233815879480143e-05, "loss": 0.6896, "step": 841 }, { "epoch": 0.3793647217841856, "grad_norm": 0.6973961321787354, "learning_rate": 1.923200130751887e-05, "loss": 0.7033, "step": 842 }, { "epoch": 0.3798152737102951, "grad_norm": 0.5657369627930682, "learning_rate": 1.9230184675153974e-05, "loss": 0.6772, "step": 843 }, { "epoch": 0.3802658256364046, "grad_norm": 0.6909814899091572, "learning_rate": 1.9228365982790897e-05, "loss": 0.6883, "step": 844 }, { "epoch": 0.3807163775625141, "grad_norm": 0.5896597096008475, "learning_rate": 1.922654523083554e-05, "loss": 0.6764, "step": 845 }, { "epoch": 0.3811669294886236, "grad_norm": 0.7008434595082764, "learning_rate": 1.922472241969425e-05, "loss": 0.6807, "step": 846 }, { "epoch": 0.38161748141473306, "grad_norm": 0.5878043226809961, "learning_rate": 1.922289754977385e-05, "loss": 0.6839, "step": 847 }, { "epoch": 0.38206803334084255, "grad_norm": 0.7295331665543926, "learning_rate": 1.922107062148161e-05, "loss": 0.6788, "step": 848 }, { "epoch": 0.38251858526695204, "grad_norm": 0.6414407788967807, "learning_rate": 1.921924163522527e-05, "loss": 0.6596, "step": 849 }, { "epoch": 0.3829691371930615, "grad_norm": 0.7037998147576329, "learning_rate": 1.921741059141302e-05, "loss": 0.6826, "step": 850 }, { "epoch": 0.38341968911917096, "grad_norm": 0.7289385310944827, "learning_rate": 1.921557749045352e-05, "loss": 0.6855, "step": 851 }, { "epoch": 0.38387024104528045, "grad_norm": 0.6445170090297831, "learning_rate": 1.9213742332755877e-05, "loss": 0.6897, "step": 852 }, { "epoch": 0.38432079297138994, "grad_norm": 0.7192081272811728, "learning_rate": 1.9211905118729667e-05, "loss": 0.6795, "step": 853 }, { "epoch": 0.3847713448974994, "grad_norm": 0.6014182965915482, "learning_rate": 1.9210065848784915e-05, "loss": 0.6978, "step": 854 }, { "epoch": 0.3852218968236089, "grad_norm": 0.7252290661984584, "learning_rate": 1.920822452333211e-05, "loss": 0.7078, "step": 855 }, { "epoch": 0.3856724487497184, "grad_norm": 0.5456451212991532, "learning_rate": 1.9206381142782204e-05, "loss": 0.686, "step": 856 }, { "epoch": 0.3861230006758279, "grad_norm": 0.7041597731841426, "learning_rate": 1.9204535707546602e-05, "loss": 0.6785, "step": 857 }, { "epoch": 0.3865735526019374, "grad_norm": 0.5717845706977636, "learning_rate": 1.9202688218037172e-05, "loss": 0.686, "step": 858 }, { "epoch": 0.38702410452804686, "grad_norm": 0.6611434049511183, "learning_rate": 1.920083867466624e-05, "loss": 0.6757, "step": 859 }, { "epoch": 0.38747465645415635, "grad_norm": 0.582769511054454, "learning_rate": 1.919898707784658e-05, "loss": 0.6607, "step": 860 }, { "epoch": 0.38792520838026584, "grad_norm": 0.7304371072448724, "learning_rate": 1.9197133427991437e-05, "loss": 0.6915, "step": 861 }, { "epoch": 0.38837576030637533, "grad_norm": 0.5808863712286915, "learning_rate": 1.919527772551451e-05, "loss": 0.7072, "step": 862 }, { "epoch": 0.3888263122324848, "grad_norm": 0.6093086071731589, "learning_rate": 1.919341997082995e-05, "loss": 0.6882, "step": 863 }, { "epoch": 0.3892768641585943, "grad_norm": 0.6133439716815157, "learning_rate": 1.9191560164352383e-05, "loss": 0.6695, "step": 864 }, { "epoch": 0.38972741608470374, "grad_norm": 0.5952968109793947, "learning_rate": 1.9189698306496874e-05, "loss": 0.648, "step": 865 }, { "epoch": 0.3901779680108132, "grad_norm": 0.6238331648945428, "learning_rate": 1.9187834397678956e-05, "loss": 0.6743, "step": 866 }, { "epoch": 0.3906285199369227, "grad_norm": 0.680824488722222, "learning_rate": 1.918596843831462e-05, "loss": 0.7103, "step": 867 }, { "epoch": 0.3910790718630322, "grad_norm": 0.5921411428450293, "learning_rate": 1.9184100428820304e-05, "loss": 0.7015, "step": 868 }, { "epoch": 0.3915296237891417, "grad_norm": 0.6683347585375006, "learning_rate": 1.9182230369612912e-05, "loss": 0.7104, "step": 869 }, { "epoch": 0.3919801757152512, "grad_norm": 0.628579405082135, "learning_rate": 1.9180358261109817e-05, "loss": 0.6611, "step": 870 }, { "epoch": 0.39243072764136067, "grad_norm": 0.6305136979376809, "learning_rate": 1.9178484103728826e-05, "loss": 0.6763, "step": 871 }, { "epoch": 0.39288127956747015, "grad_norm": 0.6881716559214911, "learning_rate": 1.9176607897888217e-05, "loss": 0.6989, "step": 872 }, { "epoch": 0.39333183149357964, "grad_norm": 0.6027154344443812, "learning_rate": 1.9174729644006723e-05, "loss": 0.6868, "step": 873 }, { "epoch": 0.39378238341968913, "grad_norm": 0.667744899184287, "learning_rate": 1.9172849342503537e-05, "loss": 0.7159, "step": 874 }, { "epoch": 0.3942329353457986, "grad_norm": 0.6271861983932607, "learning_rate": 1.9170966993798302e-05, "loss": 0.7048, "step": 875 }, { "epoch": 0.3946834872719081, "grad_norm": 0.6669836326354465, "learning_rate": 1.916908259831112e-05, "loss": 0.6598, "step": 876 }, { "epoch": 0.3951340391980176, "grad_norm": 0.5802518507574159, "learning_rate": 1.916719615646256e-05, "loss": 0.6879, "step": 877 }, { "epoch": 0.3955845911241271, "grad_norm": 0.6728372802560119, "learning_rate": 1.916530766867363e-05, "loss": 0.6479, "step": 878 }, { "epoch": 0.3960351430502365, "grad_norm": 0.5924426900269235, "learning_rate": 1.916341713536581e-05, "loss": 0.6643, "step": 879 }, { "epoch": 0.396485694976346, "grad_norm": 0.6665955276265294, "learning_rate": 1.9161524556961027e-05, "loss": 0.6808, "step": 880 }, { "epoch": 0.3969362469024555, "grad_norm": 0.639189090974186, "learning_rate": 1.9159629933881666e-05, "loss": 0.6872, "step": 881 }, { "epoch": 0.397386798828565, "grad_norm": 0.6440228062112381, "learning_rate": 1.9157733266550577e-05, "loss": 0.7008, "step": 882 }, { "epoch": 0.39783735075467447, "grad_norm": 0.6401250758749434, "learning_rate": 1.915583455539105e-05, "loss": 0.6639, "step": 883 }, { "epoch": 0.39828790268078396, "grad_norm": 0.6258612695652692, "learning_rate": 1.9153933800826853e-05, "loss": 0.6875, "step": 884 }, { "epoch": 0.39873845460689344, "grad_norm": 0.7145243854780674, "learning_rate": 1.9152031003282182e-05, "loss": 0.6769, "step": 885 }, { "epoch": 0.39918900653300293, "grad_norm": 0.6349490302705048, "learning_rate": 1.915012616318172e-05, "loss": 0.7235, "step": 886 }, { "epoch": 0.3996395584591124, "grad_norm": 0.7495531972118368, "learning_rate": 1.914821928095058e-05, "loss": 0.6991, "step": 887 }, { "epoch": 0.4000901103852219, "grad_norm": 0.5997295883755807, "learning_rate": 1.9146310357014346e-05, "loss": 0.6854, "step": 888 }, { "epoch": 0.4005406623113314, "grad_norm": 0.7158145449548623, "learning_rate": 1.9144399391799043e-05, "loss": 0.6833, "step": 889 }, { "epoch": 0.4009912142374409, "grad_norm": 0.5990300820179945, "learning_rate": 1.9142486385731178e-05, "loss": 0.7032, "step": 890 }, { "epoch": 0.4014417661635504, "grad_norm": 0.694789372202561, "learning_rate": 1.9140571339237686e-05, "loss": 0.7056, "step": 891 }, { "epoch": 0.4018923180896598, "grad_norm": 0.5809574301789472, "learning_rate": 1.913865425274597e-05, "loss": 0.6421, "step": 892 }, { "epoch": 0.4023428700157693, "grad_norm": 0.6707206522787185, "learning_rate": 1.9136735126683886e-05, "loss": 0.6878, "step": 893 }, { "epoch": 0.4027934219418788, "grad_norm": 0.6770993938111531, "learning_rate": 1.913481396147975e-05, "loss": 0.6255, "step": 894 }, { "epoch": 0.40324397386798827, "grad_norm": 0.5692588294703214, "learning_rate": 1.9132890757562324e-05, "loss": 0.6897, "step": 895 }, { "epoch": 0.40369452579409776, "grad_norm": 0.6277328344614984, "learning_rate": 1.913096551536083e-05, "loss": 0.6693, "step": 896 }, { "epoch": 0.40414507772020725, "grad_norm": 0.5986979656175565, "learning_rate": 1.9129038235304946e-05, "loss": 0.688, "step": 897 }, { "epoch": 0.40459562964631673, "grad_norm": 0.6202123171644051, "learning_rate": 1.9127108917824807e-05, "loss": 0.6656, "step": 898 }, { "epoch": 0.4050461815724262, "grad_norm": 0.5795786200567453, "learning_rate": 1.912517756335099e-05, "loss": 0.6869, "step": 899 }, { "epoch": 0.4054967334985357, "grad_norm": 0.5534266638824137, "learning_rate": 1.9123244172314546e-05, "loss": 0.7265, "step": 900 }, { "epoch": 0.4059472854246452, "grad_norm": 0.5967418200084217, "learning_rate": 1.9121308745146964e-05, "loss": 0.6747, "step": 901 }, { "epoch": 0.4063978373507547, "grad_norm": 0.5528289468366361, "learning_rate": 1.9119371282280197e-05, "loss": 0.6906, "step": 902 }, { "epoch": 0.4068483892768642, "grad_norm": 0.6194152930462627, "learning_rate": 1.911743178414665e-05, "loss": 0.7005, "step": 903 }, { "epoch": 0.40729894120297366, "grad_norm": 0.5949607235759795, "learning_rate": 1.9115490251179173e-05, "loss": 0.7263, "step": 904 }, { "epoch": 0.40774949312908315, "grad_norm": 0.5492609790190639, "learning_rate": 1.911354668381109e-05, "loss": 0.7234, "step": 905 }, { "epoch": 0.4082000450551926, "grad_norm": 0.5733312286559493, "learning_rate": 1.9111601082476162e-05, "loss": 0.6929, "step": 906 }, { "epoch": 0.40865059698130207, "grad_norm": 0.5412857707931608, "learning_rate": 1.9109653447608607e-05, "loss": 0.6631, "step": 907 }, { "epoch": 0.40910114890741156, "grad_norm": 0.6330430868236705, "learning_rate": 1.9107703779643106e-05, "loss": 0.685, "step": 908 }, { "epoch": 0.40955170083352105, "grad_norm": 0.6292826797508955, "learning_rate": 1.9105752079014782e-05, "loss": 0.73, "step": 909 }, { "epoch": 0.41000225275963054, "grad_norm": 0.5827246181437622, "learning_rate": 1.9103798346159214e-05, "loss": 0.6911, "step": 910 }, { "epoch": 0.41045280468574, "grad_norm": 0.5929836077238922, "learning_rate": 1.9101842581512447e-05, "loss": 0.6723, "step": 911 }, { "epoch": 0.4109033566118495, "grad_norm": 0.5535695626710536, "learning_rate": 1.909988478551096e-05, "loss": 0.6772, "step": 912 }, { "epoch": 0.411353908537959, "grad_norm": 0.6283279455727008, "learning_rate": 1.90979249585917e-05, "loss": 0.7109, "step": 913 }, { "epoch": 0.4118044604640685, "grad_norm": 0.5447466307723922, "learning_rate": 1.9095963101192062e-05, "loss": 0.6685, "step": 914 }, { "epoch": 0.412255012390178, "grad_norm": 0.630153990570523, "learning_rate": 1.9093999213749894e-05, "loss": 0.6769, "step": 915 }, { "epoch": 0.41270556431628747, "grad_norm": 0.5913866089292139, "learning_rate": 1.9092033296703495e-05, "loss": 0.709, "step": 916 }, { "epoch": 0.41315611624239695, "grad_norm": 0.6700697720296006, "learning_rate": 1.909006535049163e-05, "loss": 0.6998, "step": 917 }, { "epoch": 0.41360666816850644, "grad_norm": 0.5456573349242501, "learning_rate": 1.908809537555349e-05, "loss": 0.6717, "step": 918 }, { "epoch": 0.41405722009461593, "grad_norm": 0.6050723800098474, "learning_rate": 1.9086123372328748e-05, "loss": 0.6608, "step": 919 }, { "epoch": 0.41450777202072536, "grad_norm": 0.5744402094538187, "learning_rate": 1.9084149341257506e-05, "loss": 0.6841, "step": 920 }, { "epoch": 0.41495832394683485, "grad_norm": 0.6219196026193153, "learning_rate": 1.9082173282780345e-05, "loss": 0.7079, "step": 921 }, { "epoch": 0.41540887587294434, "grad_norm": 0.6063513054187447, "learning_rate": 1.908019519733827e-05, "loss": 0.6967, "step": 922 }, { "epoch": 0.4158594277990538, "grad_norm": 0.5387652017441116, "learning_rate": 1.9078215085372753e-05, "loss": 0.6504, "step": 923 }, { "epoch": 0.4163099797251633, "grad_norm": 0.5955687835632361, "learning_rate": 1.9076232947325724e-05, "loss": 0.6431, "step": 924 }, { "epoch": 0.4167605316512728, "grad_norm": 0.5607160230946928, "learning_rate": 1.9074248783639547e-05, "loss": 0.7013, "step": 925 }, { "epoch": 0.4172110835773823, "grad_norm": 0.604245163318599, "learning_rate": 1.907226259475706e-05, "loss": 0.6699, "step": 926 }, { "epoch": 0.4176616355034918, "grad_norm": 0.5485023348183301, "learning_rate": 1.907027438112153e-05, "loss": 0.6414, "step": 927 }, { "epoch": 0.41811218742960127, "grad_norm": 0.6011276356562998, "learning_rate": 1.9068284143176698e-05, "loss": 0.691, "step": 928 }, { "epoch": 0.41856273935571076, "grad_norm": 0.575336933248017, "learning_rate": 1.906629188136674e-05, "loss": 0.6892, "step": 929 }, { "epoch": 0.41901329128182024, "grad_norm": 0.5975531283089295, "learning_rate": 1.9064297596136298e-05, "loss": 0.6958, "step": 930 }, { "epoch": 0.41946384320792973, "grad_norm": 0.5647018811803469, "learning_rate": 1.9062301287930448e-05, "loss": 0.6355, "step": 931 }, { "epoch": 0.4199143951340392, "grad_norm": 0.5628429377848978, "learning_rate": 1.9060302957194732e-05, "loss": 0.6678, "step": 932 }, { "epoch": 0.4203649470601487, "grad_norm": 0.5414286556693534, "learning_rate": 1.9058302604375137e-05, "loss": 0.6793, "step": 933 }, { "epoch": 0.42081549898625814, "grad_norm": 0.5635556909888324, "learning_rate": 1.9056300229918107e-05, "loss": 0.6921, "step": 934 }, { "epoch": 0.42126605091236763, "grad_norm": 0.5528989703788371, "learning_rate": 1.905429583427053e-05, "loss": 0.6413, "step": 935 }, { "epoch": 0.4217166028384771, "grad_norm": 0.5734147320113822, "learning_rate": 1.905228941787975e-05, "loss": 0.6771, "step": 936 }, { "epoch": 0.4221671547645866, "grad_norm": 0.5433952910491283, "learning_rate": 1.9050280981193555e-05, "loss": 0.6637, "step": 937 }, { "epoch": 0.4226177066906961, "grad_norm": 0.5969702199316773, "learning_rate": 1.9048270524660197e-05, "loss": 0.708, "step": 938 }, { "epoch": 0.4230682586168056, "grad_norm": 0.5611828080315551, "learning_rate": 1.9046258048728365e-05, "loss": 0.6961, "step": 939 }, { "epoch": 0.42351881054291507, "grad_norm": 0.5553347747614569, "learning_rate": 1.9044243553847205e-05, "loss": 0.6636, "step": 940 }, { "epoch": 0.42396936246902456, "grad_norm": 0.5754741475714824, "learning_rate": 1.9042227040466317e-05, "loss": 0.6672, "step": 941 }, { "epoch": 0.42441991439513405, "grad_norm": 0.5419027689097546, "learning_rate": 1.9040208509035745e-05, "loss": 0.6721, "step": 942 }, { "epoch": 0.42487046632124353, "grad_norm": 0.5806020644035986, "learning_rate": 1.9038187960005988e-05, "loss": 0.6843, "step": 943 }, { "epoch": 0.425321018247353, "grad_norm": 0.5709001876356017, "learning_rate": 1.903616539382799e-05, "loss": 0.6882, "step": 944 }, { "epoch": 0.4257715701734625, "grad_norm": 0.5897995413216948, "learning_rate": 1.903414081095315e-05, "loss": 0.7017, "step": 945 }, { "epoch": 0.426222122099572, "grad_norm": 0.6150599650117469, "learning_rate": 1.9032114211833316e-05, "loss": 0.6601, "step": 946 }, { "epoch": 0.4266726740256815, "grad_norm": 0.5709291270472104, "learning_rate": 1.9030085596920786e-05, "loss": 0.6816, "step": 947 }, { "epoch": 0.4271232259517909, "grad_norm": 0.6105547459795462, "learning_rate": 1.9028054966668307e-05, "loss": 0.7041, "step": 948 }, { "epoch": 0.4275737778779004, "grad_norm": 0.5696698762594028, "learning_rate": 1.9026022321529076e-05, "loss": 0.6737, "step": 949 }, { "epoch": 0.4280243298040099, "grad_norm": 0.5939781810527424, "learning_rate": 1.902398766195674e-05, "loss": 0.68, "step": 950 }, { "epoch": 0.4284748817301194, "grad_norm": 0.5913988930092283, "learning_rate": 1.9021950988405397e-05, "loss": 0.7084, "step": 951 }, { "epoch": 0.42892543365622887, "grad_norm": 0.5738208182272381, "learning_rate": 1.9019912301329593e-05, "loss": 0.6659, "step": 952 }, { "epoch": 0.42937598558233836, "grad_norm": 0.5887759937153484, "learning_rate": 1.9017871601184318e-05, "loss": 0.6739, "step": 953 }, { "epoch": 0.42982653750844785, "grad_norm": 0.5677987748604064, "learning_rate": 1.9015828888425023e-05, "loss": 0.6262, "step": 954 }, { "epoch": 0.43027708943455734, "grad_norm": 0.5938336191568137, "learning_rate": 1.90137841635076e-05, "loss": 0.6808, "step": 955 }, { "epoch": 0.4307276413606668, "grad_norm": 0.5428176906596988, "learning_rate": 1.9011737426888394e-05, "loss": 0.671, "step": 956 }, { "epoch": 0.4311781932867763, "grad_norm": 0.567029578164292, "learning_rate": 1.900968867902419e-05, "loss": 0.6783, "step": 957 }, { "epoch": 0.4316287452128858, "grad_norm": 0.5804534244837253, "learning_rate": 1.900763792037224e-05, "loss": 0.6892, "step": 958 }, { "epoch": 0.4320792971389953, "grad_norm": 0.5770725042593036, "learning_rate": 1.9005585151390224e-05, "loss": 0.6739, "step": 959 }, { "epoch": 0.4325298490651048, "grad_norm": 0.5852265189195079, "learning_rate": 1.9003530372536282e-05, "loss": 0.6797, "step": 960 }, { "epoch": 0.43298040099121426, "grad_norm": 0.5823134122641497, "learning_rate": 1.9001473584269002e-05, "loss": 0.6896, "step": 961 }, { "epoch": 0.4334309529173237, "grad_norm": 0.5701282977303878, "learning_rate": 1.899941478704742e-05, "loss": 0.708, "step": 962 }, { "epoch": 0.4338815048434332, "grad_norm": 0.6327461342264767, "learning_rate": 1.899735398133102e-05, "loss": 0.7043, "step": 963 }, { "epoch": 0.4343320567695427, "grad_norm": 0.6021635264192113, "learning_rate": 1.899529116757973e-05, "loss": 0.6918, "step": 964 }, { "epoch": 0.43478260869565216, "grad_norm": 0.6099668880379563, "learning_rate": 1.8993226346253934e-05, "loss": 0.7011, "step": 965 }, { "epoch": 0.43523316062176165, "grad_norm": 0.5849344986643065, "learning_rate": 1.8991159517814463e-05, "loss": 0.6912, "step": 966 }, { "epoch": 0.43568371254787114, "grad_norm": 0.6189982487699047, "learning_rate": 1.8989090682722583e-05, "loss": 0.683, "step": 967 }, { "epoch": 0.4361342644739806, "grad_norm": 0.5646631615713938, "learning_rate": 1.8987019841440028e-05, "loss": 0.6864, "step": 968 }, { "epoch": 0.4365848164000901, "grad_norm": 0.6122152520546557, "learning_rate": 1.898494699442896e-05, "loss": 0.7267, "step": 969 }, { "epoch": 0.4370353683261996, "grad_norm": 0.5624338622393229, "learning_rate": 1.8982872142152008e-05, "loss": 0.6681, "step": 970 }, { "epoch": 0.4374859202523091, "grad_norm": 0.6209816210819402, "learning_rate": 1.8980795285072234e-05, "loss": 0.6754, "step": 971 }, { "epoch": 0.4379364721784186, "grad_norm": 0.550324787799884, "learning_rate": 1.8978716423653153e-05, "loss": 0.6603, "step": 972 }, { "epoch": 0.43838702410452807, "grad_norm": 0.6322309349483505, "learning_rate": 1.897663555835872e-05, "loss": 0.6816, "step": 973 }, { "epoch": 0.43883757603063756, "grad_norm": 0.5750189204672066, "learning_rate": 1.897455268965336e-05, "loss": 0.7366, "step": 974 }, { "epoch": 0.439288127956747, "grad_norm": 0.604594790630414, "learning_rate": 1.8972467818001914e-05, "loss": 0.6888, "step": 975 }, { "epoch": 0.4397386798828565, "grad_norm": 0.5966110058139527, "learning_rate": 1.897038094386969e-05, "loss": 0.7252, "step": 976 }, { "epoch": 0.44018923180896596, "grad_norm": 0.5614579020033307, "learning_rate": 1.8968292067722433e-05, "loss": 0.6732, "step": 977 }, { "epoch": 0.44063978373507545, "grad_norm": 0.6253482258624531, "learning_rate": 1.896620119002635e-05, "loss": 0.6835, "step": 978 }, { "epoch": 0.44109033566118494, "grad_norm": 0.6216979204885444, "learning_rate": 1.8964108311248076e-05, "loss": 0.6871, "step": 979 }, { "epoch": 0.44154088758729443, "grad_norm": 0.6368874496670075, "learning_rate": 1.8962013431854705e-05, "loss": 0.6761, "step": 980 }, { "epoch": 0.4419914395134039, "grad_norm": 0.5703785240298969, "learning_rate": 1.895991655231377e-05, "loss": 0.6536, "step": 981 }, { "epoch": 0.4424419914395134, "grad_norm": 0.6693280692044404, "learning_rate": 1.8957817673093258e-05, "loss": 0.6654, "step": 982 }, { "epoch": 0.4428925433656229, "grad_norm": 0.6263581456999108, "learning_rate": 1.8955716794661594e-05, "loss": 0.6617, "step": 983 }, { "epoch": 0.4433430952917324, "grad_norm": 0.5861292503409785, "learning_rate": 1.8953613917487657e-05, "loss": 0.661, "step": 984 }, { "epoch": 0.44379364721784187, "grad_norm": 0.6095968110175319, "learning_rate": 1.8951509042040764e-05, "loss": 0.7161, "step": 985 }, { "epoch": 0.44424419914395136, "grad_norm": 0.6147448759999667, "learning_rate": 1.8949402168790685e-05, "loss": 0.6672, "step": 986 }, { "epoch": 0.44469475107006085, "grad_norm": 0.6411045739861455, "learning_rate": 1.8947293298207637e-05, "loss": 0.6894, "step": 987 }, { "epoch": 0.44514530299617033, "grad_norm": 0.6041376989821523, "learning_rate": 1.894518243076227e-05, "loss": 0.6588, "step": 988 }, { "epoch": 0.44559585492227977, "grad_norm": 0.6994609535083585, "learning_rate": 1.8943069566925698e-05, "loss": 0.6933, "step": 989 }, { "epoch": 0.44604640684838925, "grad_norm": 0.6065905623040925, "learning_rate": 1.8940954707169466e-05, "loss": 0.6897, "step": 990 }, { "epoch": 0.44649695877449874, "grad_norm": 0.6566667361461106, "learning_rate": 1.893883785196557e-05, "loss": 0.6757, "step": 991 }, { "epoch": 0.44694751070060823, "grad_norm": 0.5862522363671862, "learning_rate": 1.8936719001786453e-05, "loss": 0.6537, "step": 992 }, { "epoch": 0.4473980626267177, "grad_norm": 0.7850083438178136, "learning_rate": 1.8934598157105e-05, "loss": 0.6807, "step": 993 }, { "epoch": 0.4478486145528272, "grad_norm": 0.5748881460622577, "learning_rate": 1.8932475318394542e-05, "loss": 0.6862, "step": 994 }, { "epoch": 0.4482991664789367, "grad_norm": 0.6873586395106196, "learning_rate": 1.8930350486128855e-05, "loss": 0.6469, "step": 995 }, { "epoch": 0.4487497184050462, "grad_norm": 0.6197795003683645, "learning_rate": 1.892822366078216e-05, "loss": 0.6831, "step": 996 }, { "epoch": 0.44920027033115567, "grad_norm": 0.6742852772303225, "learning_rate": 1.8926094842829128e-05, "loss": 0.6739, "step": 997 }, { "epoch": 0.44965082225726516, "grad_norm": 0.6347669282432544, "learning_rate": 1.8923964032744866e-05, "loss": 0.7108, "step": 998 }, { "epoch": 0.45010137418337465, "grad_norm": 0.6183967255106245, "learning_rate": 1.8921831231004926e-05, "loss": 0.6738, "step": 999 }, { "epoch": 0.45055192610948414, "grad_norm": 0.6454314436701737, "learning_rate": 1.8919696438085315e-05, "loss": 0.6991, "step": 1000 }, { "epoch": 0.4510024780355936, "grad_norm": 0.5952826013714723, "learning_rate": 1.8917559654462474e-05, "loss": 0.6686, "step": 1001 }, { "epoch": 0.4514530299617031, "grad_norm": 0.6792779852031383, "learning_rate": 1.891542088061329e-05, "loss": 0.6916, "step": 1002 }, { "epoch": 0.45190358188781254, "grad_norm": 0.5608961436304026, "learning_rate": 1.89132801170151e-05, "loss": 0.6725, "step": 1003 }, { "epoch": 0.45235413381392203, "grad_norm": 0.6090291037411772, "learning_rate": 1.8911137364145675e-05, "loss": 0.6753, "step": 1004 }, { "epoch": 0.4528046857400315, "grad_norm": 0.6112616873262537, "learning_rate": 1.8908992622483242e-05, "loss": 0.6698, "step": 1005 }, { "epoch": 0.453255237666141, "grad_norm": 0.5419597344849721, "learning_rate": 1.8906845892506463e-05, "loss": 0.6864, "step": 1006 }, { "epoch": 0.4537057895922505, "grad_norm": 0.6117060197675015, "learning_rate": 1.8904697174694447e-05, "loss": 0.6703, "step": 1007 }, { "epoch": 0.45415634151836, "grad_norm": 0.6282358990876972, "learning_rate": 1.8902546469526744e-05, "loss": 0.6829, "step": 1008 }, { "epoch": 0.4546068934444695, "grad_norm": 0.5764097462927967, "learning_rate": 1.8900393777483354e-05, "loss": 0.6666, "step": 1009 }, { "epoch": 0.45505744537057896, "grad_norm": 0.5776024229048721, "learning_rate": 1.8898239099044713e-05, "loss": 0.6766, "step": 1010 }, { "epoch": 0.45550799729668845, "grad_norm": 0.6098642460187843, "learning_rate": 1.8896082434691702e-05, "loss": 0.6712, "step": 1011 }, { "epoch": 0.45595854922279794, "grad_norm": 0.6239639066509542, "learning_rate": 1.8893923784905647e-05, "loss": 0.6772, "step": 1012 }, { "epoch": 0.4564091011489074, "grad_norm": 0.5729741713172605, "learning_rate": 1.8891763150168323e-05, "loss": 0.6847, "step": 1013 }, { "epoch": 0.4568596530750169, "grad_norm": 0.5850889859845099, "learning_rate": 1.8889600530961935e-05, "loss": 0.6664, "step": 1014 }, { "epoch": 0.4573102050011264, "grad_norm": 0.5826807361356316, "learning_rate": 1.8887435927769137e-05, "loss": 0.689, "step": 1015 }, { "epoch": 0.4577607569272359, "grad_norm": 0.5770865098906778, "learning_rate": 1.8885269341073034e-05, "loss": 0.6818, "step": 1016 }, { "epoch": 0.4582113088533453, "grad_norm": 0.5976348122471122, "learning_rate": 1.888310077135716e-05, "loss": 0.6492, "step": 1017 }, { "epoch": 0.4586618607794548, "grad_norm": 0.5709422311102121, "learning_rate": 1.88809302191055e-05, "loss": 0.6786, "step": 1018 }, { "epoch": 0.4591124127055643, "grad_norm": 0.5677088778843076, "learning_rate": 1.8878757684802474e-05, "loss": 0.6966, "step": 1019 }, { "epoch": 0.4595629646316738, "grad_norm": 0.6052339304621245, "learning_rate": 1.8876583168932957e-05, "loss": 0.6426, "step": 1020 }, { "epoch": 0.4600135165577833, "grad_norm": 0.6048100194981031, "learning_rate": 1.8874406671982255e-05, "loss": 0.6941, "step": 1021 }, { "epoch": 0.46046406848389276, "grad_norm": 0.5583504513349751, "learning_rate": 1.887222819443612e-05, "loss": 0.7034, "step": 1022 }, { "epoch": 0.46091462041000225, "grad_norm": 0.6245975744558699, "learning_rate": 1.887004773678075e-05, "loss": 0.6896, "step": 1023 }, { "epoch": 0.46136517233611174, "grad_norm": 0.5859154247246579, "learning_rate": 1.886786529950277e-05, "loss": 0.6531, "step": 1024 }, { "epoch": 0.4618157242622212, "grad_norm": 0.5501518375231017, "learning_rate": 1.886568088308927e-05, "loss": 0.648, "step": 1025 }, { "epoch": 0.4622662761883307, "grad_norm": 0.6090575467865375, "learning_rate": 1.8863494488027763e-05, "loss": 0.6759, "step": 1026 }, { "epoch": 0.4627168281144402, "grad_norm": 0.57466950745878, "learning_rate": 1.886130611480621e-05, "loss": 0.6803, "step": 1027 }, { "epoch": 0.4631673800405497, "grad_norm": 0.5725984202977035, "learning_rate": 1.8859115763913016e-05, "loss": 0.6879, "step": 1028 }, { "epoch": 0.4636179319666592, "grad_norm": 0.5476141934131734, "learning_rate": 1.8856923435837024e-05, "loss": 0.6509, "step": 1029 }, { "epoch": 0.46406848389276867, "grad_norm": 0.592948594665043, "learning_rate": 1.8854729131067517e-05, "loss": 0.7118, "step": 1030 }, { "epoch": 0.4645190358188781, "grad_norm": 0.5653945611484399, "learning_rate": 1.8852532850094226e-05, "loss": 0.7117, "step": 1031 }, { "epoch": 0.4649695877449876, "grad_norm": 0.6182362787837052, "learning_rate": 1.885033459340731e-05, "loss": 0.699, "step": 1032 }, { "epoch": 0.4654201396710971, "grad_norm": 0.5758399909426141, "learning_rate": 1.8848134361497385e-05, "loss": 0.7083, "step": 1033 }, { "epoch": 0.46587069159720657, "grad_norm": 0.5868236340170246, "learning_rate": 1.8845932154855498e-05, "loss": 0.662, "step": 1034 }, { "epoch": 0.46632124352331605, "grad_norm": 0.5686717289513261, "learning_rate": 1.884372797397314e-05, "loss": 0.7141, "step": 1035 }, { "epoch": 0.46677179544942554, "grad_norm": 0.5819174259307529, "learning_rate": 1.8841521819342237e-05, "loss": 0.7139, "step": 1036 }, { "epoch": 0.46722234737553503, "grad_norm": 0.5466167561267398, "learning_rate": 1.8839313691455163e-05, "loss": 0.6542, "step": 1037 }, { "epoch": 0.4676728993016445, "grad_norm": 0.584425650219919, "learning_rate": 1.883710359080473e-05, "loss": 0.6633, "step": 1038 }, { "epoch": 0.468123451227754, "grad_norm": 0.5914444580286475, "learning_rate": 1.8834891517884188e-05, "loss": 0.708, "step": 1039 }, { "epoch": 0.4685740031538635, "grad_norm": 0.5678341446423233, "learning_rate": 1.8832677473187228e-05, "loss": 0.6951, "step": 1040 }, { "epoch": 0.469024555079973, "grad_norm": 0.5925946370206846, "learning_rate": 1.8830461457207984e-05, "loss": 0.6818, "step": 1041 }, { "epoch": 0.46947510700608247, "grad_norm": 0.553412484149959, "learning_rate": 1.8828243470441026e-05, "loss": 0.6948, "step": 1042 }, { "epoch": 0.46992565893219196, "grad_norm": 0.6751523648067929, "learning_rate": 1.8826023513381372e-05, "loss": 0.7142, "step": 1043 }, { "epoch": 0.47037621085830145, "grad_norm": 0.5658493483059694, "learning_rate": 1.8823801586524465e-05, "loss": 0.6508, "step": 1044 }, { "epoch": 0.4708267627844109, "grad_norm": 0.6132117236465419, "learning_rate": 1.8821577690366194e-05, "loss": 0.6864, "step": 1045 }, { "epoch": 0.47127731471052037, "grad_norm": 0.597948842189407, "learning_rate": 1.88193518254029e-05, "loss": 0.6847, "step": 1046 }, { "epoch": 0.47172786663662986, "grad_norm": 0.6023693306241737, "learning_rate": 1.8817123992131344e-05, "loss": 0.6687, "step": 1047 }, { "epoch": 0.47217841856273934, "grad_norm": 0.6284442329610882, "learning_rate": 1.8814894191048744e-05, "loss": 0.6697, "step": 1048 }, { "epoch": 0.47262897048884883, "grad_norm": 0.6414551572386332, "learning_rate": 1.8812662422652733e-05, "loss": 0.6658, "step": 1049 }, { "epoch": 0.4730795224149583, "grad_norm": 0.5679398442195762, "learning_rate": 1.8810428687441415e-05, "loss": 0.6769, "step": 1050 }, { "epoch": 0.4735300743410678, "grad_norm": 0.6358577282062878, "learning_rate": 1.880819298591331e-05, "loss": 0.6651, "step": 1051 }, { "epoch": 0.4739806262671773, "grad_norm": 0.5520681120770723, "learning_rate": 1.880595531856738e-05, "loss": 0.6804, "step": 1052 }, { "epoch": 0.4744311781932868, "grad_norm": 0.6330912840545794, "learning_rate": 1.8803715685903034e-05, "loss": 0.7148, "step": 1053 }, { "epoch": 0.4748817301193963, "grad_norm": 0.5600765537005309, "learning_rate": 1.880147408842011e-05, "loss": 0.6584, "step": 1054 }, { "epoch": 0.47533228204550576, "grad_norm": 0.6292134616752805, "learning_rate": 1.8799230526618896e-05, "loss": 0.683, "step": 1055 }, { "epoch": 0.47578283397161525, "grad_norm": 0.5828264651704995, "learning_rate": 1.8796985001000104e-05, "loss": 0.6644, "step": 1056 }, { "epoch": 0.47623338589772474, "grad_norm": 0.6759346169458792, "learning_rate": 1.879473751206489e-05, "loss": 0.7095, "step": 1057 }, { "epoch": 0.47668393782383417, "grad_norm": 0.6592089116088745, "learning_rate": 1.8792488060314862e-05, "loss": 0.6897, "step": 1058 }, { "epoch": 0.47713448974994366, "grad_norm": 0.5680021934126724, "learning_rate": 1.879023664625204e-05, "loss": 0.6665, "step": 1059 }, { "epoch": 0.47758504167605315, "grad_norm": 0.6750870363520871, "learning_rate": 1.8787983270378908e-05, "loss": 0.7055, "step": 1060 }, { "epoch": 0.47803559360216263, "grad_norm": 0.5771594994330355, "learning_rate": 1.878572793319837e-05, "loss": 0.6816, "step": 1061 }, { "epoch": 0.4784861455282721, "grad_norm": 0.6178738626447426, "learning_rate": 1.878347063521377e-05, "loss": 0.6657, "step": 1062 }, { "epoch": 0.4789366974543816, "grad_norm": 0.5947297867440946, "learning_rate": 1.8781211376928898e-05, "loss": 0.6601, "step": 1063 }, { "epoch": 0.4793872493804911, "grad_norm": 0.5741367623082271, "learning_rate": 1.8778950158847976e-05, "loss": 0.6749, "step": 1064 }, { "epoch": 0.4798378013066006, "grad_norm": 0.6130759797998155, "learning_rate": 1.8776686981475664e-05, "loss": 0.6513, "step": 1065 }, { "epoch": 0.4802883532327101, "grad_norm": 0.6042870691349929, "learning_rate": 1.877442184531706e-05, "loss": 0.6991, "step": 1066 }, { "epoch": 0.48073890515881956, "grad_norm": 0.625737308839797, "learning_rate": 1.8772154750877696e-05, "loss": 0.6721, "step": 1067 }, { "epoch": 0.48118945708492905, "grad_norm": 0.5793086996864694, "learning_rate": 1.8769885698663546e-05, "loss": 0.6481, "step": 1068 }, { "epoch": 0.48164000901103854, "grad_norm": 0.621615952957373, "learning_rate": 1.8767614689181017e-05, "loss": 0.7043, "step": 1069 }, { "epoch": 0.482090560937148, "grad_norm": 0.6409740986296336, "learning_rate": 1.8765341722936952e-05, "loss": 0.6885, "step": 1070 }, { "epoch": 0.4825411128632575, "grad_norm": 0.5767917048962207, "learning_rate": 1.8763066800438638e-05, "loss": 0.7239, "step": 1071 }, { "epoch": 0.48299166478936695, "grad_norm": 0.6046634881554055, "learning_rate": 1.876078992219379e-05, "loss": 0.6723, "step": 1072 }, { "epoch": 0.48344221671547644, "grad_norm": 0.5533539733562594, "learning_rate": 1.8758511088710564e-05, "loss": 0.6726, "step": 1073 }, { "epoch": 0.4838927686415859, "grad_norm": 0.5852878688436488, "learning_rate": 1.8756230300497553e-05, "loss": 0.6969, "step": 1074 }, { "epoch": 0.4843433205676954, "grad_norm": 0.60219559723547, "learning_rate": 1.875394755806378e-05, "loss": 0.6892, "step": 1075 }, { "epoch": 0.4847938724938049, "grad_norm": 0.5709560980804564, "learning_rate": 1.8751662861918716e-05, "loss": 0.6905, "step": 1076 }, { "epoch": 0.4852444244199144, "grad_norm": 0.5617745776277269, "learning_rate": 1.8749376212572254e-05, "loss": 0.6713, "step": 1077 }, { "epoch": 0.4856949763460239, "grad_norm": 0.5507388621117723, "learning_rate": 1.8747087610534735e-05, "loss": 0.7023, "step": 1078 }, { "epoch": 0.48614552827213336, "grad_norm": 0.5469450911979419, "learning_rate": 1.874479705631693e-05, "loss": 0.6907, "step": 1079 }, { "epoch": 0.48659608019824285, "grad_norm": 0.5860366364344379, "learning_rate": 1.874250455043004e-05, "loss": 0.6928, "step": 1080 }, { "epoch": 0.48704663212435234, "grad_norm": 0.5264438883734882, "learning_rate": 1.8740210093385712e-05, "loss": 0.6493, "step": 1081 }, { "epoch": 0.48749718405046183, "grad_norm": 0.6004467873215276, "learning_rate": 1.873791368569603e-05, "loss": 0.7055, "step": 1082 }, { "epoch": 0.4879477359765713, "grad_norm": 0.5513132385953922, "learning_rate": 1.87356153278735e-05, "loss": 0.6493, "step": 1083 }, { "epoch": 0.4883982879026808, "grad_norm": 0.5522486152410854, "learning_rate": 1.8733315020431072e-05, "loss": 0.6591, "step": 1084 }, { "epoch": 0.4888488398287903, "grad_norm": 0.5190072930568933, "learning_rate": 1.8731012763882132e-05, "loss": 0.6804, "step": 1085 }, { "epoch": 0.4892993917548997, "grad_norm": 0.5706009739675693, "learning_rate": 1.8728708558740497e-05, "loss": 0.6923, "step": 1086 }, { "epoch": 0.4897499436810092, "grad_norm": 0.563960624474572, "learning_rate": 1.8726402405520425e-05, "loss": 0.6893, "step": 1087 }, { "epoch": 0.4902004956071187, "grad_norm": 0.5779234959177258, "learning_rate": 1.87240943047366e-05, "loss": 0.7312, "step": 1088 }, { "epoch": 0.4906510475332282, "grad_norm": 0.5296460456357791, "learning_rate": 1.8721784256904148e-05, "loss": 0.6977, "step": 1089 }, { "epoch": 0.4911015994593377, "grad_norm": 0.5727132948403655, "learning_rate": 1.8719472262538624e-05, "loss": 0.7053, "step": 1090 }, { "epoch": 0.49155215138544717, "grad_norm": 0.5575564485521026, "learning_rate": 1.8717158322156025e-05, "loss": 0.7157, "step": 1091 }, { "epoch": 0.49200270331155666, "grad_norm": 0.5774320979986938, "learning_rate": 1.8714842436272774e-05, "loss": 0.6453, "step": 1092 }, { "epoch": 0.49245325523766614, "grad_norm": 0.5342982022189675, "learning_rate": 1.8712524605405733e-05, "loss": 0.6202, "step": 1093 }, { "epoch": 0.49290380716377563, "grad_norm": 0.5164983819712905, "learning_rate": 1.8710204830072197e-05, "loss": 0.6501, "step": 1094 }, { "epoch": 0.4933543590898851, "grad_norm": 0.5332655126891364, "learning_rate": 1.870788311078989e-05, "loss": 0.7093, "step": 1095 }, { "epoch": 0.4938049110159946, "grad_norm": 0.5272475788770223, "learning_rate": 1.8705559448076987e-05, "loss": 0.6633, "step": 1096 }, { "epoch": 0.4942554629421041, "grad_norm": 0.569238986913456, "learning_rate": 1.8703233842452072e-05, "loss": 0.688, "step": 1097 }, { "epoch": 0.4947060148682136, "grad_norm": 0.5284733781598008, "learning_rate": 1.870090629443418e-05, "loss": 0.6784, "step": 1098 }, { "epoch": 0.49515656679432307, "grad_norm": 0.6013292982325386, "learning_rate": 1.8698576804542775e-05, "loss": 0.7023, "step": 1099 }, { "epoch": 0.4956071187204325, "grad_norm": 0.5695753332571705, "learning_rate": 1.8696245373297756e-05, "loss": 0.6811, "step": 1100 }, { "epoch": 0.496057670646542, "grad_norm": 0.5415306955928785, "learning_rate": 1.8693912001219443e-05, "loss": 0.6725, "step": 1101 }, { "epoch": 0.4965082225726515, "grad_norm": 0.552499474572317, "learning_rate": 1.8691576688828613e-05, "loss": 0.6641, "step": 1102 }, { "epoch": 0.49695877449876097, "grad_norm": 0.5934545565407001, "learning_rate": 1.8689239436646454e-05, "loss": 0.6825, "step": 1103 }, { "epoch": 0.49740932642487046, "grad_norm": 0.5269890166067902, "learning_rate": 1.86869002451946e-05, "loss": 0.6744, "step": 1104 }, { "epoch": 0.49785987835097995, "grad_norm": 0.5840298203900371, "learning_rate": 1.8684559114995106e-05, "loss": 0.6788, "step": 1105 }, { "epoch": 0.49831043027708943, "grad_norm": 0.564448777608455, "learning_rate": 1.8682216046570477e-05, "loss": 0.6907, "step": 1106 }, { "epoch": 0.4987609822031989, "grad_norm": 0.5912982415649296, "learning_rate": 1.8679871040443632e-05, "loss": 0.6681, "step": 1107 }, { "epoch": 0.4992115341293084, "grad_norm": 0.576082179809888, "learning_rate": 1.8677524097137936e-05, "loss": 0.6822, "step": 1108 }, { "epoch": 0.4996620860554179, "grad_norm": 0.6376476677567713, "learning_rate": 1.8675175217177176e-05, "loss": 0.7029, "step": 1109 }, { "epoch": 0.5001126379815274, "grad_norm": 0.6024002600086726, "learning_rate": 1.8672824401085582e-05, "loss": 0.6857, "step": 1110 }, { "epoch": 0.5001126379815274, "eval_loss": 0.6503483057022095, "eval_runtime": 24.3748, "eval_samples_per_second": 11.446, "eval_steps_per_second": 0.492, "step": 1110 }, { "epoch": 0.5005631899076368, "grad_norm": 0.6227377745848116, "learning_rate": 1.8670471649387808e-05, "loss": 0.6352, "step": 1111 }, { "epoch": 0.5010137418337464, "grad_norm": 0.6489183016207347, "learning_rate": 1.866811696260894e-05, "loss": 0.7201, "step": 1112 }, { "epoch": 0.5014642937598558, "grad_norm": 0.5770854344204118, "learning_rate": 1.8665760341274505e-05, "loss": 0.691, "step": 1113 }, { "epoch": 0.5019148456859653, "grad_norm": 0.6246012145020519, "learning_rate": 1.866340178591045e-05, "loss": 0.6691, "step": 1114 }, { "epoch": 0.5023653976120748, "grad_norm": 0.5144492072703452, "learning_rate": 1.8661041297043164e-05, "loss": 0.667, "step": 1115 }, { "epoch": 0.5028159495381843, "grad_norm": 0.5348324444611896, "learning_rate": 1.865867887519945e-05, "loss": 0.6685, "step": 1116 }, { "epoch": 0.5032665014642937, "grad_norm": 0.5893235949679978, "learning_rate": 1.865631452090657e-05, "loss": 0.6789, "step": 1117 }, { "epoch": 0.5037170533904033, "grad_norm": 0.5625264105436037, "learning_rate": 1.8653948234692195e-05, "loss": 0.6511, "step": 1118 }, { "epoch": 0.5041676053165127, "grad_norm": 0.5590435655283446, "learning_rate": 1.865158001708443e-05, "loss": 0.7265, "step": 1119 }, { "epoch": 0.5046181572426223, "grad_norm": 0.6770871830252959, "learning_rate": 1.8649209868611822e-05, "loss": 0.7012, "step": 1120 }, { "epoch": 0.5050687091687317, "grad_norm": 0.5525450024546693, "learning_rate": 1.864683778980334e-05, "loss": 0.6341, "step": 1121 }, { "epoch": 0.5055192610948411, "grad_norm": 0.6491432058095081, "learning_rate": 1.8644463781188387e-05, "loss": 0.7073, "step": 1122 }, { "epoch": 0.5059698130209507, "grad_norm": 0.631856386664538, "learning_rate": 1.8642087843296793e-05, "loss": 0.719, "step": 1123 }, { "epoch": 0.5064203649470601, "grad_norm": 0.5796615693233559, "learning_rate": 1.8639709976658822e-05, "loss": 0.6974, "step": 1124 }, { "epoch": 0.5068709168731697, "grad_norm": 0.6050581374762166, "learning_rate": 1.8637330181805167e-05, "loss": 0.6546, "step": 1125 }, { "epoch": 0.5073214687992791, "grad_norm": 0.6110784232903589, "learning_rate": 1.8634948459266957e-05, "loss": 0.6851, "step": 1126 }, { "epoch": 0.5077720207253886, "grad_norm": 0.6441830946556758, "learning_rate": 1.863256480957574e-05, "loss": 0.6488, "step": 1127 }, { "epoch": 0.5082225726514981, "grad_norm": 0.5702871050503732, "learning_rate": 1.8630179233263505e-05, "loss": 0.676, "step": 1128 }, { "epoch": 0.5086731245776076, "grad_norm": 0.5963069318021057, "learning_rate": 1.8627791730862663e-05, "loss": 0.6864, "step": 1129 }, { "epoch": 0.509123676503717, "grad_norm": 0.6198178979090223, "learning_rate": 1.8625402302906058e-05, "loss": 0.6959, "step": 1130 }, { "epoch": 0.5095742284298266, "grad_norm": 0.611266632270247, "learning_rate": 1.862301094992697e-05, "loss": 0.6744, "step": 1131 }, { "epoch": 0.510024780355936, "grad_norm": 0.6863988570188357, "learning_rate": 1.8620617672459097e-05, "loss": 0.6671, "step": 1132 }, { "epoch": 0.5104753322820456, "grad_norm": 0.62701245928533, "learning_rate": 1.8618222471036575e-05, "loss": 0.6842, "step": 1133 }, { "epoch": 0.510925884208155, "grad_norm": 0.6372665316544257, "learning_rate": 1.861582534619396e-05, "loss": 0.6482, "step": 1134 }, { "epoch": 0.5113764361342644, "grad_norm": 0.5906073611054105, "learning_rate": 1.8613426298466253e-05, "loss": 0.6736, "step": 1135 }, { "epoch": 0.511826988060374, "grad_norm": 0.6556545382799398, "learning_rate": 1.8611025328388867e-05, "loss": 0.6891, "step": 1136 }, { "epoch": 0.5122775399864834, "grad_norm": 0.616078157923779, "learning_rate": 1.8608622436497657e-05, "loss": 0.6898, "step": 1137 }, { "epoch": 0.5127280919125929, "grad_norm": 0.6221160743624736, "learning_rate": 1.8606217623328898e-05, "loss": 0.6808, "step": 1138 }, { "epoch": 0.5131786438387024, "grad_norm": 0.6140838077301787, "learning_rate": 1.8603810889419306e-05, "loss": 0.673, "step": 1139 }, { "epoch": 0.5136291957648119, "grad_norm": 0.602371983659265, "learning_rate": 1.8601402235306007e-05, "loss": 0.6936, "step": 1140 }, { "epoch": 0.5140797476909214, "grad_norm": 0.5709019185456803, "learning_rate": 1.859899166152657e-05, "loss": 0.6792, "step": 1141 }, { "epoch": 0.5145302996170309, "grad_norm": 0.552913295251678, "learning_rate": 1.859657916861899e-05, "loss": 0.7057, "step": 1142 }, { "epoch": 0.5149808515431403, "grad_norm": 0.6252998921881732, "learning_rate": 1.8594164757121688e-05, "loss": 0.694, "step": 1143 }, { "epoch": 0.5154314034692499, "grad_norm": 0.6127680395628723, "learning_rate": 1.859174842757351e-05, "loss": 0.6848, "step": 1144 }, { "epoch": 0.5158819553953593, "grad_norm": 0.6279999509859239, "learning_rate": 1.858933018051374e-05, "loss": 0.6736, "step": 1145 }, { "epoch": 0.5163325073214688, "grad_norm": 0.5415341683202036, "learning_rate": 1.858691001648208e-05, "loss": 0.6948, "step": 1146 }, { "epoch": 0.5167830592475783, "grad_norm": 0.5745472430467878, "learning_rate": 1.8584487936018663e-05, "loss": 0.6784, "step": 1147 }, { "epoch": 0.5172336111736877, "grad_norm": 0.5783891247453713, "learning_rate": 1.8582063939664052e-05, "loss": 0.66, "step": 1148 }, { "epoch": 0.5176841630997973, "grad_norm": 0.5916793871339253, "learning_rate": 1.8579638027959234e-05, "loss": 0.655, "step": 1149 }, { "epoch": 0.5181347150259067, "grad_norm": 0.5730490602150755, "learning_rate": 1.8577210201445634e-05, "loss": 0.7106, "step": 1150 }, { "epoch": 0.5185852669520162, "grad_norm": 0.5428881731191245, "learning_rate": 1.857478046066508e-05, "loss": 0.6686, "step": 1151 }, { "epoch": 0.5190358188781257, "grad_norm": 0.6010440703293176, "learning_rate": 1.8572348806159857e-05, "loss": 0.6746, "step": 1152 }, { "epoch": 0.5194863708042352, "grad_norm": 0.5502930349573192, "learning_rate": 1.8569915238472657e-05, "loss": 0.6589, "step": 1153 }, { "epoch": 0.5199369227303446, "grad_norm": 0.5710735957720621, "learning_rate": 1.8567479758146608e-05, "loss": 0.698, "step": 1154 }, { "epoch": 0.5203874746564542, "grad_norm": 0.5683238787380646, "learning_rate": 1.856504236572526e-05, "loss": 0.6801, "step": 1155 }, { "epoch": 0.5208380265825636, "grad_norm": 0.5580132816029447, "learning_rate": 1.856260306175259e-05, "loss": 0.7037, "step": 1156 }, { "epoch": 0.5212885785086732, "grad_norm": 0.5695766244139115, "learning_rate": 1.8560161846773002e-05, "loss": 0.661, "step": 1157 }, { "epoch": 0.5217391304347826, "grad_norm": 0.5941133319289373, "learning_rate": 1.8557718721331336e-05, "loss": 0.6869, "step": 1158 }, { "epoch": 0.5221896823608921, "grad_norm": 0.531564462333528, "learning_rate": 1.8555273685972842e-05, "loss": 0.6352, "step": 1159 }, { "epoch": 0.5226402342870016, "grad_norm": 0.6019239868709052, "learning_rate": 1.855282674124321e-05, "loss": 0.6877, "step": 1160 }, { "epoch": 0.5230907862131111, "grad_norm": 0.5620639060806033, "learning_rate": 1.8550377887688545e-05, "loss": 0.6535, "step": 1161 }, { "epoch": 0.5235413381392205, "grad_norm": 0.5904966367574523, "learning_rate": 1.854792712585539e-05, "loss": 0.7003, "step": 1162 }, { "epoch": 0.52399189006533, "grad_norm": 0.5596585501909831, "learning_rate": 1.8545474456290705e-05, "loss": 0.6813, "step": 1163 }, { "epoch": 0.5244424419914395, "grad_norm": 0.5976322335022602, "learning_rate": 1.8543019879541876e-05, "loss": 0.7019, "step": 1164 }, { "epoch": 0.524892993917549, "grad_norm": 0.5747048286177595, "learning_rate": 1.8540563396156722e-05, "loss": 0.7064, "step": 1165 }, { "epoch": 0.5253435458436585, "grad_norm": 0.5684999513078429, "learning_rate": 1.8538105006683474e-05, "loss": 0.6856, "step": 1166 }, { "epoch": 0.5257940977697679, "grad_norm": 0.56820135059807, "learning_rate": 1.8535644711670804e-05, "loss": 0.6544, "step": 1167 }, { "epoch": 0.5262446496958775, "grad_norm": 0.632393875848379, "learning_rate": 1.8533182511667797e-05, "loss": 0.7066, "step": 1168 }, { "epoch": 0.5266952016219869, "grad_norm": 0.5725806123034395, "learning_rate": 1.8530718407223976e-05, "loss": 0.7037, "step": 1169 }, { "epoch": 0.5271457535480965, "grad_norm": 0.5814888175555157, "learning_rate": 1.8528252398889277e-05, "loss": 0.6857, "step": 1170 }, { "epoch": 0.5275963054742059, "grad_norm": 0.577823120057046, "learning_rate": 1.8525784487214064e-05, "loss": 0.6591, "step": 1171 }, { "epoch": 0.5280468574003154, "grad_norm": 0.576321952938968, "learning_rate": 1.8523314672749123e-05, "loss": 0.6655, "step": 1172 }, { "epoch": 0.5284974093264249, "grad_norm": 0.6336910968983103, "learning_rate": 1.8520842956045676e-05, "loss": 0.6578, "step": 1173 }, { "epoch": 0.5289479612525344, "grad_norm": 0.5678548651236375, "learning_rate": 1.8518369337655363e-05, "loss": 0.7057, "step": 1174 }, { "epoch": 0.5293985131786438, "grad_norm": 0.6050261990160175, "learning_rate": 1.8515893818130243e-05, "loss": 0.6748, "step": 1175 }, { "epoch": 0.5298490651047533, "grad_norm": 0.5998748798887009, "learning_rate": 1.8513416398022803e-05, "loss": 0.6585, "step": 1176 }, { "epoch": 0.5302996170308628, "grad_norm": 0.5652487040685084, "learning_rate": 1.851093707788596e-05, "loss": 0.6636, "step": 1177 }, { "epoch": 0.5307501689569722, "grad_norm": 0.6459317405461462, "learning_rate": 1.8508455858273045e-05, "loss": 0.6879, "step": 1178 }, { "epoch": 0.5312007208830818, "grad_norm": 0.5922458469047949, "learning_rate": 1.8505972739737822e-05, "loss": 0.7136, "step": 1179 }, { "epoch": 0.5316512728091912, "grad_norm": 0.5999317287004171, "learning_rate": 1.8503487722834476e-05, "loss": 0.7035, "step": 1180 }, { "epoch": 0.5321018247353008, "grad_norm": 0.6298850877113731, "learning_rate": 1.850100080811761e-05, "loss": 0.6686, "step": 1181 }, { "epoch": 0.5325523766614102, "grad_norm": 0.5781766477704815, "learning_rate": 1.8498511996142255e-05, "loss": 0.6491, "step": 1182 }, { "epoch": 0.5330029285875197, "grad_norm": 0.6132106796122279, "learning_rate": 1.849602128746387e-05, "loss": 0.7351, "step": 1183 }, { "epoch": 0.5334534805136292, "grad_norm": 0.6157892502305242, "learning_rate": 1.849352868263833e-05, "loss": 0.6905, "step": 1184 }, { "epoch": 0.5339040324397387, "grad_norm": 0.5371126848581431, "learning_rate": 1.849103418222194e-05, "loss": 0.6431, "step": 1185 }, { "epoch": 0.5343545843658482, "grad_norm": 0.5714141502945124, "learning_rate": 1.8488537786771417e-05, "loss": 0.6568, "step": 1186 }, { "epoch": 0.5348051362919577, "grad_norm": 0.6302392453774933, "learning_rate": 1.848603949684391e-05, "loss": 0.6735, "step": 1187 }, { "epoch": 0.5352556882180671, "grad_norm": 0.6023038173775288, "learning_rate": 1.8483539312996997e-05, "loss": 0.6791, "step": 1188 }, { "epoch": 0.5357062401441767, "grad_norm": 0.6055633421688397, "learning_rate": 1.848103723578866e-05, "loss": 0.6367, "step": 1189 }, { "epoch": 0.5361567920702861, "grad_norm": 0.5339612612409594, "learning_rate": 1.847853326577732e-05, "loss": 0.6891, "step": 1190 }, { "epoch": 0.5366073439963955, "grad_norm": 0.6741363481940019, "learning_rate": 1.847602740352181e-05, "loss": 0.7195, "step": 1191 }, { "epoch": 0.5370578959225051, "grad_norm": 0.5926746774749257, "learning_rate": 1.8473519649581396e-05, "loss": 0.7005, "step": 1192 }, { "epoch": 0.5375084478486145, "grad_norm": 0.5784741752674392, "learning_rate": 1.8471010004515757e-05, "loss": 0.6866, "step": 1193 }, { "epoch": 0.5379589997747241, "grad_norm": 0.6160797111260502, "learning_rate": 1.8468498468884997e-05, "loss": 0.7022, "step": 1194 }, { "epoch": 0.5384095517008335, "grad_norm": 0.68032457053801, "learning_rate": 1.846598504324964e-05, "loss": 0.6903, "step": 1195 }, { "epoch": 0.538860103626943, "grad_norm": 0.5831155402080687, "learning_rate": 1.8463469728170635e-05, "loss": 0.7029, "step": 1196 }, { "epoch": 0.5393106555530525, "grad_norm": 0.6428472883656214, "learning_rate": 1.8460952524209355e-05, "loss": 0.7215, "step": 1197 }, { "epoch": 0.539761207479162, "grad_norm": 0.5638449253409583, "learning_rate": 1.845843343192759e-05, "loss": 0.6873, "step": 1198 }, { "epoch": 0.5402117594052714, "grad_norm": 0.5904415141204172, "learning_rate": 1.8455912451887548e-05, "loss": 0.6584, "step": 1199 }, { "epoch": 0.540662311331381, "grad_norm": 0.5531500728817322, "learning_rate": 1.8453389584651863e-05, "loss": 0.6893, "step": 1200 }, { "epoch": 0.5411128632574904, "grad_norm": 0.6074289844123711, "learning_rate": 1.84508648307836e-05, "loss": 0.6698, "step": 1201 }, { "epoch": 0.5415634151836, "grad_norm": 0.5643236125974241, "learning_rate": 1.844833819084622e-05, "loss": 0.6365, "step": 1202 }, { "epoch": 0.5420139671097094, "grad_norm": 0.5682313047470735, "learning_rate": 1.8445809665403635e-05, "loss": 0.6579, "step": 1203 }, { "epoch": 0.5424645190358188, "grad_norm": 0.5474522628523086, "learning_rate": 1.8443279255020153e-05, "loss": 0.6667, "step": 1204 }, { "epoch": 0.5429150709619284, "grad_norm": 0.6009572835830052, "learning_rate": 1.8440746960260517e-05, "loss": 0.6603, "step": 1205 }, { "epoch": 0.5433656228880378, "grad_norm": 0.5756098647320187, "learning_rate": 1.843821278168988e-05, "loss": 0.7123, "step": 1206 }, { "epoch": 0.5438161748141473, "grad_norm": 0.5395006376647099, "learning_rate": 1.8435676719873828e-05, "loss": 0.6733, "step": 1207 }, { "epoch": 0.5442667267402568, "grad_norm": 0.5654783224467729, "learning_rate": 1.843313877537836e-05, "loss": 0.6582, "step": 1208 }, { "epoch": 0.5447172786663663, "grad_norm": 0.5733190550756676, "learning_rate": 1.8430598948769892e-05, "loss": 0.6583, "step": 1209 }, { "epoch": 0.5451678305924758, "grad_norm": 0.577401167915574, "learning_rate": 1.842805724061527e-05, "loss": 0.7129, "step": 1210 }, { "epoch": 0.5456183825185853, "grad_norm": 0.6551111775095596, "learning_rate": 1.8425513651481748e-05, "loss": 0.6592, "step": 1211 }, { "epoch": 0.5460689344446947, "grad_norm": 0.5827477546325899, "learning_rate": 1.842296818193701e-05, "loss": 0.659, "step": 1212 }, { "epoch": 0.5465194863708043, "grad_norm": 0.6014263877859664, "learning_rate": 1.8420420832549154e-05, "loss": 0.7033, "step": 1213 }, { "epoch": 0.5469700382969137, "grad_norm": 0.6028129866117687, "learning_rate": 1.8417871603886698e-05, "loss": 0.6645, "step": 1214 }, { "epoch": 0.5474205902230233, "grad_norm": 0.556205381342657, "learning_rate": 1.8415320496518582e-05, "loss": 0.7233, "step": 1215 }, { "epoch": 0.5478711421491327, "grad_norm": 0.5650125756535757, "learning_rate": 1.841276751101416e-05, "loss": 0.7161, "step": 1216 }, { "epoch": 0.5483216940752421, "grad_norm": 0.5935156780577101, "learning_rate": 1.8410212647943215e-05, "loss": 0.6886, "step": 1217 }, { "epoch": 0.5487722460013517, "grad_norm": 0.5348016813420882, "learning_rate": 1.8407655907875937e-05, "loss": 0.6641, "step": 1218 }, { "epoch": 0.5492227979274611, "grad_norm": 0.6163642057980234, "learning_rate": 1.840509729138295e-05, "loss": 0.6712, "step": 1219 }, { "epoch": 0.5496733498535706, "grad_norm": 0.5767362912352102, "learning_rate": 1.8402536799035274e-05, "loss": 0.6874, "step": 1220 }, { "epoch": 0.5501239017796801, "grad_norm": 0.611601288973219, "learning_rate": 1.839997443140437e-05, "loss": 0.692, "step": 1221 }, { "epoch": 0.5505744537057896, "grad_norm": 0.5446819248367788, "learning_rate": 1.8397410189062106e-05, "loss": 0.6482, "step": 1222 }, { "epoch": 0.551025005631899, "grad_norm": 0.5787172380252542, "learning_rate": 1.8394844072580772e-05, "loss": 0.6641, "step": 1223 }, { "epoch": 0.5514755575580086, "grad_norm": 0.5100011912031546, "learning_rate": 1.839227608253308e-05, "loss": 0.6583, "step": 1224 }, { "epoch": 0.551926109484118, "grad_norm": 0.5955683822836078, "learning_rate": 1.8389706219492147e-05, "loss": 0.6818, "step": 1225 }, { "epoch": 0.5523766614102276, "grad_norm": 0.5376309414323521, "learning_rate": 1.838713448403152e-05, "loss": 0.6706, "step": 1226 }, { "epoch": 0.552827213336337, "grad_norm": 0.5624059591223907, "learning_rate": 1.8384560876725163e-05, "loss": 0.6449, "step": 1227 }, { "epoch": 0.5532777652624465, "grad_norm": 0.5631844285962034, "learning_rate": 1.8381985398147454e-05, "loss": 0.6529, "step": 1228 }, { "epoch": 0.553728317188556, "grad_norm": 0.6081047677764068, "learning_rate": 1.837940804887319e-05, "loss": 0.7354, "step": 1229 }, { "epoch": 0.5541788691146655, "grad_norm": 0.5709269675887828, "learning_rate": 1.8376828829477583e-05, "loss": 0.6775, "step": 1230 }, { "epoch": 0.554629421040775, "grad_norm": 0.555142389081679, "learning_rate": 1.8374247740536267e-05, "loss": 0.664, "step": 1231 }, { "epoch": 0.5550799729668844, "grad_norm": 0.550298884621343, "learning_rate": 1.8371664782625287e-05, "loss": 0.6805, "step": 1232 }, { "epoch": 0.5555305248929939, "grad_norm": 0.5462020899097142, "learning_rate": 1.8369079956321115e-05, "loss": 0.6879, "step": 1233 }, { "epoch": 0.5559810768191034, "grad_norm": 0.5632429525214296, "learning_rate": 1.836649326220063e-05, "loss": 0.6675, "step": 1234 }, { "epoch": 0.5564316287452129, "grad_norm": 0.5633953148502978, "learning_rate": 1.836390470084114e-05, "loss": 0.6878, "step": 1235 }, { "epoch": 0.5568821806713223, "grad_norm": 0.534029344140987, "learning_rate": 1.836131427282035e-05, "loss": 0.6937, "step": 1236 }, { "epoch": 0.5573327325974319, "grad_norm": 0.5739411180247042, "learning_rate": 1.8358721978716398e-05, "loss": 0.706, "step": 1237 }, { "epoch": 0.5577832845235413, "grad_norm": 0.5633497997497555, "learning_rate": 1.8356127819107832e-05, "loss": 0.6655, "step": 1238 }, { "epoch": 0.5582338364496509, "grad_norm": 0.5577116467218328, "learning_rate": 1.8353531794573623e-05, "loss": 0.6333, "step": 1239 }, { "epoch": 0.5586843883757603, "grad_norm": 0.5756877714065839, "learning_rate": 1.8350933905693155e-05, "loss": 0.6962, "step": 1240 }, { "epoch": 0.5591349403018698, "grad_norm": 0.5419335721422498, "learning_rate": 1.8348334153046217e-05, "loss": 0.7164, "step": 1241 }, { "epoch": 0.5595854922279793, "grad_norm": 0.5704924365020215, "learning_rate": 1.834573253721303e-05, "loss": 0.6636, "step": 1242 }, { "epoch": 0.5600360441540888, "grad_norm": 0.5592211574218315, "learning_rate": 1.834312905877422e-05, "loss": 0.6793, "step": 1243 }, { "epoch": 0.5604865960801982, "grad_norm": 0.5360386193531836, "learning_rate": 1.8340523718310836e-05, "loss": 0.6741, "step": 1244 }, { "epoch": 0.5609371480063077, "grad_norm": 0.6129932136294463, "learning_rate": 1.8337916516404343e-05, "loss": 0.633, "step": 1245 }, { "epoch": 0.5613876999324172, "grad_norm": 0.5343729939151841, "learning_rate": 1.833530745363661e-05, "loss": 0.6566, "step": 1246 }, { "epoch": 0.5618382518585266, "grad_norm": 0.602753252546135, "learning_rate": 1.8332696530589936e-05, "loss": 0.6733, "step": 1247 }, { "epoch": 0.5622888037846362, "grad_norm": 0.566767816694739, "learning_rate": 1.833008374784702e-05, "loss": 0.7015, "step": 1248 }, { "epoch": 0.5627393557107456, "grad_norm": 0.556320341686907, "learning_rate": 1.8327469105990997e-05, "loss": 0.6683, "step": 1249 }, { "epoch": 0.5631899076368552, "grad_norm": 0.5849883744629866, "learning_rate": 1.8324852605605393e-05, "loss": 0.6667, "step": 1250 }, { "epoch": 0.5636404595629646, "grad_norm": 0.5679673269818408, "learning_rate": 1.8322234247274164e-05, "loss": 0.6838, "step": 1251 }, { "epoch": 0.5640910114890741, "grad_norm": 0.5716451644587696, "learning_rate": 1.831961403158168e-05, "loss": 0.6736, "step": 1252 }, { "epoch": 0.5645415634151836, "grad_norm": 0.5458622905346836, "learning_rate": 1.831699195911272e-05, "loss": 0.6556, "step": 1253 }, { "epoch": 0.5649921153412931, "grad_norm": 0.5532634666097928, "learning_rate": 1.8314368030452475e-05, "loss": 0.7185, "step": 1254 }, { "epoch": 0.5654426672674026, "grad_norm": 0.5501644718683005, "learning_rate": 1.8311742246186563e-05, "loss": 0.6824, "step": 1255 }, { "epoch": 0.5658932191935121, "grad_norm": 0.5528132378109233, "learning_rate": 1.8309114606901e-05, "loss": 0.6732, "step": 1256 }, { "epoch": 0.5663437711196215, "grad_norm": 0.5073386555606747, "learning_rate": 1.830648511318223e-05, "loss": 0.6786, "step": 1257 }, { "epoch": 0.5667943230457311, "grad_norm": 0.5384140531997265, "learning_rate": 1.8303853765617107e-05, "loss": 0.6872, "step": 1258 }, { "epoch": 0.5672448749718405, "grad_norm": 0.5621178487190301, "learning_rate": 1.8301220564792887e-05, "loss": 0.6693, "step": 1259 }, { "epoch": 0.5676954268979499, "grad_norm": 0.5208384515234031, "learning_rate": 1.8298585511297263e-05, "loss": 0.687, "step": 1260 }, { "epoch": 0.5681459788240595, "grad_norm": 0.5582596923963405, "learning_rate": 1.8295948605718316e-05, "loss": 0.6543, "step": 1261 }, { "epoch": 0.5685965307501689, "grad_norm": 0.5419914707853236, "learning_rate": 1.8293309848644554e-05, "loss": 0.6759, "step": 1262 }, { "epoch": 0.5690470826762785, "grad_norm": 0.5817592099409602, "learning_rate": 1.8290669240664903e-05, "loss": 0.6751, "step": 1263 }, { "epoch": 0.5694976346023879, "grad_norm": 0.535268202304325, "learning_rate": 1.8288026782368686e-05, "loss": 0.6798, "step": 1264 }, { "epoch": 0.5699481865284974, "grad_norm": 0.5667372017116133, "learning_rate": 1.8285382474345657e-05, "loss": 0.6711, "step": 1265 }, { "epoch": 0.5703987384546069, "grad_norm": 0.5502462377780205, "learning_rate": 1.8282736317185974e-05, "loss": 0.6797, "step": 1266 }, { "epoch": 0.5708492903807164, "grad_norm": 0.533627314608588, "learning_rate": 1.8280088311480203e-05, "loss": 0.6956, "step": 1267 }, { "epoch": 0.5712998423068258, "grad_norm": 0.5550360263244417, "learning_rate": 1.827743845781933e-05, "loss": 0.6771, "step": 1268 }, { "epoch": 0.5717503942329354, "grad_norm": 0.578034607958336, "learning_rate": 1.8274786756794748e-05, "loss": 0.6734, "step": 1269 }, { "epoch": 0.5722009461590448, "grad_norm": 0.5411285439716634, "learning_rate": 1.827213320899827e-05, "loss": 0.6822, "step": 1270 }, { "epoch": 0.5726514980851544, "grad_norm": 0.6140827292263492, "learning_rate": 1.826947781502211e-05, "loss": 0.6554, "step": 1271 }, { "epoch": 0.5731020500112638, "grad_norm": 0.6336211647920716, "learning_rate": 1.8266820575458908e-05, "loss": 0.6933, "step": 1272 }, { "epoch": 0.5735526019373732, "grad_norm": 0.6388294715944511, "learning_rate": 1.8264161490901706e-05, "loss": 0.7055, "step": 1273 }, { "epoch": 0.5740031538634828, "grad_norm": 0.5767097503631325, "learning_rate": 1.8261500561943956e-05, "loss": 0.6496, "step": 1274 }, { "epoch": 0.5744537057895922, "grad_norm": 0.5776448893674009, "learning_rate": 1.825883778917953e-05, "loss": 0.6443, "step": 1275 }, { "epoch": 0.5749042577157017, "grad_norm": 0.6485254995754011, "learning_rate": 1.8256173173202706e-05, "loss": 0.6633, "step": 1276 }, { "epoch": 0.5753548096418112, "grad_norm": 0.5968274108614874, "learning_rate": 1.8253506714608176e-05, "loss": 0.691, "step": 1277 }, { "epoch": 0.5758053615679207, "grad_norm": 0.5925742241195742, "learning_rate": 1.8250838413991038e-05, "loss": 0.6966, "step": 1278 }, { "epoch": 0.5762559134940302, "grad_norm": 0.5621985952335761, "learning_rate": 1.8248168271946807e-05, "loss": 0.6776, "step": 1279 }, { "epoch": 0.5767064654201397, "grad_norm": 0.5433802805948117, "learning_rate": 1.824549628907141e-05, "loss": 0.6728, "step": 1280 }, { "epoch": 0.5771570173462491, "grad_norm": 0.6232136162667353, "learning_rate": 1.8242822465961177e-05, "loss": 0.7044, "step": 1281 }, { "epoch": 0.5776075692723587, "grad_norm": 0.6182960722856746, "learning_rate": 1.8240146803212854e-05, "loss": 0.7081, "step": 1282 }, { "epoch": 0.5780581211984681, "grad_norm": 0.5418368443910772, "learning_rate": 1.82374693014236e-05, "loss": 0.6687, "step": 1283 }, { "epoch": 0.5785086731245777, "grad_norm": 0.6005446832336038, "learning_rate": 1.823478996119098e-05, "loss": 0.6949, "step": 1284 }, { "epoch": 0.5789592250506871, "grad_norm": 0.592600712274767, "learning_rate": 1.823210878311297e-05, "loss": 0.7005, "step": 1285 }, { "epoch": 0.5794097769767966, "grad_norm": 0.5937000787750846, "learning_rate": 1.8229425767787957e-05, "loss": 0.6518, "step": 1286 }, { "epoch": 0.5798603289029061, "grad_norm": 0.5628124053588943, "learning_rate": 1.822674091581474e-05, "loss": 0.6585, "step": 1287 }, { "epoch": 0.5803108808290155, "grad_norm": 0.6192514870781664, "learning_rate": 1.8224054227792524e-05, "loss": 0.6682, "step": 1288 }, { "epoch": 0.580761432755125, "grad_norm": 0.5564031767703826, "learning_rate": 1.8221365704320925e-05, "loss": 0.696, "step": 1289 }, { "epoch": 0.5812119846812345, "grad_norm": 0.5510342949333025, "learning_rate": 1.8218675345999973e-05, "loss": 0.6874, "step": 1290 }, { "epoch": 0.581662536607344, "grad_norm": 0.5572230311318389, "learning_rate": 1.8215983153430098e-05, "loss": 0.6539, "step": 1291 }, { "epoch": 0.5821130885334534, "grad_norm": 0.5425388843050903, "learning_rate": 1.8213289127212152e-05, "loss": 0.6685, "step": 1292 }, { "epoch": 0.582563640459563, "grad_norm": 0.5706039143073139, "learning_rate": 1.821059326794738e-05, "loss": 0.6782, "step": 1293 }, { "epoch": 0.5830141923856724, "grad_norm": 0.5574318124165233, "learning_rate": 1.8207895576237454e-05, "loss": 0.6654, "step": 1294 }, { "epoch": 0.583464744311782, "grad_norm": 0.5372751593157755, "learning_rate": 1.8205196052684445e-05, "loss": 0.6762, "step": 1295 }, { "epoch": 0.5839152962378914, "grad_norm": 0.597293645232969, "learning_rate": 1.8202494697890833e-05, "loss": 0.6889, "step": 1296 }, { "epoch": 0.584365848164001, "grad_norm": 0.5318288280801419, "learning_rate": 1.8199791512459507e-05, "loss": 0.6654, "step": 1297 }, { "epoch": 0.5848164000901104, "grad_norm": 0.5501609096087681, "learning_rate": 1.8197086496993767e-05, "loss": 0.6477, "step": 1298 }, { "epoch": 0.5852669520162199, "grad_norm": 0.5473959348102192, "learning_rate": 1.819437965209732e-05, "loss": 0.6755, "step": 1299 }, { "epoch": 0.5857175039423294, "grad_norm": 0.5425130047233666, "learning_rate": 1.819167097837428e-05, "loss": 0.7011, "step": 1300 }, { "epoch": 0.5861680558684388, "grad_norm": 0.5713428078350204, "learning_rate": 1.8188960476429174e-05, "loss": 0.7318, "step": 1301 }, { "epoch": 0.5866186077945483, "grad_norm": 0.566109777109985, "learning_rate": 1.8186248146866928e-05, "loss": 0.676, "step": 1302 }, { "epoch": 0.5870691597206578, "grad_norm": 0.5717573249371405, "learning_rate": 1.818353399029288e-05, "loss": 0.6806, "step": 1303 }, { "epoch": 0.5875197116467673, "grad_norm": 0.5221798283800823, "learning_rate": 1.8180818007312788e-05, "loss": 0.6522, "step": 1304 }, { "epoch": 0.5879702635728767, "grad_norm": 0.5498647055949618, "learning_rate": 1.8178100198532798e-05, "loss": 0.6652, "step": 1305 }, { "epoch": 0.5884208154989863, "grad_norm": 0.5723666513850714, "learning_rate": 1.817538056455947e-05, "loss": 0.69, "step": 1306 }, { "epoch": 0.5888713674250957, "grad_norm": 0.5888232032017573, "learning_rate": 1.817265910599978e-05, "loss": 0.6829, "step": 1307 }, { "epoch": 0.5893219193512053, "grad_norm": 0.5486958437180639, "learning_rate": 1.8169935823461107e-05, "loss": 0.6344, "step": 1308 }, { "epoch": 0.5897724712773147, "grad_norm": 0.5158699228302857, "learning_rate": 1.8167210717551224e-05, "loss": 0.6685, "step": 1309 }, { "epoch": 0.5902230232034242, "grad_norm": 0.5508691738216056, "learning_rate": 1.816448378887833e-05, "loss": 0.6626, "step": 1310 }, { "epoch": 0.5906735751295337, "grad_norm": 0.5313839152476695, "learning_rate": 1.816175503805102e-05, "loss": 0.6423, "step": 1311 }, { "epoch": 0.5911241270556432, "grad_norm": 0.5294290248272844, "learning_rate": 1.81590244656783e-05, "loss": 0.6588, "step": 1312 }, { "epoch": 0.5915746789817526, "grad_norm": 0.5203954785130686, "learning_rate": 1.815629207236958e-05, "loss": 0.6547, "step": 1313 }, { "epoch": 0.5920252309078621, "grad_norm": 0.5184050548685957, "learning_rate": 1.8153557858734678e-05, "loss": 0.68, "step": 1314 }, { "epoch": 0.5924757828339716, "grad_norm": 0.515403094155556, "learning_rate": 1.815082182538381e-05, "loss": 0.6641, "step": 1315 }, { "epoch": 0.592926334760081, "grad_norm": 0.5537391336077441, "learning_rate": 1.8148083972927617e-05, "loss": 0.6407, "step": 1316 }, { "epoch": 0.5933768866861906, "grad_norm": 0.538128003974861, "learning_rate": 1.8145344301977126e-05, "loss": 0.7064, "step": 1317 }, { "epoch": 0.5938274386123, "grad_norm": 0.5375580771893473, "learning_rate": 1.8142602813143786e-05, "loss": 0.6729, "step": 1318 }, { "epoch": 0.5942779905384096, "grad_norm": 0.531351799367912, "learning_rate": 1.8139859507039438e-05, "loss": 0.6906, "step": 1319 }, { "epoch": 0.594728542464519, "grad_norm": 0.5296602319905875, "learning_rate": 1.813711438427634e-05, "loss": 0.6617, "step": 1320 }, { "epoch": 0.5951790943906285, "grad_norm": 0.5457855347727515, "learning_rate": 1.813436744546714e-05, "loss": 0.6352, "step": 1321 }, { "epoch": 0.595629646316738, "grad_norm": 0.5610074812986778, "learning_rate": 1.8131618691224916e-05, "loss": 0.664, "step": 1322 }, { "epoch": 0.5960801982428475, "grad_norm": 0.5433936820683016, "learning_rate": 1.8128868122163125e-05, "loss": 0.6733, "step": 1323 }, { "epoch": 0.596530750168957, "grad_norm": 0.5555147125653933, "learning_rate": 1.812611573889564e-05, "loss": 0.65, "step": 1324 }, { "epoch": 0.5969813020950665, "grad_norm": 0.5463968000900136, "learning_rate": 1.812336154203675e-05, "loss": 0.7033, "step": 1325 }, { "epoch": 0.5974318540211759, "grad_norm": 0.5690911979313239, "learning_rate": 1.8120605532201132e-05, "loss": 0.7002, "step": 1326 }, { "epoch": 0.5978824059472855, "grad_norm": 0.5252621376205834, "learning_rate": 1.811784771000387e-05, "loss": 0.6604, "step": 1327 }, { "epoch": 0.5983329578733949, "grad_norm": 0.559777606611427, "learning_rate": 1.8115088076060465e-05, "loss": 0.696, "step": 1328 }, { "epoch": 0.5987835097995043, "grad_norm": 0.5313602586101225, "learning_rate": 1.811232663098681e-05, "loss": 0.6478, "step": 1329 }, { "epoch": 0.5992340617256139, "grad_norm": 0.5326902222899041, "learning_rate": 1.81095633753992e-05, "loss": 0.6961, "step": 1330 }, { "epoch": 0.5996846136517233, "grad_norm": 0.582801887328382, "learning_rate": 1.810679830991435e-05, "loss": 0.6533, "step": 1331 }, { "epoch": 0.6001351655778329, "grad_norm": 0.5337000012627867, "learning_rate": 1.8104031435149366e-05, "loss": 0.652, "step": 1332 }, { "epoch": 0.6005857175039423, "grad_norm": 0.546293143889847, "learning_rate": 1.810126275172176e-05, "loss": 0.6569, "step": 1333 }, { "epoch": 0.6010362694300518, "grad_norm": 0.5458228424786987, "learning_rate": 1.809849226024944e-05, "loss": 0.6738, "step": 1334 }, { "epoch": 0.6014868213561613, "grad_norm": 0.5548661179267539, "learning_rate": 1.8095719961350742e-05, "loss": 0.7227, "step": 1335 }, { "epoch": 0.6019373732822708, "grad_norm": 0.5801048628516993, "learning_rate": 1.8092945855644376e-05, "loss": 0.6281, "step": 1336 }, { "epoch": 0.6023879252083802, "grad_norm": 0.5486092641991989, "learning_rate": 1.8090169943749477e-05, "loss": 0.689, "step": 1337 }, { "epoch": 0.6028384771344898, "grad_norm": 0.5836824157521479, "learning_rate": 1.8087392226285567e-05, "loss": 0.6733, "step": 1338 }, { "epoch": 0.6032890290605992, "grad_norm": 0.5787749227357271, "learning_rate": 1.8084612703872588e-05, "loss": 0.684, "step": 1339 }, { "epoch": 0.6037395809867088, "grad_norm": 0.5480931799452387, "learning_rate": 1.8081831377130867e-05, "loss": 0.6316, "step": 1340 }, { "epoch": 0.6041901329128182, "grad_norm": 0.5834993617626774, "learning_rate": 1.8079048246681147e-05, "loss": 0.7279, "step": 1341 }, { "epoch": 0.6046406848389276, "grad_norm": 0.5492904936647202, "learning_rate": 1.8076263313144568e-05, "loss": 0.6845, "step": 1342 }, { "epoch": 0.6050912367650372, "grad_norm": 0.5180915196753001, "learning_rate": 1.8073476577142673e-05, "loss": 0.6764, "step": 1343 }, { "epoch": 0.6055417886911466, "grad_norm": 0.552093786067384, "learning_rate": 1.8070688039297403e-05, "loss": 0.6768, "step": 1344 }, { "epoch": 0.6059923406172562, "grad_norm": 0.5197295535095373, "learning_rate": 1.8067897700231115e-05, "loss": 0.6752, "step": 1345 }, { "epoch": 0.6064428925433656, "grad_norm": 0.5269714829074844, "learning_rate": 1.8065105560566548e-05, "loss": 0.6314, "step": 1346 }, { "epoch": 0.6068934444694751, "grad_norm": 0.5818221020467479, "learning_rate": 1.806231162092686e-05, "loss": 0.6978, "step": 1347 }, { "epoch": 0.6073439963955846, "grad_norm": 0.5257179179173683, "learning_rate": 1.8059515881935604e-05, "loss": 0.6715, "step": 1348 }, { "epoch": 0.6077945483216941, "grad_norm": 0.5441524355749587, "learning_rate": 1.8056718344216736e-05, "loss": 0.6744, "step": 1349 }, { "epoch": 0.6082451002478035, "grad_norm": 0.5259651250521976, "learning_rate": 1.8053919008394603e-05, "loss": 0.6666, "step": 1350 }, { "epoch": 0.6086956521739131, "grad_norm": 0.5229653833460537, "learning_rate": 1.8051117875093974e-05, "loss": 0.6846, "step": 1351 }, { "epoch": 0.6091462041000225, "grad_norm": 0.5336598428619196, "learning_rate": 1.804831494494e-05, "loss": 0.6955, "step": 1352 }, { "epoch": 0.6095967560261321, "grad_norm": 0.5167924170447594, "learning_rate": 1.8045510218558244e-05, "loss": 0.6495, "step": 1353 }, { "epoch": 0.6100473079522415, "grad_norm": 0.5386657272438679, "learning_rate": 1.8042703696574665e-05, "loss": 0.6861, "step": 1354 }, { "epoch": 0.610497859878351, "grad_norm": 0.5137292434485625, "learning_rate": 1.8039895379615626e-05, "loss": 0.6348, "step": 1355 }, { "epoch": 0.6109484118044605, "grad_norm": 0.5145857312730241, "learning_rate": 1.8037085268307887e-05, "loss": 0.6676, "step": 1356 }, { "epoch": 0.6113989637305699, "grad_norm": 0.5489389120442745, "learning_rate": 1.8034273363278615e-05, "loss": 0.6367, "step": 1357 }, { "epoch": 0.6118495156566794, "grad_norm": 0.5074793809651635, "learning_rate": 1.8031459665155363e-05, "loss": 0.6692, "step": 1358 }, { "epoch": 0.6123000675827889, "grad_norm": 0.5366276182269417, "learning_rate": 1.8028644174566103e-05, "loss": 0.6337, "step": 1359 }, { "epoch": 0.6127506195088984, "grad_norm": 0.5359066217339049, "learning_rate": 1.8025826892139194e-05, "loss": 0.668, "step": 1360 }, { "epoch": 0.6132011714350079, "grad_norm": 0.5221691239353264, "learning_rate": 1.80230078185034e-05, "loss": 0.6452, "step": 1361 }, { "epoch": 0.6136517233611174, "grad_norm": 0.5371889191273655, "learning_rate": 1.8020186954287883e-05, "loss": 0.6781, "step": 1362 }, { "epoch": 0.6141022752872268, "grad_norm": 0.5237855204097461, "learning_rate": 1.8017364300122204e-05, "loss": 0.6413, "step": 1363 }, { "epoch": 0.6145528272133364, "grad_norm": 0.5368832041895196, "learning_rate": 1.8014539856636327e-05, "loss": 0.7117, "step": 1364 }, { "epoch": 0.6150033791394458, "grad_norm": 0.5027800403589736, "learning_rate": 1.8011713624460608e-05, "loss": 0.6796, "step": 1365 }, { "epoch": 0.6154539310655553, "grad_norm": 0.5451156915626936, "learning_rate": 1.8008885604225815e-05, "loss": 0.6956, "step": 1366 }, { "epoch": 0.6159044829916648, "grad_norm": 0.5562553202914334, "learning_rate": 1.8006055796563103e-05, "loss": 0.6787, "step": 1367 }, { "epoch": 0.6163550349177743, "grad_norm": 0.5453573940235691, "learning_rate": 1.800322420210403e-05, "loss": 0.689, "step": 1368 }, { "epoch": 0.6168055868438838, "grad_norm": 0.5335447719368624, "learning_rate": 1.8000390821480552e-05, "loss": 0.6808, "step": 1369 }, { "epoch": 0.6172561387699932, "grad_norm": 0.5908717153408354, "learning_rate": 1.799755565532503e-05, "loss": 0.6836, "step": 1370 }, { "epoch": 0.6177066906961027, "grad_norm": 0.5401520873637377, "learning_rate": 1.7994718704270208e-05, "loss": 0.6853, "step": 1371 }, { "epoch": 0.6181572426222122, "grad_norm": 0.5226460175605909, "learning_rate": 1.7991879968949248e-05, "loss": 0.7014, "step": 1372 }, { "epoch": 0.6186077945483217, "grad_norm": 0.5496829135117876, "learning_rate": 1.79890394499957e-05, "loss": 0.6459, "step": 1373 }, { "epoch": 0.6190583464744311, "grad_norm": 0.5676181808594614, "learning_rate": 1.7986197148043506e-05, "loss": 0.6441, "step": 1374 }, { "epoch": 0.6195088984005407, "grad_norm": 0.5681772563888243, "learning_rate": 1.7983353063727014e-05, "loss": 0.6832, "step": 1375 }, { "epoch": 0.6199594503266501, "grad_norm": 0.5547687213641709, "learning_rate": 1.7980507197680977e-05, "loss": 0.6708, "step": 1376 }, { "epoch": 0.6204100022527597, "grad_norm": 0.5639005325517856, "learning_rate": 1.797765955054053e-05, "loss": 0.6633, "step": 1377 }, { "epoch": 0.6208605541788691, "grad_norm": 0.5232281945426899, "learning_rate": 1.7974810122941208e-05, "loss": 0.6703, "step": 1378 }, { "epoch": 0.6213111061049786, "grad_norm": 0.5592078982389969, "learning_rate": 1.797195891551896e-05, "loss": 0.6492, "step": 1379 }, { "epoch": 0.6217616580310881, "grad_norm": 0.5102332123588654, "learning_rate": 1.796910592891011e-05, "loss": 0.6686, "step": 1380 }, { "epoch": 0.6222122099571976, "grad_norm": 0.56205599842483, "learning_rate": 1.7966251163751395e-05, "loss": 0.6939, "step": 1381 }, { "epoch": 0.622662761883307, "grad_norm": 0.5517324920513873, "learning_rate": 1.7963394620679945e-05, "loss": 0.6462, "step": 1382 }, { "epoch": 0.6231133138094165, "grad_norm": 0.5583530609023029, "learning_rate": 1.7960536300333275e-05, "loss": 0.6808, "step": 1383 }, { "epoch": 0.623563865735526, "grad_norm": 0.5360774050902728, "learning_rate": 1.7957676203349317e-05, "loss": 0.6565, "step": 1384 }, { "epoch": 0.6240144176616355, "grad_norm": 0.5973186689585207, "learning_rate": 1.7954814330366385e-05, "loss": 0.6871, "step": 1385 }, { "epoch": 0.624464969587745, "grad_norm": 0.5808769389752444, "learning_rate": 1.795195068202319e-05, "loss": 0.6755, "step": 1386 }, { "epoch": 0.6249155215138544, "grad_norm": 0.5295825827426017, "learning_rate": 1.7949085258958853e-05, "loss": 0.6787, "step": 1387 }, { "epoch": 0.625366073439964, "grad_norm": 0.5614372767706864, "learning_rate": 1.7946218061812867e-05, "loss": 0.6995, "step": 1388 }, { "epoch": 0.6258166253660734, "grad_norm": 0.538391237611238, "learning_rate": 1.794334909122515e-05, "loss": 0.6633, "step": 1389 }, { "epoch": 0.626267177292183, "grad_norm": 0.5468139807375855, "learning_rate": 1.7940478347835986e-05, "loss": 0.6987, "step": 1390 }, { "epoch": 0.6267177292182924, "grad_norm": 0.5495236800472321, "learning_rate": 1.793760583228608e-05, "loss": 0.6888, "step": 1391 }, { "epoch": 0.6271682811444019, "grad_norm": 0.5176457640490245, "learning_rate": 1.7934731545216515e-05, "loss": 0.6534, "step": 1392 }, { "epoch": 0.6276188330705114, "grad_norm": 0.5678908975909329, "learning_rate": 1.793185548726878e-05, "loss": 0.6953, "step": 1393 }, { "epoch": 0.6280693849966209, "grad_norm": 0.538573020516702, "learning_rate": 1.7928977659084753e-05, "loss": 0.6763, "step": 1394 }, { "epoch": 0.6285199369227303, "grad_norm": 0.5105447311021557, "learning_rate": 1.7926098061306712e-05, "loss": 0.673, "step": 1395 }, { "epoch": 0.6289704888488399, "grad_norm": 0.53995794830331, "learning_rate": 1.792321669457733e-05, "loss": 0.6712, "step": 1396 }, { "epoch": 0.6294210407749493, "grad_norm": 0.5469926777271302, "learning_rate": 1.792033355953966e-05, "loss": 0.6632, "step": 1397 }, { "epoch": 0.6298715927010587, "grad_norm": 0.5667055367991765, "learning_rate": 1.7917448656837174e-05, "loss": 0.6766, "step": 1398 }, { "epoch": 0.6303221446271683, "grad_norm": 0.5569267412719471, "learning_rate": 1.7914561987113724e-05, "loss": 0.6715, "step": 1399 }, { "epoch": 0.6307726965532777, "grad_norm": 0.527355273324448, "learning_rate": 1.7911673551013553e-05, "loss": 0.6367, "step": 1400 }, { "epoch": 0.6312232484793873, "grad_norm": 0.5945361870713366, "learning_rate": 1.790878334918131e-05, "loss": 0.6938, "step": 1401 }, { "epoch": 0.6316738004054967, "grad_norm": 0.5327742583514121, "learning_rate": 1.790589138226203e-05, "loss": 0.6623, "step": 1402 }, { "epoch": 0.6321243523316062, "grad_norm": 0.5381268851903158, "learning_rate": 1.7902997650901147e-05, "loss": 0.6632, "step": 1403 }, { "epoch": 0.6325749042577157, "grad_norm": 0.5386414467849202, "learning_rate": 1.790010215574448e-05, "loss": 0.6738, "step": 1404 }, { "epoch": 0.6330254561838252, "grad_norm": 0.5504311097489355, "learning_rate": 1.7897204897438248e-05, "loss": 0.6702, "step": 1405 }, { "epoch": 0.6334760081099347, "grad_norm": 0.5359287203146235, "learning_rate": 1.7894305876629064e-05, "loss": 0.676, "step": 1406 }, { "epoch": 0.6339265600360442, "grad_norm": 0.5783811363328815, "learning_rate": 1.789140509396394e-05, "loss": 0.6567, "step": 1407 }, { "epoch": 0.6343771119621536, "grad_norm": 0.5639019844704268, "learning_rate": 1.7888502550090262e-05, "loss": 0.6458, "step": 1408 }, { "epoch": 0.6348276638882632, "grad_norm": 0.6057203887345539, "learning_rate": 1.788559824565583e-05, "loss": 0.6757, "step": 1409 }, { "epoch": 0.6352782158143726, "grad_norm": 0.5312439352267188, "learning_rate": 1.7882692181308824e-05, "loss": 0.6513, "step": 1410 }, { "epoch": 0.635728767740482, "grad_norm": 0.5548451565473141, "learning_rate": 1.787978435769783e-05, "loss": 0.6792, "step": 1411 }, { "epoch": 0.6361793196665916, "grad_norm": 0.5837618888055784, "learning_rate": 1.7876874775471806e-05, "loss": 0.6959, "step": 1412 }, { "epoch": 0.636629871592701, "grad_norm": 0.5674676229825648, "learning_rate": 1.7873963435280122e-05, "loss": 0.6776, "step": 1413 }, { "epoch": 0.6370804235188106, "grad_norm": 0.5427461542582359, "learning_rate": 1.7871050337772527e-05, "loss": 0.6811, "step": 1414 }, { "epoch": 0.63753097544492, "grad_norm": 0.5800108375349189, "learning_rate": 1.7868135483599175e-05, "loss": 0.6906, "step": 1415 }, { "epoch": 0.6379815273710295, "grad_norm": 0.5527470874252767, "learning_rate": 1.78652188734106e-05, "loss": 0.7163, "step": 1416 }, { "epoch": 0.638432079297139, "grad_norm": 0.5763978863252693, "learning_rate": 1.7862300507857733e-05, "loss": 0.6991, "step": 1417 }, { "epoch": 0.6388826312232485, "grad_norm": 0.5015732875137796, "learning_rate": 1.7859380387591895e-05, "loss": 0.6651, "step": 1418 }, { "epoch": 0.6393331831493579, "grad_norm": 0.5830177720115315, "learning_rate": 1.7856458513264808e-05, "loss": 0.6614, "step": 1419 }, { "epoch": 0.6397837350754675, "grad_norm": 0.5350800210781717, "learning_rate": 1.7853534885528566e-05, "loss": 0.6494, "step": 1420 }, { "epoch": 0.6402342870015769, "grad_norm": 0.5502138095266602, "learning_rate": 1.785060950503568e-05, "loss": 0.635, "step": 1421 }, { "epoch": 0.6406848389276865, "grad_norm": 0.5662041346900767, "learning_rate": 1.7847682372439024e-05, "loss": 0.7184, "step": 1422 }, { "epoch": 0.6411353908537959, "grad_norm": 0.5526941354549036, "learning_rate": 1.784475348839189e-05, "loss": 0.6759, "step": 1423 }, { "epoch": 0.6415859427799054, "grad_norm": 0.5537159186404946, "learning_rate": 1.7841822853547934e-05, "loss": 0.6673, "step": 1424 }, { "epoch": 0.6420364947060149, "grad_norm": 0.5736775506833716, "learning_rate": 1.783889046856123e-05, "loss": 0.6756, "step": 1425 }, { "epoch": 0.6424870466321243, "grad_norm": 0.5762956845604579, "learning_rate": 1.7835956334086223e-05, "loss": 0.6507, "step": 1426 }, { "epoch": 0.6429375985582338, "grad_norm": 0.6203271539495478, "learning_rate": 1.7833020450777756e-05, "loss": 0.6768, "step": 1427 }, { "epoch": 0.6433881504843433, "grad_norm": 0.6322610194730355, "learning_rate": 1.783008281929106e-05, "loss": 0.7041, "step": 1428 }, { "epoch": 0.6438387024104528, "grad_norm": 0.5698542201194561, "learning_rate": 1.782714344028176e-05, "loss": 0.6868, "step": 1429 }, { "epoch": 0.6442892543365623, "grad_norm": 0.5860752858180872, "learning_rate": 1.7824202314405867e-05, "loss": 0.6568, "step": 1430 }, { "epoch": 0.6447398062626718, "grad_norm": 0.6038657154402928, "learning_rate": 1.782125944231978e-05, "loss": 0.6383, "step": 1431 }, { "epoch": 0.6451903581887812, "grad_norm": 0.5349056954989897, "learning_rate": 1.78183148246803e-05, "loss": 0.6819, "step": 1432 }, { "epoch": 0.6456409101148908, "grad_norm": 0.5931001339588224, "learning_rate": 1.78153684621446e-05, "loss": 0.6954, "step": 1433 }, { "epoch": 0.6460914620410002, "grad_norm": 0.5562038320349005, "learning_rate": 1.781242035537025e-05, "loss": 0.698, "step": 1434 }, { "epoch": 0.6465420139671098, "grad_norm": 0.5456867260912841, "learning_rate": 1.780947050501522e-05, "loss": 0.6788, "step": 1435 }, { "epoch": 0.6469925658932192, "grad_norm": 0.5511568583811647, "learning_rate": 1.780651891173785e-05, "loss": 0.6406, "step": 1436 }, { "epoch": 0.6474431178193287, "grad_norm": 0.5771472159955278, "learning_rate": 1.7803565576196884e-05, "loss": 0.7032, "step": 1437 }, { "epoch": 0.6478936697454382, "grad_norm": 0.6111025002890262, "learning_rate": 1.7800610499051444e-05, "loss": 0.6867, "step": 1438 }, { "epoch": 0.6483442216715476, "grad_norm": 0.58091440651752, "learning_rate": 1.779765368096105e-05, "loss": 0.6731, "step": 1439 }, { "epoch": 0.6487947735976571, "grad_norm": 0.5716957028915509, "learning_rate": 1.7794695122585607e-05, "loss": 0.6541, "step": 1440 }, { "epoch": 0.6492453255237666, "grad_norm": 0.5699550392405598, "learning_rate": 1.7791734824585406e-05, "loss": 0.6891, "step": 1441 }, { "epoch": 0.6496958774498761, "grad_norm": 0.5404584217300871, "learning_rate": 1.7788772787621126e-05, "loss": 0.6957, "step": 1442 }, { "epoch": 0.6501464293759855, "grad_norm": 0.570877811368172, "learning_rate": 1.778580901235384e-05, "loss": 0.6877, "step": 1443 }, { "epoch": 0.6505969813020951, "grad_norm": 0.548171775989779, "learning_rate": 1.7782843499445e-05, "loss": 0.6958, "step": 1444 }, { "epoch": 0.6510475332282045, "grad_norm": 0.5400662956306683, "learning_rate": 1.777987624955646e-05, "loss": 0.64, "step": 1445 }, { "epoch": 0.6514980851543141, "grad_norm": 0.5098112827772788, "learning_rate": 1.7776907263350446e-05, "loss": 0.6789, "step": 1446 }, { "epoch": 0.6519486370804235, "grad_norm": 0.5643140076411525, "learning_rate": 1.7773936541489577e-05, "loss": 0.6544, "step": 1447 }, { "epoch": 0.652399189006533, "grad_norm": 0.5734227506859498, "learning_rate": 1.777096408463686e-05, "loss": 0.6471, "step": 1448 }, { "epoch": 0.6528497409326425, "grad_norm": 0.5401323076013235, "learning_rate": 1.7767989893455696e-05, "loss": 0.6707, "step": 1449 }, { "epoch": 0.653300292858752, "grad_norm": 0.5946621638219726, "learning_rate": 1.7765013968609865e-05, "loss": 0.6702, "step": 1450 }, { "epoch": 0.6537508447848615, "grad_norm": 0.5461923111607266, "learning_rate": 1.7762036310763533e-05, "loss": 0.6485, "step": 1451 }, { "epoch": 0.654201396710971, "grad_norm": 0.6151754720974275, "learning_rate": 1.7759056920581256e-05, "loss": 0.6967, "step": 1452 }, { "epoch": 0.6546519486370804, "grad_norm": 0.50994550162697, "learning_rate": 1.7756075798727975e-05, "loss": 0.6776, "step": 1453 }, { "epoch": 0.6551025005631899, "grad_norm": 0.5952621086618382, "learning_rate": 1.7753092945869023e-05, "loss": 0.6683, "step": 1454 }, { "epoch": 0.6555530524892994, "grad_norm": 0.5444076971337918, "learning_rate": 1.775010836267011e-05, "loss": 0.676, "step": 1455 }, { "epoch": 0.6560036044154088, "grad_norm": 0.5471837494536251, "learning_rate": 1.7747122049797336e-05, "loss": 0.6573, "step": 1456 }, { "epoch": 0.6564541563415184, "grad_norm": 0.6057587206237397, "learning_rate": 1.7744134007917195e-05, "loss": 0.6665, "step": 1457 }, { "epoch": 0.6569047082676278, "grad_norm": 0.5145435133491917, "learning_rate": 1.7741144237696556e-05, "loss": 0.6718, "step": 1458 }, { "epoch": 0.6573552601937374, "grad_norm": 0.576597086965957, "learning_rate": 1.7738152739802677e-05, "loss": 0.6229, "step": 1459 }, { "epoch": 0.6578058121198468, "grad_norm": 0.5434890427964651, "learning_rate": 1.77351595149032e-05, "loss": 0.6957, "step": 1460 }, { "epoch": 0.6582563640459563, "grad_norm": 0.5399808689426286, "learning_rate": 1.7732164563666163e-05, "loss": 0.6944, "step": 1461 }, { "epoch": 0.6587069159720658, "grad_norm": 0.5395090218465035, "learning_rate": 1.7729167886759974e-05, "loss": 0.6424, "step": 1462 }, { "epoch": 0.6591574678981753, "grad_norm": 0.5287472352233292, "learning_rate": 1.7726169484853438e-05, "loss": 0.6958, "step": 1463 }, { "epoch": 0.6596080198242847, "grad_norm": 0.5215527690959073, "learning_rate": 1.7723169358615734e-05, "loss": 0.6454, "step": 1464 }, { "epoch": 0.6600585717503943, "grad_norm": 0.6064788870589527, "learning_rate": 1.7720167508716435e-05, "loss": 0.686, "step": 1465 }, { "epoch": 0.6605091236765037, "grad_norm": 0.5714637627205276, "learning_rate": 1.77171639358255e-05, "loss": 0.6768, "step": 1466 }, { "epoch": 0.6609596756026132, "grad_norm": 0.5417682089740229, "learning_rate": 1.771415864061326e-05, "loss": 0.6732, "step": 1467 }, { "epoch": 0.6614102275287227, "grad_norm": 0.5384739233781508, "learning_rate": 1.771115162375044e-05, "loss": 0.6807, "step": 1468 }, { "epoch": 0.6618607794548321, "grad_norm": 0.568249123570642, "learning_rate": 1.7708142885908157e-05, "loss": 0.6566, "step": 1469 }, { "epoch": 0.6623113313809417, "grad_norm": 0.5470897999450784, "learning_rate": 1.7705132427757895e-05, "loss": 0.6624, "step": 1470 }, { "epoch": 0.6627618833070511, "grad_norm": 0.5500798735316088, "learning_rate": 1.7702120249971528e-05, "loss": 0.675, "step": 1471 }, { "epoch": 0.6632124352331606, "grad_norm": 0.6052577353218541, "learning_rate": 1.7699106353221322e-05, "loss": 0.6754, "step": 1472 }, { "epoch": 0.6636629871592701, "grad_norm": 0.5147290921096986, "learning_rate": 1.7696090738179917e-05, "loss": 0.6643, "step": 1473 }, { "epoch": 0.6641135390853796, "grad_norm": 0.5731979542971716, "learning_rate": 1.769307340552034e-05, "loss": 0.6554, "step": 1474 }, { "epoch": 0.664564091011489, "grad_norm": 0.5335299834823467, "learning_rate": 1.7690054355916e-05, "loss": 0.6851, "step": 1475 }, { "epoch": 0.6650146429375986, "grad_norm": 0.5357305130860749, "learning_rate": 1.7687033590040693e-05, "loss": 0.6804, "step": 1476 }, { "epoch": 0.665465194863708, "grad_norm": 0.5335478571838002, "learning_rate": 1.7684011108568593e-05, "loss": 0.6959, "step": 1477 }, { "epoch": 0.6659157467898176, "grad_norm": 0.5615722793886967, "learning_rate": 1.7680986912174257e-05, "loss": 0.6699, "step": 1478 }, { "epoch": 0.666366298715927, "grad_norm": 0.5675036441671247, "learning_rate": 1.7677961001532634e-05, "loss": 0.6707, "step": 1479 }, { "epoch": 0.6668168506420364, "grad_norm": 0.5866641031914556, "learning_rate": 1.7674933377319042e-05, "loss": 0.6718, "step": 1480 }, { "epoch": 0.667267402568146, "grad_norm": 0.5516253056959686, "learning_rate": 1.7671904040209196e-05, "loss": 0.6757, "step": 1481 }, { "epoch": 0.6677179544942554, "grad_norm": 0.582962580932435, "learning_rate": 1.7668872990879175e-05, "loss": 0.6877, "step": 1482 }, { "epoch": 0.668168506420365, "grad_norm": 0.5612202165821252, "learning_rate": 1.7665840230005457e-05, "loss": 0.6997, "step": 1483 }, { "epoch": 0.6686190583464744, "grad_norm": 0.568829854047303, "learning_rate": 1.7662805758264894e-05, "loss": 0.6355, "step": 1484 }, { "epoch": 0.6690696102725839, "grad_norm": 0.5780524592750271, "learning_rate": 1.765976957633472e-05, "loss": 0.6977, "step": 1485 }, { "epoch": 0.6695201621986934, "grad_norm": 0.5424851680718032, "learning_rate": 1.7656731684892553e-05, "loss": 0.6247, "step": 1486 }, { "epoch": 0.6699707141248029, "grad_norm": 0.6156167409033322, "learning_rate": 1.765369208461639e-05, "loss": 0.6897, "step": 1487 }, { "epoch": 0.6704212660509123, "grad_norm": 0.5719646375287151, "learning_rate": 1.7650650776184615e-05, "loss": 0.7026, "step": 1488 }, { "epoch": 0.6708718179770219, "grad_norm": 0.6195924666317099, "learning_rate": 1.7647607760275987e-05, "loss": 0.6538, "step": 1489 }, { "epoch": 0.6713223699031313, "grad_norm": 0.5848151764181155, "learning_rate": 1.764456303756964e-05, "loss": 0.6486, "step": 1490 }, { "epoch": 0.6717729218292409, "grad_norm": 0.5648059784970519, "learning_rate": 1.7641516608745114e-05, "loss": 0.6648, "step": 1491 }, { "epoch": 0.6722234737553503, "grad_norm": 0.5723731031621438, "learning_rate": 1.7638468474482297e-05, "loss": 0.6439, "step": 1492 }, { "epoch": 0.6726740256814598, "grad_norm": 0.566647160480705, "learning_rate": 1.763541863546148e-05, "loss": 0.666, "step": 1493 }, { "epoch": 0.6731245776075693, "grad_norm": 0.5743294346994388, "learning_rate": 1.7632367092363333e-05, "loss": 0.6811, "step": 1494 }, { "epoch": 0.6735751295336787, "grad_norm": 0.5540267186080886, "learning_rate": 1.762931384586889e-05, "loss": 0.6434, "step": 1495 }, { "epoch": 0.6740256814597883, "grad_norm": 0.6434033124365902, "learning_rate": 1.7626258896659584e-05, "loss": 0.7275, "step": 1496 }, { "epoch": 0.6744762333858977, "grad_norm": 0.5652368015157891, "learning_rate": 1.762320224541722e-05, "loss": 0.6934, "step": 1497 }, { "epoch": 0.6749267853120072, "grad_norm": 0.611620928650808, "learning_rate": 1.7620143892823977e-05, "loss": 0.656, "step": 1498 }, { "epoch": 0.6753773372381167, "grad_norm": 0.5431647901116993, "learning_rate": 1.761708383956243e-05, "loss": 0.677, "step": 1499 }, { "epoch": 0.6758278891642262, "grad_norm": 0.6017411322262288, "learning_rate": 1.7614022086315515e-05, "loss": 0.6805, "step": 1500 }, { "epoch": 0.6762784410903356, "grad_norm": 0.5557198576615173, "learning_rate": 1.761095863376656e-05, "loss": 0.636, "step": 1501 }, { "epoch": 0.6767289930164452, "grad_norm": 0.6526059690714968, "learning_rate": 1.760789348259927e-05, "loss": 0.6451, "step": 1502 }, { "epoch": 0.6771795449425546, "grad_norm": 0.575912952114641, "learning_rate": 1.7604826633497722e-05, "loss": 0.7059, "step": 1503 }, { "epoch": 0.6776300968686642, "grad_norm": 0.593997438488782, "learning_rate": 1.7601758087146385e-05, "loss": 0.6604, "step": 1504 }, { "epoch": 0.6780806487947736, "grad_norm": 0.6110756643680259, "learning_rate": 1.759868784423009e-05, "loss": 0.6767, "step": 1505 }, { "epoch": 0.6785312007208831, "grad_norm": 0.5612544928421378, "learning_rate": 1.7595615905434055e-05, "loss": 0.6629, "step": 1506 }, { "epoch": 0.6789817526469926, "grad_norm": 0.547427162630966, "learning_rate": 1.7592542271443888e-05, "loss": 0.6664, "step": 1507 }, { "epoch": 0.679432304573102, "grad_norm": 0.5853886161964915, "learning_rate": 1.7589466942945556e-05, "loss": 0.6543, "step": 1508 }, { "epoch": 0.6798828564992115, "grad_norm": 0.5342795634785458, "learning_rate": 1.7586389920625414e-05, "loss": 0.7029, "step": 1509 }, { "epoch": 0.680333408425321, "grad_norm": 0.561390364643965, "learning_rate": 1.75833112051702e-05, "loss": 0.636, "step": 1510 }, { "epoch": 0.6807839603514305, "grad_norm": 0.5656105681671725, "learning_rate": 1.7580230797267014e-05, "loss": 0.6348, "step": 1511 }, { "epoch": 0.68123451227754, "grad_norm": 0.5405922473963755, "learning_rate": 1.757714869760335e-05, "loss": 0.7184, "step": 1512 }, { "epoch": 0.6816850642036495, "grad_norm": 0.5528325413243527, "learning_rate": 1.7574064906867067e-05, "loss": 0.6961, "step": 1513 }, { "epoch": 0.6821356161297589, "grad_norm": 0.5357777024054658, "learning_rate": 1.7570979425746414e-05, "loss": 0.6517, "step": 1514 }, { "epoch": 0.6825861680558685, "grad_norm": 0.5312617176381719, "learning_rate": 1.7567892254930005e-05, "loss": 0.6774, "step": 1515 }, { "epoch": 0.6830367199819779, "grad_norm": 0.5776270499363553, "learning_rate": 1.756480339510684e-05, "loss": 0.6418, "step": 1516 }, { "epoch": 0.6834872719080874, "grad_norm": 0.5631566702999786, "learning_rate": 1.756171284696629e-05, "loss": 0.6627, "step": 1517 }, { "epoch": 0.6839378238341969, "grad_norm": 0.5721357746379692, "learning_rate": 1.7558620611198107e-05, "loss": 0.6716, "step": 1518 }, { "epoch": 0.6843883757603064, "grad_norm": 0.5158586865154773, "learning_rate": 1.7555526688492418e-05, "loss": 0.6538, "step": 1519 }, { "epoch": 0.6848389276864159, "grad_norm": 0.5198038920314204, "learning_rate": 1.755243107953973e-05, "loss": 0.6492, "step": 1520 }, { "epoch": 0.6852894796125254, "grad_norm": 0.556090727413734, "learning_rate": 1.7549333785030917e-05, "loss": 0.6739, "step": 1521 }, { "epoch": 0.6857400315386348, "grad_norm": 0.5448966986530638, "learning_rate": 1.7546234805657235e-05, "loss": 0.6706, "step": 1522 }, { "epoch": 0.6861905834647443, "grad_norm": 0.5246514875976219, "learning_rate": 1.754313414211032e-05, "loss": 0.6684, "step": 1523 }, { "epoch": 0.6866411353908538, "grad_norm": 0.5151164298130605, "learning_rate": 1.754003179508218e-05, "loss": 0.6449, "step": 1524 }, { "epoch": 0.6870916873169632, "grad_norm": 0.5601210306833664, "learning_rate": 1.7536927765265196e-05, "loss": 0.7088, "step": 1525 }, { "epoch": 0.6875422392430728, "grad_norm": 0.5255280448374359, "learning_rate": 1.7533822053352127e-05, "loss": 0.682, "step": 1526 }, { "epoch": 0.6879927911691822, "grad_norm": 0.5273700193404584, "learning_rate": 1.7530714660036112e-05, "loss": 0.6906, "step": 1527 }, { "epoch": 0.6884433430952918, "grad_norm": 0.581018747074027, "learning_rate": 1.7527605586010653e-05, "loss": 0.7096, "step": 1528 }, { "epoch": 0.6888938950214012, "grad_norm": 0.505420087667638, "learning_rate": 1.7524494831969647e-05, "loss": 0.6619, "step": 1529 }, { "epoch": 0.6893444469475107, "grad_norm": 0.5540711859910656, "learning_rate": 1.752138239860734e-05, "loss": 0.6697, "step": 1530 }, { "epoch": 0.6897949988736202, "grad_norm": 0.5480913379707304, "learning_rate": 1.751826828661838e-05, "loss": 0.6704, "step": 1531 }, { "epoch": 0.6902455507997297, "grad_norm": 0.5479560183943342, "learning_rate": 1.7515152496697765e-05, "loss": 0.6543, "step": 1532 }, { "epoch": 0.6906961027258391, "grad_norm": 0.5443301439926028, "learning_rate": 1.7512035029540887e-05, "loss": 0.6856, "step": 1533 }, { "epoch": 0.6911466546519487, "grad_norm": 0.5648344806135326, "learning_rate": 1.7508915885843498e-05, "loss": 0.6862, "step": 1534 }, { "epoch": 0.6915972065780581, "grad_norm": 0.5245903696926669, "learning_rate": 1.7505795066301735e-05, "loss": 0.7209, "step": 1535 }, { "epoch": 0.6920477585041676, "grad_norm": 0.5443269933058138, "learning_rate": 1.75026725716121e-05, "loss": 0.6636, "step": 1536 }, { "epoch": 0.6924983104302771, "grad_norm": 0.5563302698564019, "learning_rate": 1.749954840247148e-05, "loss": 0.669, "step": 1537 }, { "epoch": 0.6929488623563865, "grad_norm": 0.5187739195358232, "learning_rate": 1.7496422559577125e-05, "loss": 0.6237, "step": 1538 }, { "epoch": 0.6933994142824961, "grad_norm": 0.5410319223074812, "learning_rate": 1.7493295043626663e-05, "loss": 0.6745, "step": 1539 }, { "epoch": 0.6938499662086055, "grad_norm": 0.5429635943403543, "learning_rate": 1.7490165855318097e-05, "loss": 0.6796, "step": 1540 }, { "epoch": 0.694300518134715, "grad_norm": 0.554560886425628, "learning_rate": 1.7487034995349792e-05, "loss": 0.6704, "step": 1541 }, { "epoch": 0.6947510700608245, "grad_norm": 0.5263675023366869, "learning_rate": 1.7483902464420507e-05, "loss": 0.6691, "step": 1542 }, { "epoch": 0.695201621986934, "grad_norm": 0.5293807449598258, "learning_rate": 1.7480768263229352e-05, "loss": 0.685, "step": 1543 }, { "epoch": 0.6956521739130435, "grad_norm": 0.553421672328301, "learning_rate": 1.7477632392475827e-05, "loss": 0.6291, "step": 1544 }, { "epoch": 0.696102725839153, "grad_norm": 0.5383828110612829, "learning_rate": 1.7474494852859796e-05, "loss": 0.6744, "step": 1545 }, { "epoch": 0.6965532777652624, "grad_norm": 0.5492862465478154, "learning_rate": 1.74713556450815e-05, "loss": 0.6573, "step": 1546 }, { "epoch": 0.697003829691372, "grad_norm": 0.564999693014637, "learning_rate": 1.7468214769841542e-05, "loss": 0.6905, "step": 1547 }, { "epoch": 0.6974543816174814, "grad_norm": 0.5321768428927685, "learning_rate": 1.7465072227840906e-05, "loss": 0.7025, "step": 1548 }, { "epoch": 0.6979049335435908, "grad_norm": 0.5338963440699739, "learning_rate": 1.7461928019780953e-05, "loss": 0.6492, "step": 1549 }, { "epoch": 0.6983554854697004, "grad_norm": 0.5596680176809699, "learning_rate": 1.74587821463634e-05, "loss": 0.6716, "step": 1550 }, { "epoch": 0.6988060373958098, "grad_norm": 0.5575186590197081, "learning_rate": 1.7455634608290354e-05, "loss": 0.6752, "step": 1551 }, { "epoch": 0.6992565893219194, "grad_norm": 0.5278383346842603, "learning_rate": 1.7452485406264278e-05, "loss": 0.6253, "step": 1552 }, { "epoch": 0.6997071412480288, "grad_norm": 0.5251219019597226, "learning_rate": 1.7449334540988016e-05, "loss": 0.6822, "step": 1553 }, { "epoch": 0.7001576931741383, "grad_norm": 0.5470824515311022, "learning_rate": 1.744618201316478e-05, "loss": 0.6529, "step": 1554 }, { "epoch": 0.7006082451002478, "grad_norm": 0.5800157141139115, "learning_rate": 1.7443027823498146e-05, "loss": 0.6685, "step": 1555 }, { "epoch": 0.7010587970263573, "grad_norm": 0.5363117477558803, "learning_rate": 1.743987197269208e-05, "loss": 0.6669, "step": 1556 }, { "epoch": 0.7015093489524667, "grad_norm": 0.5179798664288805, "learning_rate": 1.74367144614509e-05, "loss": 0.6293, "step": 1557 }, { "epoch": 0.7019599008785763, "grad_norm": 0.627151989935199, "learning_rate": 1.74335552904793e-05, "loss": 0.6518, "step": 1558 }, { "epoch": 0.7024104528046857, "grad_norm": 0.5717071846490467, "learning_rate": 1.743039446048235e-05, "loss": 0.7047, "step": 1559 }, { "epoch": 0.7028610047307953, "grad_norm": 0.5984890856108345, "learning_rate": 1.742723197216548e-05, "loss": 0.703, "step": 1560 }, { "epoch": 0.7033115566569047, "grad_norm": 0.5667535163235065, "learning_rate": 1.74240678262345e-05, "loss": 0.6045, "step": 1561 }, { "epoch": 0.7037621085830142, "grad_norm": 0.5685476929881582, "learning_rate": 1.742090202339559e-05, "loss": 0.6454, "step": 1562 }, { "epoch": 0.7042126605091237, "grad_norm": 0.556374562371425, "learning_rate": 1.7417734564355285e-05, "loss": 0.6911, "step": 1563 }, { "epoch": 0.7046632124352331, "grad_norm": 0.584208599435642, "learning_rate": 1.741456544982051e-05, "loss": 0.6797, "step": 1564 }, { "epoch": 0.7051137643613427, "grad_norm": 0.6067498512593538, "learning_rate": 1.741139468049855e-05, "loss": 0.6949, "step": 1565 }, { "epoch": 0.7055643162874521, "grad_norm": 0.5846020598042859, "learning_rate": 1.7408222257097055e-05, "loss": 0.6836, "step": 1566 }, { "epoch": 0.7060148682135616, "grad_norm": 0.570967473561169, "learning_rate": 1.7405048180324046e-05, "loss": 0.712, "step": 1567 }, { "epoch": 0.7064654201396711, "grad_norm": 0.6023781288495221, "learning_rate": 1.7401872450887917e-05, "loss": 0.6731, "step": 1568 }, { "epoch": 0.7069159720657806, "grad_norm": 0.588779823851611, "learning_rate": 1.7398695069497437e-05, "loss": 0.6477, "step": 1569 }, { "epoch": 0.70736652399189, "grad_norm": 0.5801219670072679, "learning_rate": 1.7395516036861722e-05, "loss": 0.6833, "step": 1570 }, { "epoch": 0.7078170759179996, "grad_norm": 0.5867164834719628, "learning_rate": 1.7392335353690285e-05, "loss": 0.6853, "step": 1571 }, { "epoch": 0.708267627844109, "grad_norm": 0.5753384436117869, "learning_rate": 1.7389153020692985e-05, "loss": 0.6747, "step": 1572 }, { "epoch": 0.7087181797702186, "grad_norm": 0.6377322683211526, "learning_rate": 1.7385969038580058e-05, "loss": 0.6444, "step": 1573 }, { "epoch": 0.709168731696328, "grad_norm": 0.5389162051568299, "learning_rate": 1.7382783408062103e-05, "loss": 0.691, "step": 1574 }, { "epoch": 0.7096192836224375, "grad_norm": 0.5664112504797167, "learning_rate": 1.7379596129850098e-05, "loss": 0.6817, "step": 1575 }, { "epoch": 0.710069835548547, "grad_norm": 0.5855792048593368, "learning_rate": 1.737640720465538e-05, "loss": 0.6703, "step": 1576 }, { "epoch": 0.7105203874746564, "grad_norm": 0.5008173540597888, "learning_rate": 1.7373216633189653e-05, "loss": 0.6236, "step": 1577 }, { "epoch": 0.7109709394007659, "grad_norm": 0.5763121452200514, "learning_rate": 1.737002441616499e-05, "loss": 0.6507, "step": 1578 }, { "epoch": 0.7114214913268754, "grad_norm": 0.6002522295829581, "learning_rate": 1.736683055429383e-05, "loss": 0.6639, "step": 1579 }, { "epoch": 0.7118720432529849, "grad_norm": 0.5480988509513021, "learning_rate": 1.7363635048288993e-05, "loss": 0.679, "step": 1580 }, { "epoch": 0.7123225951790944, "grad_norm": 0.6504979837735863, "learning_rate": 1.736043789886364e-05, "loss": 0.7076, "step": 1581 }, { "epoch": 0.7127731471052039, "grad_norm": 0.5414224214666499, "learning_rate": 1.735723910673132e-05, "loss": 0.6369, "step": 1582 }, { "epoch": 0.7132236990313133, "grad_norm": 0.6814036984446655, "learning_rate": 1.7354038672605937e-05, "loss": 0.6855, "step": 1583 }, { "epoch": 0.7136742509574229, "grad_norm": 0.5612792887999645, "learning_rate": 1.7350836597201767e-05, "loss": 0.6578, "step": 1584 }, { "epoch": 0.7141248028835323, "grad_norm": 0.602936973221129, "learning_rate": 1.7347632881233458e-05, "loss": 0.6481, "step": 1585 }, { "epoch": 0.7145753548096418, "grad_norm": 0.521242059217057, "learning_rate": 1.7344427525416008e-05, "loss": 0.6943, "step": 1586 }, { "epoch": 0.7150259067357513, "grad_norm": 0.5724432355249277, "learning_rate": 1.7341220530464796e-05, "loss": 0.6561, "step": 1587 }, { "epoch": 0.7154764586618608, "grad_norm": 0.5187914842814829, "learning_rate": 1.7338011897095558e-05, "loss": 0.6765, "step": 1588 }, { "epoch": 0.7159270105879703, "grad_norm": 0.5273507469212939, "learning_rate": 1.73348016260244e-05, "loss": 0.6596, "step": 1589 }, { "epoch": 0.7163775625140798, "grad_norm": 0.6042963426438088, "learning_rate": 1.733158971796779e-05, "loss": 0.6338, "step": 1590 }, { "epoch": 0.7168281144401892, "grad_norm": 0.5456708737056564, "learning_rate": 1.732837617364257e-05, "loss": 0.7134, "step": 1591 }, { "epoch": 0.7172786663662987, "grad_norm": 0.5434350537881801, "learning_rate": 1.7325160993765934e-05, "loss": 0.6714, "step": 1592 }, { "epoch": 0.7177292182924082, "grad_norm": 0.5189512620687524, "learning_rate": 1.7321944179055448e-05, "loss": 0.6839, "step": 1593 }, { "epoch": 0.7181797702185176, "grad_norm": 0.5382994541140297, "learning_rate": 1.7318725730229048e-05, "loss": 0.6897, "step": 1594 }, { "epoch": 0.7186303221446272, "grad_norm": 0.5454399970313664, "learning_rate": 1.7315505648005024e-05, "loss": 0.6811, "step": 1595 }, { "epoch": 0.7190808740707366, "grad_norm": 0.5369567847708115, "learning_rate": 1.731228393310204e-05, "loss": 0.6693, "step": 1596 }, { "epoch": 0.7195314259968462, "grad_norm": 0.5519719239774944, "learning_rate": 1.7309060586239117e-05, "loss": 0.6854, "step": 1597 }, { "epoch": 0.7199819779229556, "grad_norm": 0.547517108381286, "learning_rate": 1.7305835608135645e-05, "loss": 0.6943, "step": 1598 }, { "epoch": 0.7204325298490651, "grad_norm": 0.5390568345952682, "learning_rate": 1.7302608999511374e-05, "loss": 0.6605, "step": 1599 }, { "epoch": 0.7208830817751746, "grad_norm": 0.5479361376815526, "learning_rate": 1.7299380761086423e-05, "loss": 0.6771, "step": 1600 }, { "epoch": 0.7213336337012841, "grad_norm": 0.5075389068270756, "learning_rate": 1.7296150893581276e-05, "loss": 0.6691, "step": 1601 }, { "epoch": 0.7217841856273935, "grad_norm": 0.5306869117499822, "learning_rate": 1.7292919397716772e-05, "loss": 0.71, "step": 1602 }, { "epoch": 0.7222347375535031, "grad_norm": 0.5319914243815171, "learning_rate": 1.7289686274214116e-05, "loss": 0.6988, "step": 1603 }, { "epoch": 0.7226852894796125, "grad_norm": 0.5129889572162862, "learning_rate": 1.7286451523794885e-05, "loss": 0.6023, "step": 1604 }, { "epoch": 0.723135841405722, "grad_norm": 0.5673968102875693, "learning_rate": 1.7283215147181006e-05, "loss": 0.6502, "step": 1605 }, { "epoch": 0.7235863933318315, "grad_norm": 0.5407383862856656, "learning_rate": 1.727997714509478e-05, "loss": 0.6874, "step": 1606 }, { "epoch": 0.7240369452579409, "grad_norm": 0.5270973982363327, "learning_rate": 1.7276737518258865e-05, "loss": 0.6884, "step": 1607 }, { "epoch": 0.7244874971840505, "grad_norm": 0.5659860505590742, "learning_rate": 1.7273496267396283e-05, "loss": 0.6674, "step": 1608 }, { "epoch": 0.7249380491101599, "grad_norm": 0.5332326356584912, "learning_rate": 1.7270253393230415e-05, "loss": 0.6517, "step": 1609 }, { "epoch": 0.7253886010362695, "grad_norm": 0.5368181696603898, "learning_rate": 1.726700889648501e-05, "loss": 0.6826, "step": 1610 }, { "epoch": 0.7258391529623789, "grad_norm": 0.5606367190267284, "learning_rate": 1.7263762777884178e-05, "loss": 0.6962, "step": 1611 }, { "epoch": 0.7262897048884884, "grad_norm": 0.5225931501064286, "learning_rate": 1.7260515038152393e-05, "loss": 0.6263, "step": 1612 }, { "epoch": 0.7267402568145979, "grad_norm": 0.5799391755689144, "learning_rate": 1.725726567801448e-05, "loss": 0.6549, "step": 1613 }, { "epoch": 0.7271908087407074, "grad_norm": 0.5344967903251598, "learning_rate": 1.7254014698195638e-05, "loss": 0.6715, "step": 1614 }, { "epoch": 0.7276413606668168, "grad_norm": 0.598059523800199, "learning_rate": 1.725076209942142e-05, "loss": 0.6765, "step": 1615 }, { "epoch": 0.7280919125929264, "grad_norm": 0.5335320400256894, "learning_rate": 1.7247507882417745e-05, "loss": 0.6778, "step": 1616 }, { "epoch": 0.7285424645190358, "grad_norm": 0.5352217040914089, "learning_rate": 1.7244252047910893e-05, "loss": 0.6715, "step": 1617 }, { "epoch": 0.7289930164451452, "grad_norm": 0.5535718291689006, "learning_rate": 1.7240994596627497e-05, "loss": 0.672, "step": 1618 }, { "epoch": 0.7294435683712548, "grad_norm": 0.5161613982007304, "learning_rate": 1.7237735529294563e-05, "loss": 0.6564, "step": 1619 }, { "epoch": 0.7298941202973642, "grad_norm": 0.5706351757590586, "learning_rate": 1.7234474846639444e-05, "loss": 0.7129, "step": 1620 }, { "epoch": 0.7303446722234738, "grad_norm": 0.5261700196199676, "learning_rate": 1.7231212549389867e-05, "loss": 0.6782, "step": 1621 }, { "epoch": 0.7307952241495832, "grad_norm": 0.5691380454964641, "learning_rate": 1.7227948638273918e-05, "loss": 0.684, "step": 1622 }, { "epoch": 0.7312457760756927, "grad_norm": 0.5190241585654815, "learning_rate": 1.7224683114020028e-05, "loss": 0.6951, "step": 1623 }, { "epoch": 0.7316963280018022, "grad_norm": 0.5338019309025334, "learning_rate": 1.7221415977357008e-05, "loss": 0.6888, "step": 1624 }, { "epoch": 0.7321468799279117, "grad_norm": 0.5685150785463635, "learning_rate": 1.721814722901401e-05, "loss": 0.6738, "step": 1625 }, { "epoch": 0.7325974318540212, "grad_norm": 0.5321914650570378, "learning_rate": 1.7214876869720567e-05, "loss": 0.6632, "step": 1626 }, { "epoch": 0.7330479837801307, "grad_norm": 0.5281469547652962, "learning_rate": 1.7211604900206552e-05, "loss": 0.6657, "step": 1627 }, { "epoch": 0.7334985357062401, "grad_norm": 0.5659443676302748, "learning_rate": 1.7208331321202203e-05, "loss": 0.7068, "step": 1628 }, { "epoch": 0.7339490876323497, "grad_norm": 0.5510173438371531, "learning_rate": 1.7205056133438123e-05, "loss": 0.6808, "step": 1629 }, { "epoch": 0.7343996395584591, "grad_norm": 0.5300352440741681, "learning_rate": 1.7201779337645274e-05, "loss": 0.6825, "step": 1630 }, { "epoch": 0.7348501914845686, "grad_norm": 0.5035467977187946, "learning_rate": 1.7198500934554966e-05, "loss": 0.6287, "step": 1631 }, { "epoch": 0.7353007434106781, "grad_norm": 0.5312782618014407, "learning_rate": 1.7195220924898883e-05, "loss": 0.6461, "step": 1632 }, { "epoch": 0.7357512953367875, "grad_norm": 0.5443239144028506, "learning_rate": 1.719193930940905e-05, "loss": 0.6887, "step": 1633 }, { "epoch": 0.7362018472628971, "grad_norm": 0.5295104593393142, "learning_rate": 1.718865608881787e-05, "loss": 0.6754, "step": 1634 }, { "epoch": 0.7366523991890065, "grad_norm": 0.5721049358968867, "learning_rate": 1.7185371263858085e-05, "loss": 0.6966, "step": 1635 }, { "epoch": 0.737102951115116, "grad_norm": 0.5601720970062668, "learning_rate": 1.718208483526281e-05, "loss": 0.6529, "step": 1636 }, { "epoch": 0.7375535030412255, "grad_norm": 0.5284797108625169, "learning_rate": 1.717879680376551e-05, "loss": 0.6392, "step": 1637 }, { "epoch": 0.738004054967335, "grad_norm": 0.5923177508774183, "learning_rate": 1.717550717010001e-05, "loss": 0.6323, "step": 1638 }, { "epoch": 0.7384546068934444, "grad_norm": 0.5164969821623371, "learning_rate": 1.7172215935000493e-05, "loss": 0.6597, "step": 1639 }, { "epoch": 0.738905158819554, "grad_norm": 0.5269041792222272, "learning_rate": 1.7168923099201497e-05, "loss": 0.6485, "step": 1640 }, { "epoch": 0.7393557107456634, "grad_norm": 0.5841301256105025, "learning_rate": 1.7165628663437923e-05, "loss": 0.6899, "step": 1641 }, { "epoch": 0.739806262671773, "grad_norm": 0.5176919353340445, "learning_rate": 1.7162332628445024e-05, "loss": 0.6603, "step": 1642 }, { "epoch": 0.7402568145978824, "grad_norm": 0.6009492756635593, "learning_rate": 1.7159034994958408e-05, "loss": 0.6859, "step": 1643 }, { "epoch": 0.7407073665239919, "grad_norm": 0.5360580806737303, "learning_rate": 1.7155735763714045e-05, "loss": 0.6526, "step": 1644 }, { "epoch": 0.7411579184501014, "grad_norm": 0.5541877861847683, "learning_rate": 1.7152434935448257e-05, "loss": 0.6981, "step": 1645 }, { "epoch": 0.7416084703762108, "grad_norm": 0.5773360290890391, "learning_rate": 1.7149132510897726e-05, "loss": 0.6979, "step": 1646 }, { "epoch": 0.7420590223023203, "grad_norm": 0.5566664080541243, "learning_rate": 1.7145828490799497e-05, "loss": 0.7139, "step": 1647 }, { "epoch": 0.7425095742284298, "grad_norm": 0.5487474125328416, "learning_rate": 1.714252287589095e-05, "loss": 0.696, "step": 1648 }, { "epoch": 0.7429601261545393, "grad_norm": 0.540956553332323, "learning_rate": 1.7139215666909844e-05, "loss": 0.6666, "step": 1649 }, { "epoch": 0.7434106780806488, "grad_norm": 0.5100246734403437, "learning_rate": 1.7135906864594278e-05, "loss": 0.6645, "step": 1650 }, { "epoch": 0.7438612300067583, "grad_norm": 0.5022341140555423, "learning_rate": 1.7132596469682715e-05, "loss": 0.6841, "step": 1651 }, { "epoch": 0.7443117819328677, "grad_norm": 0.6219157495517307, "learning_rate": 1.7129284482913973e-05, "loss": 0.6524, "step": 1652 }, { "epoch": 0.7447623338589773, "grad_norm": 0.5040040127290002, "learning_rate": 1.712597090502722e-05, "loss": 0.6514, "step": 1653 }, { "epoch": 0.7452128857850867, "grad_norm": 0.5712057050339534, "learning_rate": 1.712265573676198e-05, "loss": 0.6731, "step": 1654 }, { "epoch": 0.7456634377111963, "grad_norm": 0.5269403445777534, "learning_rate": 1.711933897885814e-05, "loss": 0.6813, "step": 1655 }, { "epoch": 0.7461139896373057, "grad_norm": 0.5359488564853331, "learning_rate": 1.7116020632055933e-05, "loss": 0.66, "step": 1656 }, { "epoch": 0.7465645415634152, "grad_norm": 0.5373266857135687, "learning_rate": 1.7112700697095955e-05, "loss": 0.7042, "step": 1657 }, { "epoch": 0.7470150934895247, "grad_norm": 0.5004469874198246, "learning_rate": 1.7109379174719138e-05, "loss": 0.6472, "step": 1658 }, { "epoch": 0.7474656454156342, "grad_norm": 0.51658703902115, "learning_rate": 1.7106056065666793e-05, "loss": 0.6336, "step": 1659 }, { "epoch": 0.7479161973417436, "grad_norm": 0.5369482386292548, "learning_rate": 1.710273137068057e-05, "loss": 0.709, "step": 1660 }, { "epoch": 0.7483667492678531, "grad_norm": 0.5160806393272812, "learning_rate": 1.709940509050248e-05, "loss": 0.6627, "step": 1661 }, { "epoch": 0.7488173011939626, "grad_norm": 0.5132276684932321, "learning_rate": 1.709607722587488e-05, "loss": 0.666, "step": 1662 }, { "epoch": 0.749267853120072, "grad_norm": 0.531249192055358, "learning_rate": 1.7092747777540482e-05, "loss": 0.6749, "step": 1663 }, { "epoch": 0.7497184050461816, "grad_norm": 0.5460198004179057, "learning_rate": 1.708941674624236e-05, "loss": 0.7043, "step": 1664 }, { "epoch": 0.750168956972291, "grad_norm": 0.5212631852020938, "learning_rate": 1.708608413272393e-05, "loss": 0.6592, "step": 1665 }, { "epoch": 0.750168956972291, "eval_loss": 0.6418542861938477, "eval_runtime": 24.3866, "eval_samples_per_second": 11.441, "eval_steps_per_second": 0.492, "step": 1665 }, { "epoch": 0.7506195088984006, "grad_norm": 0.5078911773718229, "learning_rate": 1.7082749937728972e-05, "loss": 0.6948, "step": 1666 }, { "epoch": 0.75107006082451, "grad_norm": 0.5068247590285778, "learning_rate": 1.7079414162001617e-05, "loss": 0.6841, "step": 1667 }, { "epoch": 0.7515206127506195, "grad_norm": 0.5353660693440839, "learning_rate": 1.7076076806286334e-05, "loss": 0.7094, "step": 1668 }, { "epoch": 0.751971164676729, "grad_norm": 0.5090402713710361, "learning_rate": 1.707273787132796e-05, "loss": 0.6984, "step": 1669 }, { "epoch": 0.7524217166028385, "grad_norm": 0.5475333278540959, "learning_rate": 1.706939735787169e-05, "loss": 0.6399, "step": 1670 }, { "epoch": 0.752872268528948, "grad_norm": 0.5226294675029083, "learning_rate": 1.706605526666305e-05, "loss": 0.6659, "step": 1671 }, { "epoch": 0.7533228204550575, "grad_norm": 0.5682329310183686, "learning_rate": 1.7062711598447936e-05, "loss": 0.6839, "step": 1672 }, { "epoch": 0.7537733723811669, "grad_norm": 0.5295180634675509, "learning_rate": 1.705936635397259e-05, "loss": 0.6823, "step": 1673 }, { "epoch": 0.7542239243072764, "grad_norm": 0.576630383140226, "learning_rate": 1.7056019533983603e-05, "loss": 0.686, "step": 1674 }, { "epoch": 0.7546744762333859, "grad_norm": 0.5531934219538648, "learning_rate": 1.705267113922792e-05, "loss": 0.6785, "step": 1675 }, { "epoch": 0.7551250281594953, "grad_norm": 0.5784102910695924, "learning_rate": 1.704932117045284e-05, "loss": 0.6684, "step": 1676 }, { "epoch": 0.7555755800856049, "grad_norm": 0.5706810110549215, "learning_rate": 1.7045969628406013e-05, "loss": 0.6653, "step": 1677 }, { "epoch": 0.7560261320117143, "grad_norm": 0.5682499215673974, "learning_rate": 1.704261651383543e-05, "loss": 0.6774, "step": 1678 }, { "epoch": 0.7564766839378239, "grad_norm": 0.5342207738621156, "learning_rate": 1.7039261827489452e-05, "loss": 0.7153, "step": 1679 }, { "epoch": 0.7569272358639333, "grad_norm": 0.5049257684424738, "learning_rate": 1.703590557011677e-05, "loss": 0.6563, "step": 1680 }, { "epoch": 0.7573777877900428, "grad_norm": 0.5363053960825324, "learning_rate": 1.703254774246644e-05, "loss": 0.633, "step": 1681 }, { "epoch": 0.7578283397161523, "grad_norm": 0.5416735740706843, "learning_rate": 1.7029188345287868e-05, "loss": 0.6576, "step": 1682 }, { "epoch": 0.7582788916422618, "grad_norm": 0.5188415356225905, "learning_rate": 1.70258273793308e-05, "loss": 0.679, "step": 1683 }, { "epoch": 0.7587294435683712, "grad_norm": 0.5280977859064911, "learning_rate": 1.7022464845345342e-05, "loss": 0.6817, "step": 1684 }, { "epoch": 0.7591799954944808, "grad_norm": 0.5165952468516627, "learning_rate": 1.701910074408194e-05, "loss": 0.6653, "step": 1685 }, { "epoch": 0.7596305474205902, "grad_norm": 0.5194281301708293, "learning_rate": 1.701573507629141e-05, "loss": 0.6697, "step": 1686 }, { "epoch": 0.7600810993466998, "grad_norm": 0.5097216617843842, "learning_rate": 1.7012367842724887e-05, "loss": 0.6585, "step": 1687 }, { "epoch": 0.7605316512728092, "grad_norm": 0.5539590162171986, "learning_rate": 1.7008999044133886e-05, "loss": 0.6891, "step": 1688 }, { "epoch": 0.7609822031989186, "grad_norm": 0.5884967650498518, "learning_rate": 1.7005628681270248e-05, "loss": 0.6851, "step": 1689 }, { "epoch": 0.7614327551250282, "grad_norm": 0.5382869419789663, "learning_rate": 1.700225675488618e-05, "loss": 0.6523, "step": 1690 }, { "epoch": 0.7618833070511376, "grad_norm": 0.5607814695575324, "learning_rate": 1.699888326573422e-05, "loss": 0.7015, "step": 1691 }, { "epoch": 0.7623338589772471, "grad_norm": 0.518816584686078, "learning_rate": 1.6995508214567275e-05, "loss": 0.6653, "step": 1692 }, { "epoch": 0.7627844109033566, "grad_norm": 0.5486138256420067, "learning_rate": 1.699213160213859e-05, "loss": 0.6677, "step": 1693 }, { "epoch": 0.7632349628294661, "grad_norm": 0.5035513817154726, "learning_rate": 1.6988753429201756e-05, "loss": 0.6786, "step": 1694 }, { "epoch": 0.7636855147555756, "grad_norm": 0.5831645882564517, "learning_rate": 1.698537369651072e-05, "loss": 0.6821, "step": 1695 }, { "epoch": 0.7641360666816851, "grad_norm": 0.5682916230889516, "learning_rate": 1.698199240481977e-05, "loss": 0.6349, "step": 1696 }, { "epoch": 0.7645866186077945, "grad_norm": 0.543443915916972, "learning_rate": 1.6978609554883544e-05, "loss": 0.6601, "step": 1697 }, { "epoch": 0.7650371705339041, "grad_norm": 0.5746575670591032, "learning_rate": 1.6975225147457026e-05, "loss": 0.7037, "step": 1698 }, { "epoch": 0.7654877224600135, "grad_norm": 0.5357379626109436, "learning_rate": 1.6971839183295554e-05, "loss": 0.6303, "step": 1699 }, { "epoch": 0.765938274386123, "grad_norm": 0.5404193718207999, "learning_rate": 1.696845166315481e-05, "loss": 0.6704, "step": 1700 }, { "epoch": 0.7663888263122325, "grad_norm": 0.537009047858653, "learning_rate": 1.6965062587790823e-05, "loss": 0.6927, "step": 1701 }, { "epoch": 0.7668393782383419, "grad_norm": 0.5037531816680554, "learning_rate": 1.6961671957959967e-05, "loss": 0.6751, "step": 1702 }, { "epoch": 0.7672899301644515, "grad_norm": 0.5329409103754359, "learning_rate": 1.6958279774418963e-05, "loss": 0.6466, "step": 1703 }, { "epoch": 0.7677404820905609, "grad_norm": 0.5199437279045434, "learning_rate": 1.6954886037924888e-05, "loss": 0.6851, "step": 1704 }, { "epoch": 0.7681910340166704, "grad_norm": 0.51053755864261, "learning_rate": 1.6951490749235148e-05, "loss": 0.6938, "step": 1705 }, { "epoch": 0.7686415859427799, "grad_norm": 0.539426744551815, "learning_rate": 1.694809390910751e-05, "loss": 0.6665, "step": 1706 }, { "epoch": 0.7690921378688894, "grad_norm": 0.5496884207967647, "learning_rate": 1.6944695518300087e-05, "loss": 0.6693, "step": 1707 }, { "epoch": 0.7695426897949988, "grad_norm": 0.5338731515522773, "learning_rate": 1.694129557757133e-05, "loss": 0.6355, "step": 1708 }, { "epoch": 0.7699932417211084, "grad_norm": 0.5333161127628533, "learning_rate": 1.693789408768004e-05, "loss": 0.649, "step": 1709 }, { "epoch": 0.7704437936472178, "grad_norm": 0.5218704192164207, "learning_rate": 1.6934491049385366e-05, "loss": 0.6615, "step": 1710 }, { "epoch": 0.7708943455733274, "grad_norm": 0.5569221792633636, "learning_rate": 1.6931086463446792e-05, "loss": 0.6519, "step": 1711 }, { "epoch": 0.7713448974994368, "grad_norm": 0.576161857424641, "learning_rate": 1.6927680330624165e-05, "loss": 0.6637, "step": 1712 }, { "epoch": 0.7717954494255463, "grad_norm": 0.5757988872460803, "learning_rate": 1.6924272651677666e-05, "loss": 0.6887, "step": 1713 }, { "epoch": 0.7722460013516558, "grad_norm": 0.5699015350485254, "learning_rate": 1.6920863427367815e-05, "loss": 0.6417, "step": 1714 }, { "epoch": 0.7726965532777652, "grad_norm": 0.5449370441736923, "learning_rate": 1.6917452658455496e-05, "loss": 0.6838, "step": 1715 }, { "epoch": 0.7731471052038748, "grad_norm": 0.5433473393161827, "learning_rate": 1.6914040345701922e-05, "loss": 0.6445, "step": 1716 }, { "epoch": 0.7735976571299842, "grad_norm": 0.543610095441825, "learning_rate": 1.691062648986865e-05, "loss": 0.7342, "step": 1717 }, { "epoch": 0.7740482090560937, "grad_norm": 0.5746929879946296, "learning_rate": 1.690721109171759e-05, "loss": 0.6983, "step": 1718 }, { "epoch": 0.7744987609822032, "grad_norm": 0.5322066370374817, "learning_rate": 1.6903794152011e-05, "loss": 0.6618, "step": 1719 }, { "epoch": 0.7749493129083127, "grad_norm": 0.5428611128568053, "learning_rate": 1.690037567151146e-05, "loss": 0.636, "step": 1720 }, { "epoch": 0.7753998648344221, "grad_norm": 0.5341040876834592, "learning_rate": 1.689695565098192e-05, "loss": 0.6407, "step": 1721 }, { "epoch": 0.7758504167605317, "grad_norm": 0.5428880033375367, "learning_rate": 1.6893534091185658e-05, "loss": 0.6501, "step": 1722 }, { "epoch": 0.7763009686866411, "grad_norm": 0.5337039345997382, "learning_rate": 1.6890110992886302e-05, "loss": 0.6875, "step": 1723 }, { "epoch": 0.7767515206127507, "grad_norm": 0.5237841687705527, "learning_rate": 1.6886686356847822e-05, "loss": 0.6781, "step": 1724 }, { "epoch": 0.7772020725388601, "grad_norm": 0.5344060772607003, "learning_rate": 1.6883260183834524e-05, "loss": 0.6755, "step": 1725 }, { "epoch": 0.7776526244649696, "grad_norm": 0.5400022721932206, "learning_rate": 1.6879832474611068e-05, "loss": 0.6799, "step": 1726 }, { "epoch": 0.7781031763910791, "grad_norm": 0.5225719920559355, "learning_rate": 1.6876403229942453e-05, "loss": 0.6619, "step": 1727 }, { "epoch": 0.7785537283171886, "grad_norm": 0.5522778653566002, "learning_rate": 1.687297245059402e-05, "loss": 0.7051, "step": 1728 }, { "epoch": 0.779004280243298, "grad_norm": 0.5439874947932958, "learning_rate": 1.6869540137331445e-05, "loss": 0.6801, "step": 1729 }, { "epoch": 0.7794548321694075, "grad_norm": 0.5320841454303107, "learning_rate": 1.6866106290920765e-05, "loss": 0.6415, "step": 1730 }, { "epoch": 0.779905384095517, "grad_norm": 0.5559808649944455, "learning_rate": 1.686267091212834e-05, "loss": 0.6604, "step": 1731 }, { "epoch": 0.7803559360216265, "grad_norm": 0.5742364563426556, "learning_rate": 1.6859234001720882e-05, "loss": 0.6608, "step": 1732 }, { "epoch": 0.780806487947736, "grad_norm": 0.5574232964688749, "learning_rate": 1.6855795560465447e-05, "loss": 0.6342, "step": 1733 }, { "epoch": 0.7812570398738454, "grad_norm": 0.5407355395860963, "learning_rate": 1.6852355589129418e-05, "loss": 0.6368, "step": 1734 }, { "epoch": 0.781707591799955, "grad_norm": 0.5997879443518909, "learning_rate": 1.6848914088480542e-05, "loss": 0.6749, "step": 1735 }, { "epoch": 0.7821581437260644, "grad_norm": 0.49909470461507593, "learning_rate": 1.684547105928689e-05, "loss": 0.6928, "step": 1736 }, { "epoch": 0.782608695652174, "grad_norm": 0.5995821503295391, "learning_rate": 1.6842026502316874e-05, "loss": 0.6793, "step": 1737 }, { "epoch": 0.7830592475782834, "grad_norm": 0.5476571585138102, "learning_rate": 1.683858041833926e-05, "loss": 0.6888, "step": 1738 }, { "epoch": 0.7835097995043929, "grad_norm": 0.5517844173045867, "learning_rate": 1.6835132808123145e-05, "loss": 0.6657, "step": 1739 }, { "epoch": 0.7839603514305024, "grad_norm": 0.5225957974633214, "learning_rate": 1.683168367243797e-05, "loss": 0.6291, "step": 1740 }, { "epoch": 0.7844109033566119, "grad_norm": 0.536517675873415, "learning_rate": 1.682823301205351e-05, "loss": 0.655, "step": 1741 }, { "epoch": 0.7848614552827213, "grad_norm": 0.5468158750833753, "learning_rate": 1.682478082773989e-05, "loss": 0.6297, "step": 1742 }, { "epoch": 0.7853120072088308, "grad_norm": 0.5500069769367475, "learning_rate": 1.6821327120267567e-05, "loss": 0.6518, "step": 1743 }, { "epoch": 0.7857625591349403, "grad_norm": 0.5783121879434509, "learning_rate": 1.6817871890407347e-05, "loss": 0.65, "step": 1744 }, { "epoch": 0.7862131110610497, "grad_norm": 0.544240387297378, "learning_rate": 1.6814415138930368e-05, "loss": 0.6924, "step": 1745 }, { "epoch": 0.7866636629871593, "grad_norm": 0.5806792636098014, "learning_rate": 1.6810956866608104e-05, "loss": 0.6923, "step": 1746 }, { "epoch": 0.7871142149132687, "grad_norm": 0.5559858769133337, "learning_rate": 1.680749707421238e-05, "loss": 0.6503, "step": 1747 }, { "epoch": 0.7875647668393783, "grad_norm": 0.5024279690866014, "learning_rate": 1.6804035762515355e-05, "loss": 0.636, "step": 1748 }, { "epoch": 0.7880153187654877, "grad_norm": 0.5619858722828954, "learning_rate": 1.680057293228953e-05, "loss": 0.6764, "step": 1749 }, { "epoch": 0.7884658706915972, "grad_norm": 0.5482195033393807, "learning_rate": 1.6797108584307732e-05, "loss": 0.6702, "step": 1750 }, { "epoch": 0.7889164226177067, "grad_norm": 0.52527825162253, "learning_rate": 1.6793642719343143e-05, "loss": 0.6536, "step": 1751 }, { "epoch": 0.7893669745438162, "grad_norm": 0.5939437866857822, "learning_rate": 1.6790175338169277e-05, "loss": 0.6444, "step": 1752 }, { "epoch": 0.7898175264699256, "grad_norm": 0.5296849028916841, "learning_rate": 1.678670644155998e-05, "loss": 0.6947, "step": 1753 }, { "epoch": 0.7902680783960352, "grad_norm": 0.5489302557217304, "learning_rate": 1.6783236030289448e-05, "loss": 0.6852, "step": 1754 }, { "epoch": 0.7907186303221446, "grad_norm": 0.5006824817269908, "learning_rate": 1.677976410513221e-05, "loss": 0.6558, "step": 1755 }, { "epoch": 0.7911691822482542, "grad_norm": 0.53331623392998, "learning_rate": 1.677629066686313e-05, "loss": 0.6775, "step": 1756 }, { "epoch": 0.7916197341743636, "grad_norm": 0.5165778169168129, "learning_rate": 1.6772815716257414e-05, "loss": 0.6913, "step": 1757 }, { "epoch": 0.792070286100473, "grad_norm": 0.5240586133938873, "learning_rate": 1.67693392540906e-05, "loss": 0.649, "step": 1758 }, { "epoch": 0.7925208380265826, "grad_norm": 0.5344759308422009, "learning_rate": 1.6765861281138568e-05, "loss": 0.7073, "step": 1759 }, { "epoch": 0.792971389952692, "grad_norm": 0.5159652095904332, "learning_rate": 1.676238179817754e-05, "loss": 0.6962, "step": 1760 }, { "epoch": 0.7934219418788016, "grad_norm": 0.5446878462874984, "learning_rate": 1.675890080598406e-05, "loss": 0.7026, "step": 1761 }, { "epoch": 0.793872493804911, "grad_norm": 0.512872083429218, "learning_rate": 1.6755418305335026e-05, "loss": 0.6593, "step": 1762 }, { "epoch": 0.7943230457310205, "grad_norm": 0.5147146748611173, "learning_rate": 1.6751934297007655e-05, "loss": 0.6946, "step": 1763 }, { "epoch": 0.79477359765713, "grad_norm": 0.5427236095724531, "learning_rate": 1.674844878177952e-05, "loss": 0.6885, "step": 1764 }, { "epoch": 0.7952241495832395, "grad_norm": 0.5121296906809413, "learning_rate": 1.6744961760428517e-05, "loss": 0.6566, "step": 1765 }, { "epoch": 0.7956747015093489, "grad_norm": 0.5112995866765107, "learning_rate": 1.674147323373288e-05, "loss": 0.6791, "step": 1766 }, { "epoch": 0.7961252534354585, "grad_norm": 0.49356133430970067, "learning_rate": 1.673798320247118e-05, "loss": 0.6372, "step": 1767 }, { "epoch": 0.7965758053615679, "grad_norm": 0.5201106329224758, "learning_rate": 1.6734491667422327e-05, "loss": 0.6613, "step": 1768 }, { "epoch": 0.7970263572876775, "grad_norm": 0.5264614969785244, "learning_rate": 1.6730998629365562e-05, "loss": 0.6507, "step": 1769 }, { "epoch": 0.7974769092137869, "grad_norm": 0.4924756485939106, "learning_rate": 1.6727504089080462e-05, "loss": 0.686, "step": 1770 }, { "epoch": 0.7979274611398963, "grad_norm": 0.5369693717016297, "learning_rate": 1.6724008047346946e-05, "loss": 0.6547, "step": 1771 }, { "epoch": 0.7983780130660059, "grad_norm": 0.5259906383218197, "learning_rate": 1.672051050494526e-05, "loss": 0.6718, "step": 1772 }, { "epoch": 0.7988285649921153, "grad_norm": 0.5194998393508707, "learning_rate": 1.6717011462655985e-05, "loss": 0.6682, "step": 1773 }, { "epoch": 0.7992791169182248, "grad_norm": 0.5030223551719253, "learning_rate": 1.671351092126004e-05, "loss": 0.6332, "step": 1774 }, { "epoch": 0.7997296688443343, "grad_norm": 0.5047552137729802, "learning_rate": 1.671000888153868e-05, "loss": 0.656, "step": 1775 }, { "epoch": 0.8001802207704438, "grad_norm": 0.49900787369436334, "learning_rate": 1.6706505344273492e-05, "loss": 0.6728, "step": 1776 }, { "epoch": 0.8006307726965533, "grad_norm": 0.5307557021395345, "learning_rate": 1.67030003102464e-05, "loss": 0.6848, "step": 1777 }, { "epoch": 0.8010813246226628, "grad_norm": 0.5469355874700071, "learning_rate": 1.6699493780239652e-05, "loss": 0.6999, "step": 1778 }, { "epoch": 0.8015318765487722, "grad_norm": 0.5077021502996858, "learning_rate": 1.669598575503584e-05, "loss": 0.6767, "step": 1779 }, { "epoch": 0.8019824284748818, "grad_norm": 0.532362566023182, "learning_rate": 1.6692476235417897e-05, "loss": 0.6722, "step": 1780 }, { "epoch": 0.8024329804009912, "grad_norm": 0.5091857472264223, "learning_rate": 1.6688965222169068e-05, "loss": 0.6554, "step": 1781 }, { "epoch": 0.8028835323271007, "grad_norm": 0.5195650308766891, "learning_rate": 1.6685452716072946e-05, "loss": 0.6389, "step": 1782 }, { "epoch": 0.8033340842532102, "grad_norm": 0.5404807517095653, "learning_rate": 1.6681938717913455e-05, "loss": 0.674, "step": 1783 }, { "epoch": 0.8037846361793196, "grad_norm": 0.506587234194834, "learning_rate": 1.667842322847485e-05, "loss": 0.6856, "step": 1784 }, { "epoch": 0.8042351881054292, "grad_norm": 0.5302151235055285, "learning_rate": 1.667490624854173e-05, "loss": 0.6485, "step": 1785 }, { "epoch": 0.8046857400315386, "grad_norm": 0.5191502946917662, "learning_rate": 1.6671387778898998e-05, "loss": 0.6546, "step": 1786 }, { "epoch": 0.8051362919576481, "grad_norm": 0.5372173705036197, "learning_rate": 1.6667867820331927e-05, "loss": 0.6832, "step": 1787 }, { "epoch": 0.8055868438837576, "grad_norm": 0.5638888146256105, "learning_rate": 1.666434637362609e-05, "loss": 0.6688, "step": 1788 }, { "epoch": 0.8060373958098671, "grad_norm": 0.5367998308331167, "learning_rate": 1.666082343956741e-05, "loss": 0.6815, "step": 1789 }, { "epoch": 0.8064879477359765, "grad_norm": 0.5665193416278319, "learning_rate": 1.6657299018942138e-05, "loss": 0.679, "step": 1790 }, { "epoch": 0.8069384996620861, "grad_norm": 0.5201770743926076, "learning_rate": 1.665377311253686e-05, "loss": 0.6738, "step": 1791 }, { "epoch": 0.8073890515881955, "grad_norm": 0.546065704576949, "learning_rate": 1.6650245721138483e-05, "loss": 0.6482, "step": 1792 }, { "epoch": 0.8078396035143051, "grad_norm": 0.5759958095555345, "learning_rate": 1.664671684553426e-05, "loss": 0.6907, "step": 1793 }, { "epoch": 0.8082901554404145, "grad_norm": 0.5425176815539826, "learning_rate": 1.664318648651176e-05, "loss": 0.6963, "step": 1794 }, { "epoch": 0.808740707366524, "grad_norm": 0.5281960822304614, "learning_rate": 1.6639654644858892e-05, "loss": 0.6411, "step": 1795 }, { "epoch": 0.8091912592926335, "grad_norm": 0.5467141830834669, "learning_rate": 1.66361213213639e-05, "loss": 0.7197, "step": 1796 }, { "epoch": 0.809641811218743, "grad_norm": 0.53373165443098, "learning_rate": 1.6632586516815346e-05, "loss": 0.6952, "step": 1797 }, { "epoch": 0.8100923631448524, "grad_norm": 0.5157134679630233, "learning_rate": 1.6629050232002138e-05, "loss": 0.7188, "step": 1798 }, { "epoch": 0.8105429150709619, "grad_norm": 0.5174259144217599, "learning_rate": 1.66255124677135e-05, "loss": 0.6647, "step": 1799 }, { "epoch": 0.8109934669970714, "grad_norm": 0.5075325305012514, "learning_rate": 1.6621973224738994e-05, "loss": 0.6883, "step": 1800 }, { "epoch": 0.8114440189231809, "grad_norm": 0.5300314490182596, "learning_rate": 1.6618432503868507e-05, "loss": 0.6478, "step": 1801 }, { "epoch": 0.8118945708492904, "grad_norm": 0.5417283944179572, "learning_rate": 1.6614890305892266e-05, "loss": 0.6596, "step": 1802 }, { "epoch": 0.8123451227753998, "grad_norm": 0.5416750044650184, "learning_rate": 1.6611346631600817e-05, "loss": 0.6685, "step": 1803 }, { "epoch": 0.8127956747015094, "grad_norm": 0.5337729848211966, "learning_rate": 1.6607801481785042e-05, "loss": 0.6739, "step": 1804 }, { "epoch": 0.8132462266276188, "grad_norm": 0.581578728897874, "learning_rate": 1.6604254857236144e-05, "loss": 0.6974, "step": 1805 }, { "epoch": 0.8136967785537284, "grad_norm": 0.5117057293833882, "learning_rate": 1.6600706758745668e-05, "loss": 0.6673, "step": 1806 }, { "epoch": 0.8141473304798378, "grad_norm": 0.5548033383546559, "learning_rate": 1.6597157187105475e-05, "loss": 0.6956, "step": 1807 }, { "epoch": 0.8145978824059473, "grad_norm": 0.5352852179111933, "learning_rate": 1.659360614310776e-05, "loss": 0.6478, "step": 1808 }, { "epoch": 0.8150484343320568, "grad_norm": 0.5821935250826715, "learning_rate": 1.6590053627545054e-05, "loss": 0.6683, "step": 1809 }, { "epoch": 0.8154989862581663, "grad_norm": 0.5675940586268543, "learning_rate": 1.6586499641210204e-05, "loss": 0.6699, "step": 1810 }, { "epoch": 0.8159495381842757, "grad_norm": 0.5247340639579852, "learning_rate": 1.6582944184896393e-05, "loss": 0.6327, "step": 1811 }, { "epoch": 0.8164000901103852, "grad_norm": 0.5330396161886001, "learning_rate": 1.657938725939713e-05, "loss": 0.6664, "step": 1812 }, { "epoch": 0.8168506420364947, "grad_norm": 0.4963105703256002, "learning_rate": 1.6575828865506246e-05, "loss": 0.6949, "step": 1813 }, { "epoch": 0.8173011939626041, "grad_norm": 0.5356378880232637, "learning_rate": 1.6572269004017917e-05, "loss": 0.6869, "step": 1814 }, { "epoch": 0.8177517458887137, "grad_norm": 0.5348029010787786, "learning_rate": 1.6568707675726624e-05, "loss": 0.6637, "step": 1815 }, { "epoch": 0.8182022978148231, "grad_norm": 0.5218675491678105, "learning_rate": 1.656514488142719e-05, "loss": 0.6838, "step": 1816 }, { "epoch": 0.8186528497409327, "grad_norm": 0.5302071880710268, "learning_rate": 1.6561580621914764e-05, "loss": 0.6477, "step": 1817 }, { "epoch": 0.8191034016670421, "grad_norm": 0.49968274418067854, "learning_rate": 1.655801489798482e-05, "loss": 0.6824, "step": 1818 }, { "epoch": 0.8195539535931516, "grad_norm": 0.540416623489955, "learning_rate": 1.6554447710433154e-05, "loss": 0.6489, "step": 1819 }, { "epoch": 0.8200045055192611, "grad_norm": 0.5037970226700362, "learning_rate": 1.6550879060055897e-05, "loss": 0.6764, "step": 1820 }, { "epoch": 0.8204550574453706, "grad_norm": 0.5352582778721797, "learning_rate": 1.6547308947649496e-05, "loss": 0.6594, "step": 1821 }, { "epoch": 0.82090560937148, "grad_norm": 0.5079581092730596, "learning_rate": 1.6543737374010742e-05, "loss": 0.6516, "step": 1822 }, { "epoch": 0.8213561612975896, "grad_norm": 0.539816718209973, "learning_rate": 1.6540164339936734e-05, "loss": 0.6516, "step": 1823 }, { "epoch": 0.821806713223699, "grad_norm": 0.5240573674435496, "learning_rate": 1.65365898462249e-05, "loss": 0.6506, "step": 1824 }, { "epoch": 0.8222572651498086, "grad_norm": 0.5149617204292672, "learning_rate": 1.6533013893673005e-05, "loss": 0.6892, "step": 1825 }, { "epoch": 0.822707817075918, "grad_norm": 0.5637525557289733, "learning_rate": 1.6529436483079132e-05, "loss": 0.671, "step": 1826 }, { "epoch": 0.8231583690020274, "grad_norm": 0.5377157200212205, "learning_rate": 1.6525857615241686e-05, "loss": 0.6889, "step": 1827 }, { "epoch": 0.823608920928137, "grad_norm": 0.5290002292176327, "learning_rate": 1.6522277290959402e-05, "loss": 0.6702, "step": 1828 }, { "epoch": 0.8240594728542464, "grad_norm": 0.5064994788880061, "learning_rate": 1.651869551103134e-05, "loss": 0.6819, "step": 1829 }, { "epoch": 0.824510024780356, "grad_norm": 0.5445880692454691, "learning_rate": 1.6515112276256882e-05, "loss": 0.6927, "step": 1830 }, { "epoch": 0.8249605767064654, "grad_norm": 0.5133119890121506, "learning_rate": 1.6511527587435736e-05, "loss": 0.661, "step": 1831 }, { "epoch": 0.8254111286325749, "grad_norm": 0.5456872641305882, "learning_rate": 1.6507941445367935e-05, "loss": 0.6735, "step": 1832 }, { "epoch": 0.8258616805586844, "grad_norm": 0.5555321221907131, "learning_rate": 1.6504353850853844e-05, "loss": 0.6886, "step": 1833 }, { "epoch": 0.8263122324847939, "grad_norm": 0.5423252706022169, "learning_rate": 1.6500764804694132e-05, "loss": 0.6892, "step": 1834 }, { "epoch": 0.8267627844109033, "grad_norm": 0.5364661770080394, "learning_rate": 1.6497174307689815e-05, "loss": 0.6747, "step": 1835 }, { "epoch": 0.8272133363370129, "grad_norm": 0.559087565433458, "learning_rate": 1.6493582360642216e-05, "loss": 0.626, "step": 1836 }, { "epoch": 0.8276638882631223, "grad_norm": 0.5335230257037951, "learning_rate": 1.648998896435299e-05, "loss": 0.6883, "step": 1837 }, { "epoch": 0.8281144401892319, "grad_norm": 0.5201710591397578, "learning_rate": 1.648639411962411e-05, "loss": 0.6491, "step": 1838 }, { "epoch": 0.8285649921153413, "grad_norm": 0.544771148449769, "learning_rate": 1.6482797827257885e-05, "loss": 0.649, "step": 1839 }, { "epoch": 0.8290155440414507, "grad_norm": 0.4840116555233013, "learning_rate": 1.6479200088056928e-05, "loss": 0.6569, "step": 1840 }, { "epoch": 0.8294660959675603, "grad_norm": 0.5023736030385714, "learning_rate": 1.647560090282419e-05, "loss": 0.6644, "step": 1841 }, { "epoch": 0.8299166478936697, "grad_norm": 0.5235326013541054, "learning_rate": 1.6472000272362937e-05, "loss": 0.649, "step": 1842 }, { "epoch": 0.8303671998197792, "grad_norm": 0.5085012932559354, "learning_rate": 1.646839819747676e-05, "loss": 0.6979, "step": 1843 }, { "epoch": 0.8308177517458887, "grad_norm": 0.5371166062274002, "learning_rate": 1.646479467896957e-05, "loss": 0.6453, "step": 1844 }, { "epoch": 0.8312683036719982, "grad_norm": 0.5266303391026255, "learning_rate": 1.646118971764561e-05, "loss": 0.6937, "step": 1845 }, { "epoch": 0.8317188555981077, "grad_norm": 0.49803191852207734, "learning_rate": 1.645758331430943e-05, "loss": 0.643, "step": 1846 }, { "epoch": 0.8321694075242172, "grad_norm": 0.49258398248756013, "learning_rate": 1.6453975469765913e-05, "loss": 0.6879, "step": 1847 }, { "epoch": 0.8326199594503266, "grad_norm": 0.5505061958549046, "learning_rate": 1.6450366184820256e-05, "loss": 0.653, "step": 1848 }, { "epoch": 0.8330705113764362, "grad_norm": 0.5208568700950313, "learning_rate": 1.6446755460277985e-05, "loss": 0.6787, "step": 1849 }, { "epoch": 0.8335210633025456, "grad_norm": 0.5123388147605508, "learning_rate": 1.6443143296944946e-05, "loss": 0.6702, "step": 1850 }, { "epoch": 0.8339716152286551, "grad_norm": 0.51519765106494, "learning_rate": 1.6439529695627295e-05, "loss": 0.6534, "step": 1851 }, { "epoch": 0.8344221671547646, "grad_norm": 0.5316902622449547, "learning_rate": 1.643591465713153e-05, "loss": 0.642, "step": 1852 }, { "epoch": 0.8348727190808741, "grad_norm": 0.4979236433231837, "learning_rate": 1.6432298182264444e-05, "loss": 0.6557, "step": 1853 }, { "epoch": 0.8353232710069836, "grad_norm": 0.49672779395807876, "learning_rate": 1.6428680271833174e-05, "loss": 0.6386, "step": 1854 }, { "epoch": 0.835773822933093, "grad_norm": 0.5032637841866505, "learning_rate": 1.6425060926645168e-05, "loss": 0.6713, "step": 1855 }, { "epoch": 0.8362243748592025, "grad_norm": 0.6287351167343102, "learning_rate": 1.6421440147508187e-05, "loss": 0.6902, "step": 1856 }, { "epoch": 0.836674926785312, "grad_norm": 0.5026184123893002, "learning_rate": 1.6417817935230318e-05, "loss": 0.6774, "step": 1857 }, { "epoch": 0.8371254787114215, "grad_norm": 0.49559181721392764, "learning_rate": 1.6414194290619975e-05, "loss": 0.655, "step": 1858 }, { "epoch": 0.8375760306375309, "grad_norm": 0.5382011034918758, "learning_rate": 1.641056921448588e-05, "loss": 0.6548, "step": 1859 }, { "epoch": 0.8380265825636405, "grad_norm": 0.5374928133909321, "learning_rate": 1.6406942707637086e-05, "loss": 0.7134, "step": 1860 }, { "epoch": 0.8384771344897499, "grad_norm": 0.5439835448419869, "learning_rate": 1.640331477088295e-05, "loss": 0.7397, "step": 1861 }, { "epoch": 0.8389276864158595, "grad_norm": 0.512085513452912, "learning_rate": 1.6399685405033168e-05, "loss": 0.673, "step": 1862 }, { "epoch": 0.8393782383419689, "grad_norm": 0.5025442796782228, "learning_rate": 1.6396054610897737e-05, "loss": 0.6467, "step": 1863 }, { "epoch": 0.8398287902680784, "grad_norm": 0.5161259021659569, "learning_rate": 1.6392422389286977e-05, "loss": 0.6676, "step": 1864 }, { "epoch": 0.8402793421941879, "grad_norm": 0.5264400652676099, "learning_rate": 1.6388788741011533e-05, "loss": 0.6502, "step": 1865 }, { "epoch": 0.8407298941202974, "grad_norm": 0.5173612816093311, "learning_rate": 1.638515366688237e-05, "loss": 0.6656, "step": 1866 }, { "epoch": 0.8411804460464068, "grad_norm": 0.5250457678387177, "learning_rate": 1.6381517167710757e-05, "loss": 0.6701, "step": 1867 }, { "epoch": 0.8416309979725163, "grad_norm": 0.5068526452532485, "learning_rate": 1.6377879244308297e-05, "loss": 0.6479, "step": 1868 }, { "epoch": 0.8420815498986258, "grad_norm": 0.5442389621742515, "learning_rate": 1.63742398974869e-05, "loss": 0.665, "step": 1869 }, { "epoch": 0.8425321018247353, "grad_norm": 0.542303845060778, "learning_rate": 1.6370599128058797e-05, "loss": 0.656, "step": 1870 }, { "epoch": 0.8429826537508448, "grad_norm": 0.5115109183545948, "learning_rate": 1.6366956936836543e-05, "loss": 0.6442, "step": 1871 }, { "epoch": 0.8434332056769542, "grad_norm": 0.5486219929885341, "learning_rate": 1.6363313324632995e-05, "loss": 0.6839, "step": 1872 }, { "epoch": 0.8438837576030638, "grad_norm": 0.5095792806870239, "learning_rate": 1.6359668292261347e-05, "loss": 0.643, "step": 1873 }, { "epoch": 0.8443343095291732, "grad_norm": 0.5618745897350912, "learning_rate": 1.635602184053509e-05, "loss": 0.7099, "step": 1874 }, { "epoch": 0.8447848614552828, "grad_norm": 0.4996099090007049, "learning_rate": 1.635237397026805e-05, "loss": 0.7127, "step": 1875 }, { "epoch": 0.8452354133813922, "grad_norm": 0.5390634811331548, "learning_rate": 1.6348724682274353e-05, "loss": 0.6465, "step": 1876 }, { "epoch": 0.8456859653075017, "grad_norm": 0.5428578362085199, "learning_rate": 1.6345073977368455e-05, "loss": 0.6481, "step": 1877 }, { "epoch": 0.8461365172336112, "grad_norm": 0.5191298101423281, "learning_rate": 1.6341421856365122e-05, "loss": 0.6468, "step": 1878 }, { "epoch": 0.8465870691597207, "grad_norm": 0.599844270705855, "learning_rate": 1.6337768320079433e-05, "loss": 0.699, "step": 1879 }, { "epoch": 0.8470376210858301, "grad_norm": 0.517065738918833, "learning_rate": 1.633411336932679e-05, "loss": 0.6329, "step": 1880 }, { "epoch": 0.8474881730119396, "grad_norm": 0.5390978163641235, "learning_rate": 1.6330457004922903e-05, "loss": 0.6683, "step": 1881 }, { "epoch": 0.8479387249380491, "grad_norm": 0.5406853378688271, "learning_rate": 1.6326799227683806e-05, "loss": 0.6919, "step": 1882 }, { "epoch": 0.8483892768641585, "grad_norm": 0.529404149852985, "learning_rate": 1.6323140038425842e-05, "loss": 0.6559, "step": 1883 }, { "epoch": 0.8488398287902681, "grad_norm": 0.5650285619247041, "learning_rate": 1.631947943796567e-05, "loss": 0.6711, "step": 1884 }, { "epoch": 0.8492903807163775, "grad_norm": 0.5135026239472046, "learning_rate": 1.6315817427120267e-05, "loss": 0.6553, "step": 1885 }, { "epoch": 0.8497409326424871, "grad_norm": 0.50053454603304, "learning_rate": 1.6312154006706922e-05, "loss": 0.6632, "step": 1886 }, { "epoch": 0.8501914845685965, "grad_norm": 0.5149711251865643, "learning_rate": 1.630848917754324e-05, "loss": 0.6695, "step": 1887 }, { "epoch": 0.850642036494706, "grad_norm": 0.4984742675722939, "learning_rate": 1.630482294044714e-05, "loss": 0.6484, "step": 1888 }, { "epoch": 0.8510925884208155, "grad_norm": 0.5129303982990158, "learning_rate": 1.630115529623685e-05, "loss": 0.6494, "step": 1889 }, { "epoch": 0.851543140346925, "grad_norm": 0.5717516236268644, "learning_rate": 1.6297486245730925e-05, "loss": 0.7038, "step": 1890 }, { "epoch": 0.8519936922730345, "grad_norm": 0.48758786382719277, "learning_rate": 1.6293815789748218e-05, "loss": 0.6689, "step": 1891 }, { "epoch": 0.852444244199144, "grad_norm": 0.5436070386103564, "learning_rate": 1.629014392910791e-05, "loss": 0.6536, "step": 1892 }, { "epoch": 0.8528947961252534, "grad_norm": 0.5344304742997343, "learning_rate": 1.628647066462949e-05, "loss": 0.646, "step": 1893 }, { "epoch": 0.853345348051363, "grad_norm": 0.5300461090554128, "learning_rate": 1.6282795997132755e-05, "loss": 0.6369, "step": 1894 }, { "epoch": 0.8537958999774724, "grad_norm": 0.5873706451717392, "learning_rate": 1.627911992743782e-05, "loss": 0.6818, "step": 1895 }, { "epoch": 0.8542464519035818, "grad_norm": 0.5038611500429004, "learning_rate": 1.627544245636511e-05, "loss": 0.673, "step": 1896 }, { "epoch": 0.8546970038296914, "grad_norm": 0.6105366581879929, "learning_rate": 1.6271763584735373e-05, "loss": 0.6459, "step": 1897 }, { "epoch": 0.8551475557558008, "grad_norm": 0.5433973970212502, "learning_rate": 1.6268083313369652e-05, "loss": 0.678, "step": 1898 }, { "epoch": 0.8555981076819104, "grad_norm": 0.5447626238294669, "learning_rate": 1.626440164308932e-05, "loss": 0.6606, "step": 1899 }, { "epoch": 0.8560486596080198, "grad_norm": 0.6288988787039758, "learning_rate": 1.6260718574716054e-05, "loss": 0.7092, "step": 1900 }, { "epoch": 0.8564992115341293, "grad_norm": 0.5102640026748408, "learning_rate": 1.6257034109071836e-05, "loss": 0.6898, "step": 1901 }, { "epoch": 0.8569497634602388, "grad_norm": 0.5569379556918521, "learning_rate": 1.625334824697898e-05, "loss": 0.6381, "step": 1902 }, { "epoch": 0.8574003153863483, "grad_norm": 0.5061212817996537, "learning_rate": 1.6249660989260084e-05, "loss": 0.6432, "step": 1903 }, { "epoch": 0.8578508673124577, "grad_norm": 0.49735233919265986, "learning_rate": 1.624597233673808e-05, "loss": 0.6732, "step": 1904 }, { "epoch": 0.8583014192385673, "grad_norm": 0.519073152366248, "learning_rate": 1.6242282290236208e-05, "loss": 0.6645, "step": 1905 }, { "epoch": 0.8587519711646767, "grad_norm": 0.5233466323668742, "learning_rate": 1.6238590850578004e-05, "loss": 0.6584, "step": 1906 }, { "epoch": 0.8592025230907863, "grad_norm": 0.5152087173709858, "learning_rate": 1.6234898018587336e-05, "loss": 0.7131, "step": 1907 }, { "epoch": 0.8596530750168957, "grad_norm": 0.5221996674644475, "learning_rate": 1.623120379508837e-05, "loss": 0.673, "step": 1908 }, { "epoch": 0.8601036269430051, "grad_norm": 0.5324563185159471, "learning_rate": 1.622750818090558e-05, "loss": 0.6683, "step": 1909 }, { "epoch": 0.8605541788691147, "grad_norm": 0.5181661685227001, "learning_rate": 1.622381117686376e-05, "loss": 0.6349, "step": 1910 }, { "epoch": 0.8610047307952241, "grad_norm": 0.5391225107030339, "learning_rate": 1.622011278378801e-05, "loss": 0.6809, "step": 1911 }, { "epoch": 0.8614552827213336, "grad_norm": 0.5329967622338992, "learning_rate": 1.6216413002503736e-05, "loss": 0.681, "step": 1912 }, { "epoch": 0.8619058346474431, "grad_norm": 0.5235556752318573, "learning_rate": 1.621271183383666e-05, "loss": 0.6852, "step": 1913 }, { "epoch": 0.8623563865735526, "grad_norm": 0.5267774854683875, "learning_rate": 1.620900927861281e-05, "loss": 0.7093, "step": 1914 }, { "epoch": 0.8628069384996621, "grad_norm": 0.5275521789245615, "learning_rate": 1.6205305337658526e-05, "loss": 0.6551, "step": 1915 }, { "epoch": 0.8632574904257716, "grad_norm": 0.5221192996948392, "learning_rate": 1.6201600011800454e-05, "loss": 0.685, "step": 1916 }, { "epoch": 0.863708042351881, "grad_norm": 0.5300868118172679, "learning_rate": 1.619789330186555e-05, "loss": 0.6132, "step": 1917 }, { "epoch": 0.8641585942779906, "grad_norm": 0.5210847517020849, "learning_rate": 1.6194185208681085e-05, "loss": 0.6719, "step": 1918 }, { "epoch": 0.8646091462041, "grad_norm": 0.5451744920798471, "learning_rate": 1.6190475733074627e-05, "loss": 0.6707, "step": 1919 }, { "epoch": 0.8650596981302096, "grad_norm": 0.5546818199637213, "learning_rate": 1.618676487587406e-05, "loss": 0.6533, "step": 1920 }, { "epoch": 0.865510250056319, "grad_norm": 0.49780857750795865, "learning_rate": 1.618305263790758e-05, "loss": 0.6857, "step": 1921 }, { "epoch": 0.8659608019824285, "grad_norm": 0.5098568951387064, "learning_rate": 1.6179339020003685e-05, "loss": 0.6762, "step": 1922 }, { "epoch": 0.866411353908538, "grad_norm": 0.5385416810774136, "learning_rate": 1.617562402299118e-05, "loss": 0.6384, "step": 1923 }, { "epoch": 0.8668619058346474, "grad_norm": 0.49812575570250833, "learning_rate": 1.6171907647699182e-05, "loss": 0.6714, "step": 1924 }, { "epoch": 0.8673124577607569, "grad_norm": 0.5149666382669621, "learning_rate": 1.616818989495711e-05, "loss": 0.6613, "step": 1925 }, { "epoch": 0.8677630096868664, "grad_norm": 0.5157565827861237, "learning_rate": 1.61644707655947e-05, "loss": 0.6431, "step": 1926 }, { "epoch": 0.8682135616129759, "grad_norm": 0.5193419706928085, "learning_rate": 1.616075026044199e-05, "loss": 0.6508, "step": 1927 }, { "epoch": 0.8686641135390853, "grad_norm": 0.5355480594715254, "learning_rate": 1.6157028380329318e-05, "loss": 0.687, "step": 1928 }, { "epoch": 0.8691146654651949, "grad_norm": 0.5300378915905807, "learning_rate": 1.615330512608734e-05, "loss": 0.6353, "step": 1929 }, { "epoch": 0.8695652173913043, "grad_norm": 0.49621985950852376, "learning_rate": 1.6149580498547015e-05, "loss": 0.6896, "step": 1930 }, { "epoch": 0.8700157693174139, "grad_norm": 0.49083038161854803, "learning_rate": 1.6145854498539603e-05, "loss": 0.6464, "step": 1931 }, { "epoch": 0.8704663212435233, "grad_norm": 0.5397139744212004, "learning_rate": 1.6142127126896682e-05, "loss": 0.6733, "step": 1932 }, { "epoch": 0.8709168731696328, "grad_norm": 0.49346894884979775, "learning_rate": 1.613839838445012e-05, "loss": 0.6826, "step": 1933 }, { "epoch": 0.8713674250957423, "grad_norm": 0.4873383850158781, "learning_rate": 1.6134668272032108e-05, "loss": 0.6549, "step": 1934 }, { "epoch": 0.8718179770218518, "grad_norm": 0.506052417113672, "learning_rate": 1.613093679047513e-05, "loss": 0.6635, "step": 1935 }, { "epoch": 0.8722685289479613, "grad_norm": 0.49171925603823585, "learning_rate": 1.6127203940611984e-05, "loss": 0.6393, "step": 1936 }, { "epoch": 0.8727190808740707, "grad_norm": 0.5039181254699718, "learning_rate": 1.6123469723275766e-05, "loss": 0.6409, "step": 1937 }, { "epoch": 0.8731696328001802, "grad_norm": 0.5421604475027372, "learning_rate": 1.611973413929988e-05, "loss": 0.6804, "step": 1938 }, { "epoch": 0.8736201847262897, "grad_norm": 0.5497228729338486, "learning_rate": 1.6115997189518043e-05, "loss": 0.6915, "step": 1939 }, { "epoch": 0.8740707366523992, "grad_norm": 0.5146279093706602, "learning_rate": 1.6112258874764264e-05, "loss": 0.6761, "step": 1940 }, { "epoch": 0.8745212885785086, "grad_norm": 0.4992428826050214, "learning_rate": 1.610851919587286e-05, "loss": 0.6064, "step": 1941 }, { "epoch": 0.8749718405046182, "grad_norm": 0.5645991029718582, "learning_rate": 1.6104778153678467e-05, "loss": 0.6175, "step": 1942 }, { "epoch": 0.8754223924307276, "grad_norm": 0.5516339591401951, "learning_rate": 1.6101035749016e-05, "loss": 0.7204, "step": 1943 }, { "epoch": 0.8758729443568372, "grad_norm": 0.578676361433414, "learning_rate": 1.609729198272069e-05, "loss": 0.6584, "step": 1944 }, { "epoch": 0.8763234962829466, "grad_norm": 0.5433363616319653, "learning_rate": 1.6093546855628085e-05, "loss": 0.6614, "step": 1945 }, { "epoch": 0.8767740482090561, "grad_norm": 0.5376888712608158, "learning_rate": 1.6089800368574013e-05, "loss": 0.6516, "step": 1946 }, { "epoch": 0.8772246001351656, "grad_norm": 0.5444558524758849, "learning_rate": 1.6086052522394625e-05, "loss": 0.6725, "step": 1947 }, { "epoch": 0.8776751520612751, "grad_norm": 0.564886659913815, "learning_rate": 1.6082303317926365e-05, "loss": 0.6672, "step": 1948 }, { "epoch": 0.8781257039873845, "grad_norm": 0.5345926912817267, "learning_rate": 1.6078552756005983e-05, "loss": 0.657, "step": 1949 }, { "epoch": 0.878576255913494, "grad_norm": 0.5755437206403988, "learning_rate": 1.6074800837470532e-05, "loss": 0.6783, "step": 1950 }, { "epoch": 0.8790268078396035, "grad_norm": 0.536181046205415, "learning_rate": 1.6071047563157364e-05, "loss": 0.6799, "step": 1951 }, { "epoch": 0.879477359765713, "grad_norm": 0.5692022531737487, "learning_rate": 1.6067292933904144e-05, "loss": 0.6821, "step": 1952 }, { "epoch": 0.8799279116918225, "grad_norm": 0.49645302741446284, "learning_rate": 1.6063536950548825e-05, "loss": 0.6239, "step": 1953 }, { "epoch": 0.8803784636179319, "grad_norm": 0.5792422856556627, "learning_rate": 1.6059779613929674e-05, "loss": 0.6376, "step": 1954 }, { "epoch": 0.8808290155440415, "grad_norm": 0.5297792297025086, "learning_rate": 1.6056020924885255e-05, "loss": 0.6408, "step": 1955 }, { "epoch": 0.8812795674701509, "grad_norm": 0.531024267615777, "learning_rate": 1.605226088425443e-05, "loss": 0.6217, "step": 1956 }, { "epoch": 0.8817301193962604, "grad_norm": 0.5781404310619398, "learning_rate": 1.6048499492876378e-05, "loss": 0.6636, "step": 1957 }, { "epoch": 0.8821806713223699, "grad_norm": 0.5669813469516876, "learning_rate": 1.6044736751590556e-05, "loss": 0.7093, "step": 1958 }, { "epoch": 0.8826312232484794, "grad_norm": 0.5490741201806235, "learning_rate": 1.6040972661236746e-05, "loss": 0.665, "step": 1959 }, { "epoch": 0.8830817751745889, "grad_norm": 0.5364504898346699, "learning_rate": 1.603720722265501e-05, "loss": 0.6613, "step": 1960 }, { "epoch": 0.8835323271006984, "grad_norm": 0.5290407884497877, "learning_rate": 1.603344043668573e-05, "loss": 0.6518, "step": 1961 }, { "epoch": 0.8839828790268078, "grad_norm": 0.594172921531498, "learning_rate": 1.602967230416957e-05, "loss": 0.6883, "step": 1962 }, { "epoch": 0.8844334309529174, "grad_norm": 0.54861317417747, "learning_rate": 1.602590282594751e-05, "loss": 0.6833, "step": 1963 }, { "epoch": 0.8848839828790268, "grad_norm": 0.5883459659554394, "learning_rate": 1.6022132002860824e-05, "loss": 0.6732, "step": 1964 }, { "epoch": 0.8853345348051362, "grad_norm": 0.5524795898280936, "learning_rate": 1.6018359835751085e-05, "loss": 0.6553, "step": 1965 }, { "epoch": 0.8857850867312458, "grad_norm": 0.5692930893586184, "learning_rate": 1.601458632546017e-05, "loss": 0.6407, "step": 1966 }, { "epoch": 0.8862356386573552, "grad_norm": 0.555706666497748, "learning_rate": 1.6010811472830253e-05, "loss": 0.6776, "step": 1967 }, { "epoch": 0.8866861905834648, "grad_norm": 0.5378246028748245, "learning_rate": 1.6007035278703804e-05, "loss": 0.6826, "step": 1968 }, { "epoch": 0.8871367425095742, "grad_norm": 0.5445617380671465, "learning_rate": 1.60032577439236e-05, "loss": 0.6037, "step": 1969 }, { "epoch": 0.8875872944356837, "grad_norm": 0.5564403070512235, "learning_rate": 1.5999478869332714e-05, "loss": 0.6846, "step": 1970 }, { "epoch": 0.8880378463617932, "grad_norm": 0.5693356095672034, "learning_rate": 1.5995698655774515e-05, "loss": 0.6748, "step": 1971 }, { "epoch": 0.8884883982879027, "grad_norm": 0.5395132005360166, "learning_rate": 1.5991917104092677e-05, "loss": 0.6671, "step": 1972 }, { "epoch": 0.8889389502140121, "grad_norm": 0.5225228687509397, "learning_rate": 1.5988134215131163e-05, "loss": 0.6121, "step": 1973 }, { "epoch": 0.8893895021401217, "grad_norm": 0.5185581341534552, "learning_rate": 1.5984349989734248e-05, "loss": 0.6448, "step": 1974 }, { "epoch": 0.8898400540662311, "grad_norm": 0.5080882952373498, "learning_rate": 1.5980564428746488e-05, "loss": 0.6498, "step": 1975 }, { "epoch": 0.8902906059923407, "grad_norm": 0.5458922221815351, "learning_rate": 1.5976777533012754e-05, "loss": 0.6219, "step": 1976 }, { "epoch": 0.8907411579184501, "grad_norm": 0.5289189080674404, "learning_rate": 1.5972989303378207e-05, "loss": 0.6711, "step": 1977 }, { "epoch": 0.8911917098445595, "grad_norm": 0.5360763372782484, "learning_rate": 1.596919974068831e-05, "loss": 0.6678, "step": 1978 }, { "epoch": 0.8916422617706691, "grad_norm": 0.5172123757183952, "learning_rate": 1.5965408845788808e-05, "loss": 0.6384, "step": 1979 }, { "epoch": 0.8920928136967785, "grad_norm": 0.559559436225015, "learning_rate": 1.5961616619525763e-05, "loss": 0.6912, "step": 1980 }, { "epoch": 0.892543365622888, "grad_norm": 0.5140953910478963, "learning_rate": 1.595782306274553e-05, "loss": 0.6548, "step": 1981 }, { "epoch": 0.8929939175489975, "grad_norm": 0.5291050388340541, "learning_rate": 1.595402817629475e-05, "loss": 0.6989, "step": 1982 }, { "epoch": 0.893444469475107, "grad_norm": 0.5026974535665883, "learning_rate": 1.5950231961020373e-05, "loss": 0.6402, "step": 1983 }, { "epoch": 0.8938950214012165, "grad_norm": 0.5577133898768112, "learning_rate": 1.5946434417769636e-05, "loss": 0.6659, "step": 1984 }, { "epoch": 0.894345573327326, "grad_norm": 0.5421965779699275, "learning_rate": 1.5942635547390083e-05, "loss": 0.6643, "step": 1985 }, { "epoch": 0.8947961252534354, "grad_norm": 0.5345652847459996, "learning_rate": 1.5938835350729543e-05, "loss": 0.6098, "step": 1986 }, { "epoch": 0.895246677179545, "grad_norm": 0.5289902398814134, "learning_rate": 1.593503382863615e-05, "loss": 0.6791, "step": 1987 }, { "epoch": 0.8956972291056544, "grad_norm": 0.5037841498814655, "learning_rate": 1.593123098195833e-05, "loss": 0.612, "step": 1988 }, { "epoch": 0.896147781031764, "grad_norm": 0.5122879531215416, "learning_rate": 1.5927426811544795e-05, "loss": 0.6805, "step": 1989 }, { "epoch": 0.8965983329578734, "grad_norm": 0.5240994844996295, "learning_rate": 1.5923621318244575e-05, "loss": 0.6579, "step": 1990 }, { "epoch": 0.8970488848839829, "grad_norm": 0.5281427704912629, "learning_rate": 1.591981450290698e-05, "loss": 0.6887, "step": 1991 }, { "epoch": 0.8974994368100924, "grad_norm": 0.5221484738043649, "learning_rate": 1.591600636638161e-05, "loss": 0.6978, "step": 1992 }, { "epoch": 0.8979499887362018, "grad_norm": 0.5693985311884532, "learning_rate": 1.591219690951837e-05, "loss": 0.685, "step": 1993 }, { "epoch": 0.8984005406623113, "grad_norm": 0.5249736023376643, "learning_rate": 1.590838613316746e-05, "loss": 0.6632, "step": 1994 }, { "epoch": 0.8988510925884208, "grad_norm": 0.5277518447913128, "learning_rate": 1.5904574038179372e-05, "loss": 0.6537, "step": 1995 }, { "epoch": 0.8993016445145303, "grad_norm": 0.5010076619593572, "learning_rate": 1.590076062540489e-05, "loss": 0.6558, "step": 1996 }, { "epoch": 0.8997521964406398, "grad_norm": 0.5225992245012209, "learning_rate": 1.589694589569509e-05, "loss": 0.6553, "step": 1997 }, { "epoch": 0.9002027483667493, "grad_norm": 0.5526923642116802, "learning_rate": 1.589312984990135e-05, "loss": 0.6742, "step": 1998 }, { "epoch": 0.9006533002928587, "grad_norm": 0.5385313012615812, "learning_rate": 1.5889312488875338e-05, "loss": 0.7154, "step": 1999 }, { "epoch": 0.9011038522189683, "grad_norm": 0.5448329774297684, "learning_rate": 1.5885493813469012e-05, "loss": 0.6555, "step": 2000 }, { "epoch": 0.9015544041450777, "grad_norm": 0.5101967866193727, "learning_rate": 1.5881673824534628e-05, "loss": 0.692, "step": 2001 }, { "epoch": 0.9020049560711872, "grad_norm": 0.5219026768819528, "learning_rate": 1.5877852522924733e-05, "loss": 0.6631, "step": 2002 }, { "epoch": 0.9024555079972967, "grad_norm": 0.5006883856393376, "learning_rate": 1.5874029909492164e-05, "loss": 0.6496, "step": 2003 }, { "epoch": 0.9029060599234062, "grad_norm": 0.5127013724131783, "learning_rate": 1.5870205985090064e-05, "loss": 0.6423, "step": 2004 }, { "epoch": 0.9033566118495157, "grad_norm": 0.5730964319647597, "learning_rate": 1.586638075057185e-05, "loss": 0.6818, "step": 2005 }, { "epoch": 0.9038071637756251, "grad_norm": 0.49170002257667433, "learning_rate": 1.586255420679124e-05, "loss": 0.6472, "step": 2006 }, { "epoch": 0.9042577157017346, "grad_norm": 0.5981291726542217, "learning_rate": 1.5858726354602248e-05, "loss": 0.7001, "step": 2007 }, { "epoch": 0.9047082676278441, "grad_norm": 0.5388581149152775, "learning_rate": 1.5854897194859175e-05, "loss": 0.6411, "step": 2008 }, { "epoch": 0.9051588195539536, "grad_norm": 0.5419946303810427, "learning_rate": 1.5851066728416617e-05, "loss": 0.6852, "step": 2009 }, { "epoch": 0.905609371480063, "grad_norm": 0.5220625717103733, "learning_rate": 1.584723495612946e-05, "loss": 0.6708, "step": 2010 }, { "epoch": 0.9060599234061726, "grad_norm": 0.5159720760997387, "learning_rate": 1.5843401878852876e-05, "loss": 0.6741, "step": 2011 }, { "epoch": 0.906510475332282, "grad_norm": 0.4869520098877117, "learning_rate": 1.5839567497442338e-05, "loss": 0.7048, "step": 2012 }, { "epoch": 0.9069610272583916, "grad_norm": 0.5279993361322587, "learning_rate": 1.58357318127536e-05, "loss": 0.6677, "step": 2013 }, { "epoch": 0.907411579184501, "grad_norm": 0.5033620869443199, "learning_rate": 1.5831894825642723e-05, "loss": 0.6346, "step": 2014 }, { "epoch": 0.9078621311106105, "grad_norm": 0.5699149725511546, "learning_rate": 1.5828056536966042e-05, "loss": 0.619, "step": 2015 }, { "epoch": 0.90831268303672, "grad_norm": 0.4998351427091685, "learning_rate": 1.5824216947580183e-05, "loss": 0.6893, "step": 2016 }, { "epoch": 0.9087632349628295, "grad_norm": 0.560904911216051, "learning_rate": 1.5820376058342077e-05, "loss": 0.6719, "step": 2017 }, { "epoch": 0.909213786888939, "grad_norm": 0.5278833720308567, "learning_rate": 1.5816533870108934e-05, "loss": 0.6519, "step": 2018 }, { "epoch": 0.9096643388150484, "grad_norm": 0.5283638658663239, "learning_rate": 1.5812690383738253e-05, "loss": 0.7126, "step": 2019 }, { "epoch": 0.9101148907411579, "grad_norm": 0.4949318448635655, "learning_rate": 1.5808845600087827e-05, "loss": 0.647, "step": 2020 }, { "epoch": 0.9105654426672674, "grad_norm": 0.537778919429307, "learning_rate": 1.5804999520015735e-05, "loss": 0.6821, "step": 2021 }, { "epoch": 0.9110159945933769, "grad_norm": 0.48958641647966283, "learning_rate": 1.5801152144380353e-05, "loss": 0.6593, "step": 2022 }, { "epoch": 0.9114665465194863, "grad_norm": 0.5265191494338296, "learning_rate": 1.5797303474040332e-05, "loss": 0.6522, "step": 2023 }, { "epoch": 0.9119170984455959, "grad_norm": 0.5043174903961868, "learning_rate": 1.5793453509854632e-05, "loss": 0.6426, "step": 2024 }, { "epoch": 0.9123676503717053, "grad_norm": 0.5052917742703693, "learning_rate": 1.5789602252682482e-05, "loss": 0.6631, "step": 2025 }, { "epoch": 0.9128182022978149, "grad_norm": 0.5369848814359979, "learning_rate": 1.5785749703383408e-05, "loss": 0.6721, "step": 2026 }, { "epoch": 0.9132687542239243, "grad_norm": 0.5551852576873637, "learning_rate": 1.578189586281723e-05, "loss": 0.6711, "step": 2027 }, { "epoch": 0.9137193061500338, "grad_norm": 0.5393416185216897, "learning_rate": 1.577804073184404e-05, "loss": 0.683, "step": 2028 }, { "epoch": 0.9141698580761433, "grad_norm": 0.5225955896922032, "learning_rate": 1.5774184311324237e-05, "loss": 0.6263, "step": 2029 }, { "epoch": 0.9146204100022528, "grad_norm": 0.5173816264664313, "learning_rate": 1.5770326602118502e-05, "loss": 0.6602, "step": 2030 }, { "epoch": 0.9150709619283622, "grad_norm": 0.5233330049294469, "learning_rate": 1.5766467605087793e-05, "loss": 0.6856, "step": 2031 }, { "epoch": 0.9155215138544718, "grad_norm": 0.5224088340558598, "learning_rate": 1.5762607321093368e-05, "loss": 0.6517, "step": 2032 }, { "epoch": 0.9159720657805812, "grad_norm": 0.5695844974343865, "learning_rate": 1.5758745750996767e-05, "loss": 0.7038, "step": 2033 }, { "epoch": 0.9164226177066906, "grad_norm": 0.5242076669955836, "learning_rate": 1.5754882895659816e-05, "loss": 0.7078, "step": 2034 }, { "epoch": 0.9168731696328002, "grad_norm": 0.5433713771096753, "learning_rate": 1.5751018755944628e-05, "loss": 0.6666, "step": 2035 }, { "epoch": 0.9173237215589096, "grad_norm": 0.5263095508652006, "learning_rate": 1.5747153332713605e-05, "loss": 0.6515, "step": 2036 }, { "epoch": 0.9177742734850192, "grad_norm": 0.5519739851077745, "learning_rate": 1.5743286626829437e-05, "loss": 0.6971, "step": 2037 }, { "epoch": 0.9182248254111286, "grad_norm": 0.5125230880945615, "learning_rate": 1.5739418639155097e-05, "loss": 0.6385, "step": 2038 }, { "epoch": 0.9186753773372381, "grad_norm": 0.5219448311603279, "learning_rate": 1.5735549370553847e-05, "loss": 0.6432, "step": 2039 }, { "epoch": 0.9191259292633476, "grad_norm": 0.515527450684626, "learning_rate": 1.5731678821889225e-05, "loss": 0.6772, "step": 2040 }, { "epoch": 0.9195764811894571, "grad_norm": 0.4809304604813576, "learning_rate": 1.5727806994025068e-05, "loss": 0.6384, "step": 2041 }, { "epoch": 0.9200270331155666, "grad_norm": 0.5167563195265864, "learning_rate": 1.5723933887825492e-05, "loss": 0.6583, "step": 2042 }, { "epoch": 0.9204775850416761, "grad_norm": 0.5336009269136912, "learning_rate": 1.5720059504154898e-05, "loss": 0.6888, "step": 2043 }, { "epoch": 0.9209281369677855, "grad_norm": 0.5312508757455741, "learning_rate": 1.5716183843877977e-05, "loss": 0.6733, "step": 2044 }, { "epoch": 0.9213786888938951, "grad_norm": 0.49858558968300476, "learning_rate": 1.57123069078597e-05, "loss": 0.6466, "step": 2045 }, { "epoch": 0.9218292408200045, "grad_norm": 0.5632470759472132, "learning_rate": 1.570842869696532e-05, "loss": 0.6717, "step": 2046 }, { "epoch": 0.9222797927461139, "grad_norm": 0.5062402298071194, "learning_rate": 1.5704549212060383e-05, "loss": 0.6197, "step": 2047 }, { "epoch": 0.9227303446722235, "grad_norm": 0.5403047420067593, "learning_rate": 1.570066845401071e-05, "loss": 0.6846, "step": 2048 }, { "epoch": 0.9231808965983329, "grad_norm": 0.5227696077651769, "learning_rate": 1.5696786423682414e-05, "loss": 0.6506, "step": 2049 }, { "epoch": 0.9236314485244425, "grad_norm": 0.5326646906249989, "learning_rate": 1.569290312194189e-05, "loss": 0.6515, "step": 2050 }, { "epoch": 0.9240820004505519, "grad_norm": 0.5005348864036909, "learning_rate": 1.5689018549655815e-05, "loss": 0.65, "step": 2051 }, { "epoch": 0.9245325523766614, "grad_norm": 0.5504801338556821, "learning_rate": 1.568513270769115e-05, "loss": 0.6572, "step": 2052 }, { "epoch": 0.9249831043027709, "grad_norm": 0.5241921483400896, "learning_rate": 1.5681245596915137e-05, "loss": 0.6411, "step": 2053 }, { "epoch": 0.9254336562288804, "grad_norm": 0.5221444286008937, "learning_rate": 1.5677357218195307e-05, "loss": 0.6712, "step": 2054 }, { "epoch": 0.9258842081549898, "grad_norm": 0.5788683760204377, "learning_rate": 1.5673467572399466e-05, "loss": 0.6892, "step": 2055 }, { "epoch": 0.9263347600810994, "grad_norm": 0.558866251362816, "learning_rate": 1.5669576660395716e-05, "loss": 0.6881, "step": 2056 }, { "epoch": 0.9267853120072088, "grad_norm": 0.5518916126004071, "learning_rate": 1.5665684483052425e-05, "loss": 0.6618, "step": 2057 }, { "epoch": 0.9272358639333184, "grad_norm": 0.5237552553822056, "learning_rate": 1.5661791041238258e-05, "loss": 0.6521, "step": 2058 }, { "epoch": 0.9276864158594278, "grad_norm": 0.551129553430645, "learning_rate": 1.5657896335822147e-05, "loss": 0.6539, "step": 2059 }, { "epoch": 0.9281369677855373, "grad_norm": 0.5441565292023076, "learning_rate": 1.5654000367673322e-05, "loss": 0.6725, "step": 2060 }, { "epoch": 0.9285875197116468, "grad_norm": 0.5597106880407459, "learning_rate": 1.5650103137661285e-05, "loss": 0.6801, "step": 2061 }, { "epoch": 0.9290380716377562, "grad_norm": 0.4988063945830279, "learning_rate": 1.564620464665582e-05, "loss": 0.6219, "step": 2062 }, { "epoch": 0.9294886235638657, "grad_norm": 0.6231314978156812, "learning_rate": 1.5642304895527005e-05, "loss": 0.6958, "step": 2063 }, { "epoch": 0.9299391754899752, "grad_norm": 0.5740802056824497, "learning_rate": 1.5638403885145175e-05, "loss": 0.695, "step": 2064 }, { "epoch": 0.9303897274160847, "grad_norm": 0.6045517546049897, "learning_rate": 1.5634501616380967e-05, "loss": 0.6118, "step": 2065 }, { "epoch": 0.9308402793421942, "grad_norm": 0.5884633998537847, "learning_rate": 1.563059809010529e-05, "loss": 0.644, "step": 2066 }, { "epoch": 0.9312908312683037, "grad_norm": 0.5351950435225182, "learning_rate": 1.5626693307189334e-05, "loss": 0.6704, "step": 2067 }, { "epoch": 0.9317413831944131, "grad_norm": 0.6077568821908993, "learning_rate": 1.562278726850458e-05, "loss": 0.6874, "step": 2068 }, { "epoch": 0.9321919351205227, "grad_norm": 0.5267129761560163, "learning_rate": 1.5618879974922765e-05, "loss": 0.6737, "step": 2069 }, { "epoch": 0.9326424870466321, "grad_norm": 0.49463875851163214, "learning_rate": 1.561497142731593e-05, "loss": 0.6643, "step": 2070 }, { "epoch": 0.9330930389727417, "grad_norm": 0.5632942832429155, "learning_rate": 1.561106162655639e-05, "loss": 0.6498, "step": 2071 }, { "epoch": 0.9335435908988511, "grad_norm": 0.5419849203110403, "learning_rate": 1.560715057351673e-05, "loss": 0.6798, "step": 2072 }, { "epoch": 0.9339941428249606, "grad_norm": 0.5517104184297006, "learning_rate": 1.560323826906982e-05, "loss": 0.6843, "step": 2073 }, { "epoch": 0.9344446947510701, "grad_norm": 0.5549696010076861, "learning_rate": 1.5599324714088823e-05, "loss": 0.6867, "step": 2074 }, { "epoch": 0.9348952466771795, "grad_norm": 0.49606738437646364, "learning_rate": 1.559540990944715e-05, "loss": 0.6085, "step": 2075 }, { "epoch": 0.935345798603289, "grad_norm": 0.5593636318888514, "learning_rate": 1.5591493856018526e-05, "loss": 0.644, "step": 2076 }, { "epoch": 0.9357963505293985, "grad_norm": 0.5254717211018337, "learning_rate": 1.5587576554676927e-05, "loss": 0.6596, "step": 2077 }, { "epoch": 0.936246902455508, "grad_norm": 0.5963370972976311, "learning_rate": 1.5583658006296626e-05, "loss": 0.6514, "step": 2078 }, { "epoch": 0.9366974543816174, "grad_norm": 0.5055684425103384, "learning_rate": 1.5579738211752165e-05, "loss": 0.6873, "step": 2079 }, { "epoch": 0.937148006307727, "grad_norm": 0.546847850451453, "learning_rate": 1.557581717191836e-05, "loss": 0.6489, "step": 2080 }, { "epoch": 0.9375985582338364, "grad_norm": 0.5142106447586308, "learning_rate": 1.557189488767032e-05, "loss": 0.6562, "step": 2081 }, { "epoch": 0.938049110159946, "grad_norm": 0.512754256649422, "learning_rate": 1.556797135988342e-05, "loss": 0.6678, "step": 2082 }, { "epoch": 0.9384996620860554, "grad_norm": 0.5075318385546839, "learning_rate": 1.5564046589433312e-05, "loss": 0.6502, "step": 2083 }, { "epoch": 0.9389502140121649, "grad_norm": 0.5227358604637687, "learning_rate": 1.5560120577195933e-05, "loss": 0.6781, "step": 2084 }, { "epoch": 0.9394007659382744, "grad_norm": 0.4995149865317715, "learning_rate": 1.555619332404749e-05, "loss": 0.6797, "step": 2085 }, { "epoch": 0.9398513178643839, "grad_norm": 0.5207158175535739, "learning_rate": 1.555226483086447e-05, "loss": 0.6756, "step": 2086 }, { "epoch": 0.9403018697904933, "grad_norm": 0.5598835648740843, "learning_rate": 1.5548335098523634e-05, "loss": 0.673, "step": 2087 }, { "epoch": 0.9407524217166029, "grad_norm": 0.5089818969023402, "learning_rate": 1.5544404127902027e-05, "loss": 0.6419, "step": 2088 }, { "epoch": 0.9412029736427123, "grad_norm": 0.5490259017134944, "learning_rate": 1.5540471919876966e-05, "loss": 0.6516, "step": 2089 }, { "epoch": 0.9416535255688218, "grad_norm": 0.528830015027588, "learning_rate": 1.5536538475326037e-05, "loss": 0.6684, "step": 2090 }, { "epoch": 0.9421040774949313, "grad_norm": 0.5808117934864314, "learning_rate": 1.5532603795127113e-05, "loss": 0.634, "step": 2091 }, { "epoch": 0.9425546294210407, "grad_norm": 0.5158175666205743, "learning_rate": 1.5528667880158338e-05, "loss": 0.6426, "step": 2092 }, { "epoch": 0.9430051813471503, "grad_norm": 0.5653036648866141, "learning_rate": 1.5524730731298136e-05, "loss": 0.6499, "step": 2093 }, { "epoch": 0.9434557332732597, "grad_norm": 0.5251899490211328, "learning_rate": 1.552079234942519e-05, "loss": 0.6449, "step": 2094 }, { "epoch": 0.9439062851993693, "grad_norm": 0.5568661480226216, "learning_rate": 1.5516852735418483e-05, "loss": 0.6673, "step": 2095 }, { "epoch": 0.9443568371254787, "grad_norm": 0.555575439331069, "learning_rate": 1.5512911890157253e-05, "loss": 0.6542, "step": 2096 }, { "epoch": 0.9448073890515882, "grad_norm": 0.5589050414101941, "learning_rate": 1.5508969814521026e-05, "loss": 0.6542, "step": 2097 }, { "epoch": 0.9452579409776977, "grad_norm": 0.5666188620279057, "learning_rate": 1.5505026509389595e-05, "loss": 0.6387, "step": 2098 }, { "epoch": 0.9457084929038072, "grad_norm": 0.5491557007313722, "learning_rate": 1.5501081975643023e-05, "loss": 0.6466, "step": 2099 }, { "epoch": 0.9461590448299166, "grad_norm": 0.5519641579448142, "learning_rate": 1.5497136214161662e-05, "loss": 0.6566, "step": 2100 }, { "epoch": 0.9466095967560262, "grad_norm": 0.5233132623802725, "learning_rate": 1.5493189225826125e-05, "loss": 0.6496, "step": 2101 }, { "epoch": 0.9470601486821356, "grad_norm": 0.5764777932665365, "learning_rate": 1.5489241011517303e-05, "loss": 0.663, "step": 2102 }, { "epoch": 0.947510700608245, "grad_norm": 0.5559379851467695, "learning_rate": 1.548529157211636e-05, "loss": 0.6623, "step": 2103 }, { "epoch": 0.9479612525343546, "grad_norm": 0.5648490529221545, "learning_rate": 1.548134090850474e-05, "loss": 0.6819, "step": 2104 }, { "epoch": 0.948411804460464, "grad_norm": 0.5142092047417908, "learning_rate": 1.547738902156415e-05, "loss": 0.6635, "step": 2105 }, { "epoch": 0.9488623563865736, "grad_norm": 0.5677104905303226, "learning_rate": 1.5473435912176573e-05, "loss": 0.6824, "step": 2106 }, { "epoch": 0.949312908312683, "grad_norm": 0.5321979368094254, "learning_rate": 1.5469481581224274e-05, "loss": 0.6569, "step": 2107 }, { "epoch": 0.9497634602387925, "grad_norm": 0.5595482515980114, "learning_rate": 1.546552602958977e-05, "loss": 0.6663, "step": 2108 }, { "epoch": 0.950214012164902, "grad_norm": 0.5003160806040748, "learning_rate": 1.5461569258155872e-05, "loss": 0.6085, "step": 2109 }, { "epoch": 0.9506645640910115, "grad_norm": 0.5092646416545348, "learning_rate": 1.5457611267805657e-05, "loss": 0.6521, "step": 2110 }, { "epoch": 0.951115116017121, "grad_norm": 0.5119985396121255, "learning_rate": 1.5453652059422468e-05, "loss": 0.6628, "step": 2111 }, { "epoch": 0.9515656679432305, "grad_norm": 0.48560395083833746, "learning_rate": 1.5449691633889924e-05, "loss": 0.668, "step": 2112 }, { "epoch": 0.9520162198693399, "grad_norm": 0.4959678047047878, "learning_rate": 1.544572999209191e-05, "loss": 0.6705, "step": 2113 }, { "epoch": 0.9524667717954495, "grad_norm": 0.51308773953676, "learning_rate": 1.5441767134912597e-05, "loss": 0.6585, "step": 2114 }, { "epoch": 0.9529173237215589, "grad_norm": 0.5255563858305244, "learning_rate": 1.543780306323641e-05, "loss": 0.6748, "step": 2115 }, { "epoch": 0.9533678756476683, "grad_norm": 0.4858447219492103, "learning_rate": 1.5433837777948058e-05, "loss": 0.669, "step": 2116 }, { "epoch": 0.9538184275737779, "grad_norm": 0.510221032872435, "learning_rate": 1.5429871279932514e-05, "loss": 0.6743, "step": 2117 }, { "epoch": 0.9542689794998873, "grad_norm": 0.501071975757376, "learning_rate": 1.5425903570075023e-05, "loss": 0.6377, "step": 2118 }, { "epoch": 0.9547195314259969, "grad_norm": 0.49870366544239103, "learning_rate": 1.54219346492611e-05, "loss": 0.654, "step": 2119 }, { "epoch": 0.9551700833521063, "grad_norm": 0.5684212651494892, "learning_rate": 1.5417964518376532e-05, "loss": 0.6601, "step": 2120 }, { "epoch": 0.9556206352782158, "grad_norm": 0.4886385090273265, "learning_rate": 1.541399317830738e-05, "loss": 0.6567, "step": 2121 }, { "epoch": 0.9560711872043253, "grad_norm": 0.5228727320052189, "learning_rate": 1.5410020629939966e-05, "loss": 0.6679, "step": 2122 }, { "epoch": 0.9565217391304348, "grad_norm": 0.51794769529738, "learning_rate": 1.5406046874160882e-05, "loss": 0.6865, "step": 2123 }, { "epoch": 0.9569722910565442, "grad_norm": 0.5986006087337848, "learning_rate": 1.5402071911856995e-05, "loss": 0.6176, "step": 2124 }, { "epoch": 0.9574228429826538, "grad_norm": 0.49752767059019587, "learning_rate": 1.5398095743915448e-05, "loss": 0.655, "step": 2125 }, { "epoch": 0.9578733949087632, "grad_norm": 0.5430236550028833, "learning_rate": 1.539411837122363e-05, "loss": 0.6353, "step": 2126 }, { "epoch": 0.9583239468348728, "grad_norm": 0.5538830742328424, "learning_rate": 1.5390139794669225e-05, "loss": 0.6714, "step": 2127 }, { "epoch": 0.9587744987609822, "grad_norm": 0.5242326392348307, "learning_rate": 1.5386160015140168e-05, "loss": 0.6878, "step": 2128 }, { "epoch": 0.9592250506870917, "grad_norm": 0.5567700542500603, "learning_rate": 1.5382179033524672e-05, "loss": 0.6572, "step": 2129 }, { "epoch": 0.9596756026132012, "grad_norm": 0.5046067781816161, "learning_rate": 1.5378196850711212e-05, "loss": 0.6404, "step": 2130 }, { "epoch": 0.9601261545393106, "grad_norm": 0.5518208742896313, "learning_rate": 1.5374213467588538e-05, "loss": 0.6629, "step": 2131 }, { "epoch": 0.9605767064654201, "grad_norm": 0.5037710839528792, "learning_rate": 1.5370228885045662e-05, "loss": 0.6699, "step": 2132 }, { "epoch": 0.9610272583915296, "grad_norm": 0.5304511990324116, "learning_rate": 1.5366243103971864e-05, "loss": 0.6734, "step": 2133 }, { "epoch": 0.9614778103176391, "grad_norm": 0.5200434450963733, "learning_rate": 1.5362256125256694e-05, "loss": 0.6374, "step": 2134 }, { "epoch": 0.9619283622437486, "grad_norm": 0.5349883383497511, "learning_rate": 1.5358267949789968e-05, "loss": 0.6816, "step": 2135 }, { "epoch": 0.9623789141698581, "grad_norm": 0.5428064857483109, "learning_rate": 1.5354278578461767e-05, "loss": 0.6661, "step": 2136 }, { "epoch": 0.9628294660959675, "grad_norm": 0.5392800758248344, "learning_rate": 1.535028801216245e-05, "loss": 0.6458, "step": 2137 }, { "epoch": 0.9632800180220771, "grad_norm": 0.5340910261545889, "learning_rate": 1.5346296251782625e-05, "loss": 0.6474, "step": 2138 }, { "epoch": 0.9637305699481865, "grad_norm": 0.5405483860133489, "learning_rate": 1.534230329821318e-05, "loss": 0.655, "step": 2139 }, { "epoch": 0.964181121874296, "grad_norm": 0.510966042592002, "learning_rate": 1.5338309152345262e-05, "loss": 0.6679, "step": 2140 }, { "epoch": 0.9646316738004055, "grad_norm": 0.5126680933234994, "learning_rate": 1.5334313815070286e-05, "loss": 0.6584, "step": 2141 }, { "epoch": 0.965082225726515, "grad_norm": 0.7042314957039754, "learning_rate": 1.533031728727994e-05, "loss": 0.6469, "step": 2142 }, { "epoch": 0.9655327776526245, "grad_norm": 0.5053156950246153, "learning_rate": 1.5326319569866167e-05, "loss": 0.6571, "step": 2143 }, { "epoch": 0.9659833295787339, "grad_norm": 0.511302864037444, "learning_rate": 1.532232066372118e-05, "loss": 0.6283, "step": 2144 }, { "epoch": 0.9664338815048434, "grad_norm": 0.5127248294215326, "learning_rate": 1.5318320569737454e-05, "loss": 0.634, "step": 2145 }, { "epoch": 0.9668844334309529, "grad_norm": 0.5781504582261674, "learning_rate": 1.5314319288807737e-05, "loss": 0.6589, "step": 2146 }, { "epoch": 0.9673349853570624, "grad_norm": 0.49375840618598876, "learning_rate": 1.531031682182504e-05, "loss": 0.659, "step": 2147 }, { "epoch": 0.9677855372831718, "grad_norm": 0.5542613846372022, "learning_rate": 1.530631316968263e-05, "loss": 0.6852, "step": 2148 }, { "epoch": 0.9682360892092814, "grad_norm": 0.48501242059461763, "learning_rate": 1.530230833327405e-05, "loss": 0.6508, "step": 2149 }, { "epoch": 0.9686866411353908, "grad_norm": 0.5313947528234524, "learning_rate": 1.5298302313493095e-05, "loss": 0.6271, "step": 2150 }, { "epoch": 0.9691371930615004, "grad_norm": 0.555763453651329, "learning_rate": 1.5294295111233837e-05, "loss": 0.6771, "step": 2151 }, { "epoch": 0.9695877449876098, "grad_norm": 0.5444262114187387, "learning_rate": 1.5290286727390604e-05, "loss": 0.6502, "step": 2152 }, { "epoch": 0.9700382969137193, "grad_norm": 0.5577406469022342, "learning_rate": 1.5286277162857988e-05, "loss": 0.652, "step": 2153 }, { "epoch": 0.9704888488398288, "grad_norm": 0.5230627556730606, "learning_rate": 1.5282266418530846e-05, "loss": 0.6402, "step": 2154 }, { "epoch": 0.9709394007659383, "grad_norm": 0.5155811000892606, "learning_rate": 1.5278254495304298e-05, "loss": 0.6693, "step": 2155 }, { "epoch": 0.9713899526920478, "grad_norm": 0.5235331405275967, "learning_rate": 1.5274241394073733e-05, "loss": 0.6592, "step": 2156 }, { "epoch": 0.9718405046181573, "grad_norm": 0.4914544840914451, "learning_rate": 1.527022711573479e-05, "loss": 0.6734, "step": 2157 }, { "epoch": 0.9722910565442667, "grad_norm": 0.5008560659927994, "learning_rate": 1.5266211661183385e-05, "loss": 0.6351, "step": 2158 }, { "epoch": 0.9727416084703762, "grad_norm": 0.4971098558153854, "learning_rate": 1.526219503131568e-05, "loss": 0.6405, "step": 2159 }, { "epoch": 0.9731921603964857, "grad_norm": 0.5050591734293671, "learning_rate": 1.5258177227028118e-05, "loss": 0.702, "step": 2160 }, { "epoch": 0.9736427123225951, "grad_norm": 0.5197644752627226, "learning_rate": 1.525415824921739e-05, "loss": 0.6707, "step": 2161 }, { "epoch": 0.9740932642487047, "grad_norm": 0.527562260752817, "learning_rate": 1.5250138098780456e-05, "loss": 0.6809, "step": 2162 }, { "epoch": 0.9745438161748141, "grad_norm": 0.534658583616158, "learning_rate": 1.524611677661454e-05, "loss": 0.6728, "step": 2163 }, { "epoch": 0.9749943681009237, "grad_norm": 0.5819939785466004, "learning_rate": 1.524209428361711e-05, "loss": 0.7237, "step": 2164 }, { "epoch": 0.9754449200270331, "grad_norm": 0.5158374871466588, "learning_rate": 1.5238070620685923e-05, "loss": 0.6842, "step": 2165 }, { "epoch": 0.9758954719531426, "grad_norm": 0.5692194301267326, "learning_rate": 1.5234045788718969e-05, "loss": 0.6482, "step": 2166 }, { "epoch": 0.9763460238792521, "grad_norm": 0.5598635874686371, "learning_rate": 1.5230019788614527e-05, "loss": 0.6662, "step": 2167 }, { "epoch": 0.9767965758053616, "grad_norm": 0.5519990943323733, "learning_rate": 1.5225992621271112e-05, "loss": 0.6751, "step": 2168 }, { "epoch": 0.977247127731471, "grad_norm": 0.5270677390976147, "learning_rate": 1.5221964287587512e-05, "loss": 0.6593, "step": 2169 }, { "epoch": 0.9776976796575806, "grad_norm": 0.506671723660425, "learning_rate": 1.5217934788462774e-05, "loss": 0.6435, "step": 2170 }, { "epoch": 0.97814823158369, "grad_norm": 0.5517214212656759, "learning_rate": 1.5213904124796201e-05, "loss": 0.6665, "step": 2171 }, { "epoch": 0.9785987835097995, "grad_norm": 0.5300845183631239, "learning_rate": 1.5209872297487365e-05, "loss": 0.6982, "step": 2172 }, { "epoch": 0.979049335435909, "grad_norm": 0.592217790185578, "learning_rate": 1.5205839307436088e-05, "loss": 0.6295, "step": 2173 }, { "epoch": 0.9794998873620184, "grad_norm": 0.5414434134428355, "learning_rate": 1.5201805155542453e-05, "loss": 0.6711, "step": 2174 }, { "epoch": 0.979950439288128, "grad_norm": 0.535455013398276, "learning_rate": 1.5197769842706807e-05, "loss": 0.651, "step": 2175 }, { "epoch": 0.9804009912142374, "grad_norm": 0.5115074085368884, "learning_rate": 1.5193733369829754e-05, "loss": 0.6885, "step": 2176 }, { "epoch": 0.980851543140347, "grad_norm": 0.520230259674952, "learning_rate": 1.5189695737812153e-05, "loss": 0.6596, "step": 2177 }, { "epoch": 0.9813020950664564, "grad_norm": 0.49312860245261164, "learning_rate": 1.518565694755513e-05, "loss": 0.6509, "step": 2178 }, { "epoch": 0.9817526469925659, "grad_norm": 0.49496643914089405, "learning_rate": 1.5181616999960061e-05, "loss": 0.6764, "step": 2179 }, { "epoch": 0.9822031989186754, "grad_norm": 0.5294813199468179, "learning_rate": 1.5177575895928585e-05, "loss": 0.6443, "step": 2180 }, { "epoch": 0.9826537508447849, "grad_norm": 0.49157337650609156, "learning_rate": 1.5173533636362599e-05, "loss": 0.6839, "step": 2181 }, { "epoch": 0.9831043027708943, "grad_norm": 0.5398795816365327, "learning_rate": 1.5169490222164255e-05, "loss": 0.6908, "step": 2182 }, { "epoch": 0.9835548546970039, "grad_norm": 0.49763726798873004, "learning_rate": 1.5165445654235968e-05, "loss": 0.6936, "step": 2183 }, { "epoch": 0.9840054066231133, "grad_norm": 0.5229554903514371, "learning_rate": 1.5161399933480402e-05, "loss": 0.6619, "step": 2184 }, { "epoch": 0.9844559585492227, "grad_norm": 0.5028078840719078, "learning_rate": 1.5157353060800492e-05, "loss": 0.6428, "step": 2185 }, { "epoch": 0.9849065104753323, "grad_norm": 0.5117614032566498, "learning_rate": 1.5153305037099413e-05, "loss": 0.6042, "step": 2186 }, { "epoch": 0.9853570624014417, "grad_norm": 0.4873028759975156, "learning_rate": 1.5149255863280607e-05, "loss": 0.6514, "step": 2187 }, { "epoch": 0.9858076143275513, "grad_norm": 0.5188899914974723, "learning_rate": 1.5145205540247774e-05, "loss": 0.6488, "step": 2188 }, { "epoch": 0.9862581662536607, "grad_norm": 0.48673309340749626, "learning_rate": 1.5141154068904864e-05, "loss": 0.6906, "step": 2189 }, { "epoch": 0.9867087181797702, "grad_norm": 0.5234553454685003, "learning_rate": 1.513710145015609e-05, "loss": 0.6412, "step": 2190 }, { "epoch": 0.9871592701058797, "grad_norm": 0.5353649031208162, "learning_rate": 1.5133047684905916e-05, "loss": 0.6735, "step": 2191 }, { "epoch": 0.9876098220319892, "grad_norm": 0.5276969379354292, "learning_rate": 1.5128992774059063e-05, "loss": 0.6834, "step": 2192 }, { "epoch": 0.9880603739580986, "grad_norm": 0.5797656514672573, "learning_rate": 1.512493671852051e-05, "loss": 0.6702, "step": 2193 }, { "epoch": 0.9885109258842082, "grad_norm": 0.5320809432517186, "learning_rate": 1.5120879519195493e-05, "loss": 0.6757, "step": 2194 }, { "epoch": 0.9889614778103176, "grad_norm": 0.550996834419774, "learning_rate": 1.5116821176989494e-05, "loss": 0.7028, "step": 2195 }, { "epoch": 0.9894120297364272, "grad_norm": 0.5157677067641773, "learning_rate": 1.5112761692808258e-05, "loss": 0.6442, "step": 2196 }, { "epoch": 0.9898625816625366, "grad_norm": 0.5220534601635931, "learning_rate": 1.5108701067557787e-05, "loss": 0.6715, "step": 2197 }, { "epoch": 0.9903131335886461, "grad_norm": 0.49846370771440335, "learning_rate": 1.5104639302144327e-05, "loss": 0.6322, "step": 2198 }, { "epoch": 0.9907636855147556, "grad_norm": 0.530231309217841, "learning_rate": 1.5100576397474395e-05, "loss": 0.6739, "step": 2199 }, { "epoch": 0.991214237440865, "grad_norm": 0.4985323919716359, "learning_rate": 1.5096512354454745e-05, "loss": 0.6824, "step": 2200 }, { "epoch": 0.9916647893669746, "grad_norm": 0.5303069212729016, "learning_rate": 1.509244717399239e-05, "loss": 0.6287, "step": 2201 }, { "epoch": 0.992115341293084, "grad_norm": 0.5265945732139566, "learning_rate": 1.5088380856994608e-05, "loss": 0.6303, "step": 2202 }, { "epoch": 0.9925658932191935, "grad_norm": 0.513087651109303, "learning_rate": 1.5084313404368917e-05, "loss": 0.6383, "step": 2203 }, { "epoch": 0.993016445145303, "grad_norm": 0.5093632720169649, "learning_rate": 1.5080244817023096e-05, "loss": 0.6686, "step": 2204 }, { "epoch": 0.9934669970714125, "grad_norm": 0.5179313116811184, "learning_rate": 1.5076175095865171e-05, "loss": 0.6749, "step": 2205 }, { "epoch": 0.9939175489975219, "grad_norm": 0.5027531738000897, "learning_rate": 1.5072104241803427e-05, "loss": 0.6436, "step": 2206 }, { "epoch": 0.9943681009236315, "grad_norm": 0.5167922024044844, "learning_rate": 1.50680322557464e-05, "loss": 0.684, "step": 2207 }, { "epoch": 0.9948186528497409, "grad_norm": 0.5074236245612974, "learning_rate": 1.5063959138602879e-05, "loss": 0.6619, "step": 2208 }, { "epoch": 0.9952692047758505, "grad_norm": 0.5110051907604202, "learning_rate": 1.5059884891281904e-05, "loss": 0.6597, "step": 2209 }, { "epoch": 0.9957197567019599, "grad_norm": 0.4825158540306631, "learning_rate": 1.5055809514692766e-05, "loss": 0.6615, "step": 2210 }, { "epoch": 0.9961703086280694, "grad_norm": 0.5476080196817861, "learning_rate": 1.5051733009745013e-05, "loss": 0.669, "step": 2211 }, { "epoch": 0.9966208605541789, "grad_norm": 0.4925495582976271, "learning_rate": 1.504765537734844e-05, "loss": 0.6644, "step": 2212 }, { "epoch": 0.9970714124802883, "grad_norm": 0.5149418257041116, "learning_rate": 1.5043576618413095e-05, "loss": 0.6401, "step": 2213 }, { "epoch": 0.9975219644063978, "grad_norm": 0.5302618148569721, "learning_rate": 1.5039496733849279e-05, "loss": 0.6751, "step": 2214 }, { "epoch": 0.9979725163325073, "grad_norm": 0.4914039917030782, "learning_rate": 1.5035415724567544e-05, "loss": 0.6877, "step": 2215 }, { "epoch": 0.9984230682586168, "grad_norm": 0.5494668502726232, "learning_rate": 1.5031333591478689e-05, "loss": 0.6872, "step": 2216 }, { "epoch": 0.9988736201847263, "grad_norm": 0.5273905658622525, "learning_rate": 1.5027250335493771e-05, "loss": 0.649, "step": 2217 }, { "epoch": 0.9993241721108358, "grad_norm": 0.5158295115894378, "learning_rate": 1.5023165957524094e-05, "loss": 0.6781, "step": 2218 }, { "epoch": 0.9997747240369452, "grad_norm": 0.5565212485159621, "learning_rate": 1.5019080458481203e-05, "loss": 0.7087, "step": 2219 }, { "epoch": 1.0002252759630548, "grad_norm": 0.5125038940472109, "learning_rate": 1.5014993839276914e-05, "loss": 0.6465, "step": 2220 }, { "epoch": 1.0002252759630548, "eval_loss": 0.6317197680473328, "eval_runtime": 24.313, "eval_samples_per_second": 11.475, "eval_steps_per_second": 0.494, "step": 2220 }, { "epoch": 1.0006758278891643, "grad_norm": 0.5129289604395073, "learning_rate": 1.5010906100823276e-05, "loss": 0.6588, "step": 2221 }, { "epoch": 1.0011263798152736, "grad_norm": 0.5113120975374188, "learning_rate": 1.5006817244032589e-05, "loss": 0.6537, "step": 2222 }, { "epoch": 1.0015769317413832, "grad_norm": 0.5091845529877413, "learning_rate": 1.5002727269817414e-05, "loss": 0.6621, "step": 2223 }, { "epoch": 1.0020274836674927, "grad_norm": 0.49412383428257206, "learning_rate": 1.4998636179090549e-05, "loss": 0.6712, "step": 2224 }, { "epoch": 1.0024780355936023, "grad_norm": 0.49616774696677973, "learning_rate": 1.4994543972765045e-05, "loss": 0.669, "step": 2225 }, { "epoch": 1.0029285875197116, "grad_norm": 0.5127001599806387, "learning_rate": 1.4990450651754207e-05, "loss": 0.6911, "step": 2226 }, { "epoch": 1.0033791394458211, "grad_norm": 0.5066602520091241, "learning_rate": 1.4986356216971583e-05, "loss": 0.6906, "step": 2227 }, { "epoch": 1.0038296913719307, "grad_norm": 0.5524045763066094, "learning_rate": 1.4982260669330972e-05, "loss": 0.646, "step": 2228 }, { "epoch": 1.00428024329804, "grad_norm": 0.5106399040004421, "learning_rate": 1.4978164009746418e-05, "loss": 0.6128, "step": 2229 }, { "epoch": 1.0047307952241495, "grad_norm": 0.507226716923821, "learning_rate": 1.497406623913222e-05, "loss": 0.6598, "step": 2230 }, { "epoch": 1.005181347150259, "grad_norm": 0.5191927642526177, "learning_rate": 1.4969967358402922e-05, "loss": 0.6946, "step": 2231 }, { "epoch": 1.0056318990763686, "grad_norm": 0.5222924493586916, "learning_rate": 1.4965867368473308e-05, "loss": 0.7057, "step": 2232 }, { "epoch": 1.006082451002478, "grad_norm": 0.5414763902407882, "learning_rate": 1.4961766270258422e-05, "loss": 0.6958, "step": 2233 }, { "epoch": 1.0065330029285875, "grad_norm": 0.5682652886452754, "learning_rate": 1.4957664064673548e-05, "loss": 0.6585, "step": 2234 }, { "epoch": 1.006983554854697, "grad_norm": 0.5198345365368962, "learning_rate": 1.4953560752634218e-05, "loss": 0.6278, "step": 2235 }, { "epoch": 1.0074341067808066, "grad_norm": 0.5220505587470071, "learning_rate": 1.494945633505621e-05, "loss": 0.6672, "step": 2236 }, { "epoch": 1.007884658706916, "grad_norm": 0.5178698556261453, "learning_rate": 1.4945350812855555e-05, "loss": 0.6656, "step": 2237 }, { "epoch": 1.0083352106330254, "grad_norm": 0.5369790092362066, "learning_rate": 1.4941244186948522e-05, "loss": 0.6744, "step": 2238 }, { "epoch": 1.008785762559135, "grad_norm": 0.4896404659987527, "learning_rate": 1.4937136458251633e-05, "loss": 0.6664, "step": 2239 }, { "epoch": 1.0092363144852445, "grad_norm": 0.5160494778578126, "learning_rate": 1.4933027627681651e-05, "loss": 0.6617, "step": 2240 }, { "epoch": 1.0096868664113539, "grad_norm": 0.5002389772259755, "learning_rate": 1.4928917696155587e-05, "loss": 0.6607, "step": 2241 }, { "epoch": 1.0101374183374634, "grad_norm": 0.5243586992588188, "learning_rate": 1.4924806664590702e-05, "loss": 0.6902, "step": 2242 }, { "epoch": 1.010587970263573, "grad_norm": 0.511508187889206, "learning_rate": 1.4920694533904494e-05, "loss": 0.6779, "step": 2243 }, { "epoch": 1.0002252759630548, "grad_norm": 0.6330138075751222, "learning_rate": 1.4916581305014713e-05, "loss": 0.6632, "step": 2244 }, { "epoch": 1.0006758278891643, "grad_norm": 0.8216566736489781, "learning_rate": 1.491246697883935e-05, "loss": 0.5624, "step": 2245 }, { "epoch": 1.0011263798152736, "grad_norm": 0.6967650804648878, "learning_rate": 1.4908351556296649e-05, "loss": 0.5301, "step": 2246 }, { "epoch": 1.0015769317413832, "grad_norm": 1.0108911110191645, "learning_rate": 1.4904235038305084e-05, "loss": 0.5295, "step": 2247 }, { "epoch": 1.0020274836674927, "grad_norm": 0.7266484834243672, "learning_rate": 1.4900117425783388e-05, "loss": 0.5556, "step": 2248 }, { "epoch": 1.0024780355936023, "grad_norm": 1.0038103411718933, "learning_rate": 1.4895998719650526e-05, "loss": 0.5506, "step": 2249 }, { "epoch": 1.0029285875197116, "grad_norm": 0.6343442040258035, "learning_rate": 1.489187892082572e-05, "loss": 0.5244, "step": 2250 }, { "epoch": 1.0033791394458211, "grad_norm": 0.9101210358984605, "learning_rate": 1.4887758030228426e-05, "loss": 0.5528, "step": 2251 }, { "epoch": 1.0038296913719307, "grad_norm": 0.7793051793603826, "learning_rate": 1.4883636048778347e-05, "loss": 0.5725, "step": 2252 }, { "epoch": 1.00428024329804, "grad_norm": 0.6772569942646158, "learning_rate": 1.4879512977395431e-05, "loss": 0.505, "step": 2253 }, { "epoch": 1.0047307952241495, "grad_norm": 0.8192831097404958, "learning_rate": 1.4875388816999865e-05, "loss": 0.5903, "step": 2254 }, { "epoch": 1.005181347150259, "grad_norm": 0.6228475576349783, "learning_rate": 1.4871263568512082e-05, "loss": 0.5563, "step": 2255 }, { "epoch": 1.0056318990763686, "grad_norm": 0.7321897056841667, "learning_rate": 1.486713723285276e-05, "loss": 0.5266, "step": 2256 }, { "epoch": 1.006082451002478, "grad_norm": 0.7398387288508258, "learning_rate": 1.4863009810942814e-05, "loss": 0.5414, "step": 2257 }, { "epoch": 1.0065330029285875, "grad_norm": 0.6764726223023437, "learning_rate": 1.4858881303703408e-05, "loss": 0.5637, "step": 2258 }, { "epoch": 1.006983554854697, "grad_norm": 0.6654859375686083, "learning_rate": 1.4854751712055943e-05, "loss": 0.5224, "step": 2259 }, { "epoch": 1.0074341067808066, "grad_norm": 0.5591946235579457, "learning_rate": 1.4850621036922062e-05, "loss": 0.5278, "step": 2260 }, { "epoch": 1.007884658706916, "grad_norm": 0.651841999392173, "learning_rate": 1.4846489279223653e-05, "loss": 0.5293, "step": 2261 }, { "epoch": 1.0083352106330254, "grad_norm": 0.6067384979835401, "learning_rate": 1.4842356439882841e-05, "loss": 0.5368, "step": 2262 }, { "epoch": 1.008785762559135, "grad_norm": 0.5734182397705873, "learning_rate": 1.4838222519822e-05, "loss": 0.5393, "step": 2263 }, { "epoch": 1.0092363144852445, "grad_norm": 0.5734248189612562, "learning_rate": 1.4834087519963743e-05, "loss": 0.5199, "step": 2264 }, { "epoch": 1.0096868664113539, "grad_norm": 0.5479854499344309, "learning_rate": 1.4829951441230917e-05, "loss": 0.5055, "step": 2265 }, { "epoch": 1.0101374183374634, "grad_norm": 0.6133339263701423, "learning_rate": 1.4825814284546612e-05, "loss": 0.5763, "step": 2266 }, { "epoch": 1.010587970263573, "grad_norm": 0.5557920642724613, "learning_rate": 1.4821676050834166e-05, "loss": 0.5088, "step": 2267 }, { "epoch": 1.0110385221896823, "grad_norm": 0.5558935753696834, "learning_rate": 1.4817536741017153e-05, "loss": 0.5448, "step": 2268 }, { "epoch": 1.0114890741157918, "grad_norm": 0.5820484880568093, "learning_rate": 1.4813396356019388e-05, "loss": 0.5066, "step": 2269 }, { "epoch": 1.0119396260419014, "grad_norm": 0.54302182287902, "learning_rate": 1.4809254896764918e-05, "loss": 0.5919, "step": 2270 }, { "epoch": 1.012390177968011, "grad_norm": 0.5420724781914339, "learning_rate": 1.480511236417804e-05, "loss": 0.5253, "step": 2271 }, { "epoch": 1.0128407298941202, "grad_norm": 0.5532941078125485, "learning_rate": 1.4800968759183288e-05, "loss": 0.5241, "step": 2272 }, { "epoch": 1.0132912818202298, "grad_norm": 0.5486831661853274, "learning_rate": 1.4796824082705431e-05, "loss": 0.5425, "step": 2273 }, { "epoch": 1.0137418337463393, "grad_norm": 0.5288783002548103, "learning_rate": 1.4792678335669484e-05, "loss": 0.5257, "step": 2274 }, { "epoch": 1.0141923856724488, "grad_norm": 0.5548940996112479, "learning_rate": 1.4788531519000696e-05, "loss": 0.536, "step": 2275 }, { "epoch": 1.0146429375985582, "grad_norm": 0.5568150740105301, "learning_rate": 1.4784383633624555e-05, "loss": 0.5278, "step": 2276 }, { "epoch": 1.0150934895246677, "grad_norm": 0.5733455498651303, "learning_rate": 1.4780234680466792e-05, "loss": 0.5195, "step": 2277 }, { "epoch": 1.0155440414507773, "grad_norm": 0.5377606318584219, "learning_rate": 1.4776084660453366e-05, "loss": 0.5403, "step": 2278 }, { "epoch": 1.0159945933768866, "grad_norm": 0.5593256185066677, "learning_rate": 1.4771933574510492e-05, "loss": 0.5147, "step": 2279 }, { "epoch": 1.0164451453029961, "grad_norm": 0.5152250206035727, "learning_rate": 1.47677814235646e-05, "loss": 0.4971, "step": 2280 }, { "epoch": 1.0168956972291057, "grad_norm": 0.5575063795074531, "learning_rate": 1.4763628208542375e-05, "loss": 0.5349, "step": 2281 }, { "epoch": 1.0173462491552152, "grad_norm": 0.5367880461637564, "learning_rate": 1.4759473930370738e-05, "loss": 0.5089, "step": 2282 }, { "epoch": 1.0177968010813245, "grad_norm": 0.5495500193497066, "learning_rate": 1.4755318589976835e-05, "loss": 0.4941, "step": 2283 }, { "epoch": 1.018247353007434, "grad_norm": 0.5482238605554282, "learning_rate": 1.4751162188288065e-05, "loss": 0.5321, "step": 2284 }, { "epoch": 1.0186979049335436, "grad_norm": 0.500361519996149, "learning_rate": 1.4747004726232048e-05, "loss": 0.511, "step": 2285 }, { "epoch": 1.0191484568596532, "grad_norm": 0.5455440035832955, "learning_rate": 1.4742846204736655e-05, "loss": 0.5581, "step": 2286 }, { "epoch": 1.0195990087857625, "grad_norm": 0.536029298526984, "learning_rate": 1.4738686624729987e-05, "loss": 0.5334, "step": 2287 }, { "epoch": 1.020049560711872, "grad_norm": 0.5460079637613923, "learning_rate": 1.4734525987140382e-05, "loss": 0.522, "step": 2288 }, { "epoch": 1.0205001126379816, "grad_norm": 0.5431883281894985, "learning_rate": 1.473036429289641e-05, "loss": 0.5172, "step": 2289 }, { "epoch": 1.0209506645640911, "grad_norm": 0.535208234093454, "learning_rate": 1.4726201542926883e-05, "loss": 0.521, "step": 2290 }, { "epoch": 1.0214012164902004, "grad_norm": 0.565905081267227, "learning_rate": 1.4722037738160847e-05, "loss": 0.556, "step": 2291 }, { "epoch": 1.02185176841631, "grad_norm": 0.5075678485237822, "learning_rate": 1.4717872879527578e-05, "loss": 0.5065, "step": 2292 }, { "epoch": 1.0223023203424195, "grad_norm": 0.5699529765302025, "learning_rate": 1.4713706967956598e-05, "loss": 0.5694, "step": 2293 }, { "epoch": 1.0227528722685288, "grad_norm": 0.5563675362196827, "learning_rate": 1.4709540004377654e-05, "loss": 0.5474, "step": 2294 }, { "epoch": 1.0232034241946384, "grad_norm": 0.5583696299403484, "learning_rate": 1.470537198972073e-05, "loss": 0.5081, "step": 2295 }, { "epoch": 1.023653976120748, "grad_norm": 0.568289106893811, "learning_rate": 1.470120292491605e-05, "loss": 0.5462, "step": 2296 }, { "epoch": 1.0241045280468575, "grad_norm": 0.5343982659069653, "learning_rate": 1.4697032810894062e-05, "loss": 0.5066, "step": 2297 }, { "epoch": 1.0245550799729668, "grad_norm": 0.5522955008549788, "learning_rate": 1.4692861648585463e-05, "loss": 0.5325, "step": 2298 }, { "epoch": 1.0250056318990763, "grad_norm": 0.5742618419085335, "learning_rate": 1.4688689438921171e-05, "loss": 0.5485, "step": 2299 }, { "epoch": 1.0254561838251859, "grad_norm": 0.5544489438321286, "learning_rate": 1.468451618283234e-05, "loss": 0.5161, "step": 2300 }, { "epoch": 1.0259067357512954, "grad_norm": 0.5366733665343913, "learning_rate": 1.4680341881250364e-05, "loss": 0.5616, "step": 2301 }, { "epoch": 1.0263572876774048, "grad_norm": 0.5107374858400172, "learning_rate": 1.4676166535106866e-05, "loss": 0.4915, "step": 2302 }, { "epoch": 1.0268078396035143, "grad_norm": 0.5053690264123764, "learning_rate": 1.4671990145333697e-05, "loss": 0.5489, "step": 2303 }, { "epoch": 1.0272583915296238, "grad_norm": 0.5421427080857344, "learning_rate": 1.4667812712862954e-05, "loss": 0.4903, "step": 2304 }, { "epoch": 1.0277089434557334, "grad_norm": 0.542350601470671, "learning_rate": 1.4663634238626951e-05, "loss": 0.5367, "step": 2305 }, { "epoch": 1.0281594953818427, "grad_norm": 0.5163183757086391, "learning_rate": 1.4659454723558249e-05, "loss": 0.5334, "step": 2306 }, { "epoch": 1.0286100473079522, "grad_norm": 0.5371571961841657, "learning_rate": 1.4655274168589635e-05, "loss": 0.5301, "step": 2307 }, { "epoch": 1.0290605992340618, "grad_norm": 0.5886021579982276, "learning_rate": 1.465109257465412e-05, "loss": 0.5381, "step": 2308 }, { "epoch": 1.0295111511601711, "grad_norm": 0.5178355738529657, "learning_rate": 1.4646909942684961e-05, "loss": 0.5528, "step": 2309 }, { "epoch": 1.0299617030862807, "grad_norm": 0.5780452584811256, "learning_rate": 1.4642726273615639e-05, "loss": 0.5532, "step": 2310 }, { "epoch": 1.0304122550123902, "grad_norm": 3.0259927402346185, "learning_rate": 1.4638541568379868e-05, "loss": 0.5629, "step": 2311 }, { "epoch": 1.0308628069384997, "grad_norm": 0.5591457960505094, "learning_rate": 1.463435582791159e-05, "loss": 0.541, "step": 2312 }, { "epoch": 1.031313358864609, "grad_norm": 0.530261887626246, "learning_rate": 1.4630169053144985e-05, "loss": 0.5335, "step": 2313 }, { "epoch": 1.0317639107907186, "grad_norm": 0.5615133403120534, "learning_rate": 1.462598124501446e-05, "loss": 0.5046, "step": 2314 }, { "epoch": 1.0322144627168282, "grad_norm": 0.5317009497407112, "learning_rate": 1.4621792404454647e-05, "loss": 0.5609, "step": 2315 }, { "epoch": 1.0326650146429377, "grad_norm": 0.5698060983808115, "learning_rate": 1.461760253240042e-05, "loss": 0.57, "step": 2316 }, { "epoch": 1.033115566569047, "grad_norm": 0.520452733917016, "learning_rate": 1.461341162978688e-05, "loss": 0.5156, "step": 2317 }, { "epoch": 1.0335661184951566, "grad_norm": 0.5498412706035866, "learning_rate": 1.4609219697549346e-05, "loss": 0.5219, "step": 2318 }, { "epoch": 1.034016670421266, "grad_norm": 0.5587856947145249, "learning_rate": 1.4605026736623383e-05, "loss": 0.5388, "step": 2319 }, { "epoch": 1.0344672223473754, "grad_norm": 0.5457741954327833, "learning_rate": 1.4600832747944773e-05, "loss": 0.5328, "step": 2320 }, { "epoch": 1.034917774273485, "grad_norm": 0.5657946437070239, "learning_rate": 1.4596637732449537e-05, "loss": 0.5429, "step": 2321 }, { "epoch": 1.0353683261995945, "grad_norm": 0.554325300995995, "learning_rate": 1.459244169107392e-05, "loss": 0.5513, "step": 2322 }, { "epoch": 1.035818878125704, "grad_norm": 0.528302846043422, "learning_rate": 1.4588244624754398e-05, "loss": 0.5538, "step": 2323 }, { "epoch": 1.0362694300518134, "grad_norm": 0.5374411243548209, "learning_rate": 1.4584046534427671e-05, "loss": 0.5419, "step": 2324 }, { "epoch": 1.036719981977923, "grad_norm": 0.5615324540877131, "learning_rate": 1.4579847421030677e-05, "loss": 0.5809, "step": 2325 }, { "epoch": 1.0371705339040325, "grad_norm": 0.5598687759274701, "learning_rate": 1.4575647285500571e-05, "loss": 0.5555, "step": 2326 }, { "epoch": 1.037621085830142, "grad_norm": 0.5346019433781276, "learning_rate": 1.4571446128774746e-05, "loss": 0.5462, "step": 2327 }, { "epoch": 1.0380716377562513, "grad_norm": 0.545271953715972, "learning_rate": 1.4567243951790819e-05, "loss": 0.55, "step": 2328 }, { "epoch": 1.0385221896823609, "grad_norm": 0.5422349512867061, "learning_rate": 1.456304075548663e-05, "loss": 0.5683, "step": 2329 }, { "epoch": 1.0389727416084704, "grad_norm": 0.5143173196210951, "learning_rate": 1.4558836540800256e-05, "loss": 0.5507, "step": 2330 }, { "epoch": 1.03942329353458, "grad_norm": 0.5874298575225206, "learning_rate": 1.4554631308669993e-05, "loss": 0.5408, "step": 2331 }, { "epoch": 1.0398738454606893, "grad_norm": 0.5390468902825236, "learning_rate": 1.4550425060034367e-05, "loss": 0.5566, "step": 2332 }, { "epoch": 1.0403243973867988, "grad_norm": 0.5557725410771143, "learning_rate": 1.4546217795832135e-05, "loss": 0.5478, "step": 2333 }, { "epoch": 1.0407749493129084, "grad_norm": 0.545189301900231, "learning_rate": 1.4542009517002269e-05, "loss": 0.5323, "step": 2334 }, { "epoch": 1.0412255012390177, "grad_norm": 0.5625389942107346, "learning_rate": 1.4537800224483982e-05, "loss": 0.5654, "step": 2335 }, { "epoch": 1.0416760531651272, "grad_norm": 0.5651653589063264, "learning_rate": 1.4533589919216705e-05, "loss": 0.536, "step": 2336 }, { "epoch": 1.0421266050912368, "grad_norm": 0.5233898904505166, "learning_rate": 1.4529378602140096e-05, "loss": 0.5184, "step": 2337 }, { "epoch": 1.0425771570173463, "grad_norm": 0.5642560006776882, "learning_rate": 1.4525166274194038e-05, "loss": 0.5325, "step": 2338 }, { "epoch": 1.0430277089434556, "grad_norm": 0.5458023957206369, "learning_rate": 1.4520952936318644e-05, "loss": 0.547, "step": 2339 }, { "epoch": 1.0434782608695652, "grad_norm": 0.5418864731169402, "learning_rate": 1.4516738589454246e-05, "loss": 0.5372, "step": 2340 }, { "epoch": 1.0439288127956747, "grad_norm": 0.5837376147762359, "learning_rate": 1.4512523234541407e-05, "loss": 0.5631, "step": 2341 }, { "epoch": 1.0443793647217843, "grad_norm": 0.5176063927092273, "learning_rate": 1.4508306872520912e-05, "loss": 0.5197, "step": 2342 }, { "epoch": 1.0448299166478936, "grad_norm": 0.5449610742399347, "learning_rate": 1.450408950433377e-05, "loss": 0.52, "step": 2343 }, { "epoch": 1.0452804685740031, "grad_norm": 0.5334654046822662, "learning_rate": 1.4499871130921213e-05, "loss": 0.5259, "step": 2344 }, { "epoch": 1.0457310205001127, "grad_norm": 0.5458349441335711, "learning_rate": 1.4495651753224706e-05, "loss": 0.5232, "step": 2345 }, { "epoch": 1.0461815724262222, "grad_norm": 0.5621409275345248, "learning_rate": 1.449143137218593e-05, "loss": 0.5319, "step": 2346 }, { "epoch": 1.0466321243523315, "grad_norm": 0.5457241373633883, "learning_rate": 1.4487209988746791e-05, "loss": 0.5337, "step": 2347 }, { "epoch": 1.047082676278441, "grad_norm": 0.5461870838098938, "learning_rate": 1.4482987603849422e-05, "loss": 0.5637, "step": 2348 }, { "epoch": 1.0475332282045506, "grad_norm": 0.5968124174136077, "learning_rate": 1.4478764218436174e-05, "loss": 0.5262, "step": 2349 }, { "epoch": 1.04798378013066, "grad_norm": 0.5275100590929976, "learning_rate": 1.447453983344963e-05, "loss": 0.517, "step": 2350 }, { "epoch": 1.0484343320567695, "grad_norm": 0.570676000197419, "learning_rate": 1.4470314449832586e-05, "loss": 0.5468, "step": 2351 }, { "epoch": 1.048884883982879, "grad_norm": 0.58596280550594, "learning_rate": 1.4466088068528068e-05, "loss": 0.5539, "step": 2352 }, { "epoch": 1.0493354359089886, "grad_norm": 0.5705711220816825, "learning_rate": 1.4461860690479319e-05, "loss": 0.5122, "step": 2353 }, { "epoch": 1.049785987835098, "grad_norm": 0.5984115211636118, "learning_rate": 1.4457632316629812e-05, "loss": 0.5821, "step": 2354 }, { "epoch": 1.0502365397612075, "grad_norm": 0.5806465621472428, "learning_rate": 1.4453402947923242e-05, "loss": 0.5574, "step": 2355 }, { "epoch": 1.050687091687317, "grad_norm": 0.5554042643068661, "learning_rate": 1.4449172585303511e-05, "loss": 0.5249, "step": 2356 }, { "epoch": 1.0511376436134265, "grad_norm": 0.5934105732195316, "learning_rate": 1.444494122971476e-05, "loss": 0.5518, "step": 2357 }, { "epoch": 1.0515881955395359, "grad_norm": 0.5409847123162919, "learning_rate": 1.4440708882101344e-05, "loss": 0.5555, "step": 2358 }, { "epoch": 1.0520387474656454, "grad_norm": 0.5508949691153823, "learning_rate": 1.4436475543407843e-05, "loss": 0.5905, "step": 2359 }, { "epoch": 1.052489299391755, "grad_norm": 0.5558761389305954, "learning_rate": 1.4432241214579054e-05, "loss": 0.5424, "step": 2360 }, { "epoch": 1.0529398513178645, "grad_norm": 0.5735205504031479, "learning_rate": 1.4428005896559997e-05, "loss": 0.5623, "step": 2361 }, { "epoch": 1.0533904032439738, "grad_norm": 0.5398374600797092, "learning_rate": 1.4423769590295917e-05, "loss": 0.518, "step": 2362 }, { "epoch": 1.0538409551700834, "grad_norm": 0.5984584909650187, "learning_rate": 1.4419532296732271e-05, "loss": 0.5596, "step": 2363 }, { "epoch": 1.054291507096193, "grad_norm": 0.5433167221454723, "learning_rate": 1.441529401681474e-05, "loss": 0.5089, "step": 2364 }, { "epoch": 1.0547420590223022, "grad_norm": 0.5878452329290651, "learning_rate": 1.4411054751489229e-05, "loss": 0.5519, "step": 2365 }, { "epoch": 1.0551926109484118, "grad_norm": 0.619238567250366, "learning_rate": 1.4406814501701857e-05, "loss": 0.5543, "step": 2366 }, { "epoch": 1.0556431628745213, "grad_norm": 0.567925435023758, "learning_rate": 1.4402573268398969e-05, "loss": 0.5626, "step": 2367 }, { "epoch": 1.0560937148006309, "grad_norm": 0.6108509553522148, "learning_rate": 1.4398331052527126e-05, "loss": 0.5619, "step": 2368 }, { "epoch": 1.0565442667267402, "grad_norm": 0.5711601457310035, "learning_rate": 1.4394087855033103e-05, "loss": 0.5374, "step": 2369 }, { "epoch": 1.0569948186528497, "grad_norm": 0.5811344982322286, "learning_rate": 1.4389843676863904e-05, "loss": 0.5666, "step": 2370 }, { "epoch": 1.0574453705789593, "grad_norm": 0.5753573538276676, "learning_rate": 1.4385598518966746e-05, "loss": 0.5409, "step": 2371 }, { "epoch": 1.0578959225050688, "grad_norm": 0.5446618617089274, "learning_rate": 1.4381352382289064e-05, "loss": 0.5321, "step": 2372 }, { "epoch": 1.0583464744311781, "grad_norm": 0.5654183324536933, "learning_rate": 1.437710526777852e-05, "loss": 0.5352, "step": 2373 }, { "epoch": 1.0587970263572877, "grad_norm": 0.5615177088345406, "learning_rate": 1.437285717638298e-05, "loss": 0.5067, "step": 2374 }, { "epoch": 1.0592475782833972, "grad_norm": 0.5319112796776498, "learning_rate": 1.4368608109050537e-05, "loss": 0.5009, "step": 2375 }, { "epoch": 1.0596981302095065, "grad_norm": 0.6204781600801443, "learning_rate": 1.4364358066729502e-05, "loss": 0.5526, "step": 2376 }, { "epoch": 1.060148682135616, "grad_norm": 0.53261953160568, "learning_rate": 1.4360107050368403e-05, "loss": 0.5266, "step": 2377 }, { "epoch": 1.0605992340617256, "grad_norm": 0.5244086929570869, "learning_rate": 1.4355855060915987e-05, "loss": 0.512, "step": 2378 }, { "epoch": 1.0610497859878352, "grad_norm": 0.582948485708368, "learning_rate": 1.4351602099321207e-05, "loss": 0.5445, "step": 2379 }, { "epoch": 1.0615003379139445, "grad_norm": 0.5530779941714423, "learning_rate": 1.4347348166533247e-05, "loss": 0.5373, "step": 2380 }, { "epoch": 1.061950889840054, "grad_norm": 0.5336032399760264, "learning_rate": 1.4343093263501502e-05, "loss": 0.5468, "step": 2381 }, { "epoch": 1.0624014417661636, "grad_norm": 0.540060301430995, "learning_rate": 1.4338837391175582e-05, "loss": 0.5577, "step": 2382 }, { "epoch": 1.0628519936922731, "grad_norm": 0.5304842617104921, "learning_rate": 1.4334580550505319e-05, "loss": 0.5326, "step": 2383 }, { "epoch": 1.0633025456183824, "grad_norm": 0.5380026496327263, "learning_rate": 1.4330322742440754e-05, "loss": 0.5624, "step": 2384 }, { "epoch": 1.063753097544492, "grad_norm": 0.5216990920371497, "learning_rate": 1.4326063967932147e-05, "loss": 0.548, "step": 2385 }, { "epoch": 1.0642036494706015, "grad_norm": 0.5336932228474889, "learning_rate": 1.4321804227929976e-05, "loss": 0.5433, "step": 2386 }, { "epoch": 1.0646542013967109, "grad_norm": 0.5634994908484537, "learning_rate": 1.4317543523384928e-05, "loss": 0.5401, "step": 2387 }, { "epoch": 1.0651047533228204, "grad_norm": 0.5679075567417035, "learning_rate": 1.4313281855247919e-05, "loss": 0.5417, "step": 2388 }, { "epoch": 1.06555530524893, "grad_norm": 0.5470899302014528, "learning_rate": 1.430901922447006e-05, "loss": 0.5192, "step": 2389 }, { "epoch": 1.0660058571750395, "grad_norm": 0.5453449284961835, "learning_rate": 1.4304755632002696e-05, "loss": 0.5534, "step": 2390 }, { "epoch": 1.0664564091011488, "grad_norm": 0.5164669004090043, "learning_rate": 1.4300491078797371e-05, "loss": 0.5233, "step": 2391 }, { "epoch": 1.0669069610272583, "grad_norm": 0.5583955846824878, "learning_rate": 1.4296225565805854e-05, "loss": 0.5313, "step": 2392 }, { "epoch": 1.067357512953368, "grad_norm": 0.5690823975108673, "learning_rate": 1.4291959093980124e-05, "loss": 0.5506, "step": 2393 }, { "epoch": 1.0678080648794774, "grad_norm": 0.5795824503799907, "learning_rate": 1.4287691664272376e-05, "loss": 0.5561, "step": 2394 }, { "epoch": 1.0682586168055868, "grad_norm": 0.587347971691636, "learning_rate": 1.4283423277635015e-05, "loss": 0.5689, "step": 2395 }, { "epoch": 1.0687091687316963, "grad_norm": 0.5521000565314542, "learning_rate": 1.4279153935020667e-05, "loss": 0.5523, "step": 2396 }, { "epoch": 1.0691597206578058, "grad_norm": 0.5661532165317656, "learning_rate": 1.4274883637382162e-05, "loss": 0.5375, "step": 2397 }, { "epoch": 1.0696102725839154, "grad_norm": 0.5846465217291035, "learning_rate": 1.4270612385672547e-05, "loss": 0.5232, "step": 2398 }, { "epoch": 1.0700608245100247, "grad_norm": 0.5740642482745102, "learning_rate": 1.4266340180845087e-05, "loss": 0.5382, "step": 2399 }, { "epoch": 1.0705113764361343, "grad_norm": 0.5871788068575606, "learning_rate": 1.4262067023853252e-05, "loss": 0.5717, "step": 2400 }, { "epoch": 1.0709619283622438, "grad_norm": 0.5519687396469077, "learning_rate": 1.4257792915650728e-05, "loss": 0.5304, "step": 2401 }, { "epoch": 1.0714124802883531, "grad_norm": 0.5436771817213898, "learning_rate": 1.4253517857191415e-05, "loss": 0.5327, "step": 2402 }, { "epoch": 1.0718630322144627, "grad_norm": 0.5599916813913439, "learning_rate": 1.424924184942942e-05, "loss": 0.5322, "step": 2403 }, { "epoch": 1.0723135841405722, "grad_norm": 0.574668572372696, "learning_rate": 1.4244964893319066e-05, "loss": 0.5455, "step": 2404 }, { "epoch": 1.0727641360666818, "grad_norm": 0.5287082983050424, "learning_rate": 1.424068698981489e-05, "loss": 0.516, "step": 2405 }, { "epoch": 1.073214687992791, "grad_norm": 0.5675039720359022, "learning_rate": 1.4236408139871633e-05, "loss": 0.5534, "step": 2406 }, { "epoch": 1.0736652399189006, "grad_norm": 0.5452190099159818, "learning_rate": 1.4232128344444251e-05, "loss": 0.5421, "step": 2407 }, { "epoch": 1.0741157918450102, "grad_norm": 0.5549060008548651, "learning_rate": 1.4227847604487914e-05, "loss": 0.5629, "step": 2408 }, { "epoch": 1.0745663437711197, "grad_norm": 0.52107036178573, "learning_rate": 1.4223565920958e-05, "loss": 0.5211, "step": 2409 }, { "epoch": 1.075016895697229, "grad_norm": 0.5711406720704991, "learning_rate": 1.4219283294810095e-05, "loss": 0.5377, "step": 2410 }, { "epoch": 1.0754674476233386, "grad_norm": 0.5434911609556213, "learning_rate": 1.4214999727000002e-05, "loss": 0.5323, "step": 2411 }, { "epoch": 1.0759179995494481, "grad_norm": 0.5719924351571029, "learning_rate": 1.4210715218483726e-05, "loss": 0.5457, "step": 2412 }, { "epoch": 1.0763685514755577, "grad_norm": 0.537990977771356, "learning_rate": 1.4206429770217492e-05, "loss": 0.5223, "step": 2413 }, { "epoch": 1.076819103401667, "grad_norm": 0.531596679176356, "learning_rate": 1.420214338315772e-05, "loss": 0.5429, "step": 2414 }, { "epoch": 1.0772696553277765, "grad_norm": 0.5416071789136246, "learning_rate": 1.4197856058261059e-05, "loss": 0.5548, "step": 2415 }, { "epoch": 1.077720207253886, "grad_norm": 0.5428655792885407, "learning_rate": 1.4193567796484349e-05, "loss": 0.5419, "step": 2416 }, { "epoch": 1.0781707591799954, "grad_norm": 0.5234321206449948, "learning_rate": 1.4189278598784648e-05, "loss": 0.5162, "step": 2417 }, { "epoch": 1.078621311106105, "grad_norm": 0.5273669327349683, "learning_rate": 1.4184988466119225e-05, "loss": 0.5439, "step": 2418 }, { "epoch": 1.0790718630322145, "grad_norm": 0.552390148131127, "learning_rate": 1.418069739944555e-05, "loss": 0.5392, "step": 2419 }, { "epoch": 1.079522414958324, "grad_norm": 0.575020834263563, "learning_rate": 1.4176405399721311e-05, "loss": 0.5359, "step": 2420 }, { "epoch": 1.0799729668844333, "grad_norm": 0.5418901427698165, "learning_rate": 1.4172112467904397e-05, "loss": 0.5304, "step": 2421 }, { "epoch": 1.0804235188105429, "grad_norm": 0.5267299898999296, "learning_rate": 1.4167818604952906e-05, "loss": 0.5179, "step": 2422 }, { "epoch": 1.0808740707366524, "grad_norm": 0.5293187764945355, "learning_rate": 1.4163523811825144e-05, "loss": 0.5575, "step": 2423 }, { "epoch": 1.081324622662762, "grad_norm": 0.5294685802429532, "learning_rate": 1.4159228089479627e-05, "loss": 0.5279, "step": 2424 }, { "epoch": 1.0817751745888713, "grad_norm": 0.5449122947151439, "learning_rate": 1.4154931438875077e-05, "loss": 0.5387, "step": 2425 }, { "epoch": 1.0822257265149808, "grad_norm": 0.5179192716301508, "learning_rate": 1.4150633860970424e-05, "loss": 0.5358, "step": 2426 }, { "epoch": 1.0826762784410904, "grad_norm": 0.6294065516466725, "learning_rate": 1.4146335356724803e-05, "loss": 0.5454, "step": 2427 }, { "epoch": 1.0831268303672, "grad_norm": 0.551531511214578, "learning_rate": 1.4142035927097558e-05, "loss": 0.5487, "step": 2428 }, { "epoch": 1.0835773822933092, "grad_norm": 0.5546806665337337, "learning_rate": 1.4137735573048232e-05, "loss": 0.5187, "step": 2429 }, { "epoch": 1.0840279342194188, "grad_norm": 0.5932153061526829, "learning_rate": 1.413343429553659e-05, "loss": 0.5401, "step": 2430 }, { "epoch": 1.0844784861455283, "grad_norm": 0.5543140996780151, "learning_rate": 1.4129132095522586e-05, "loss": 0.5475, "step": 2431 }, { "epoch": 1.0849290380716377, "grad_norm": 0.5437369950296908, "learning_rate": 1.4124828973966393e-05, "loss": 0.5461, "step": 2432 }, { "epoch": 1.0853795899977472, "grad_norm": 0.5173492182985826, "learning_rate": 1.4120524931828383e-05, "loss": 0.5641, "step": 2433 }, { "epoch": 1.0858301419238567, "grad_norm": 0.5848202148585261, "learning_rate": 1.4116219970069134e-05, "loss": 0.543, "step": 2434 }, { "epoch": 1.0862806938499663, "grad_norm": 0.5851451912217549, "learning_rate": 1.4111914089649428e-05, "loss": 0.5682, "step": 2435 }, { "epoch": 1.0867312457760756, "grad_norm": 0.5581062727383971, "learning_rate": 1.4107607291530256e-05, "loss": 0.5487, "step": 2436 }, { "epoch": 1.0871817977021851, "grad_norm": 0.5621135952270758, "learning_rate": 1.410329957667281e-05, "loss": 0.5525, "step": 2437 }, { "epoch": 1.0876323496282947, "grad_norm": 0.5640230371638104, "learning_rate": 1.4098990946038496e-05, "loss": 0.5616, "step": 2438 }, { "epoch": 1.0880829015544042, "grad_norm": 0.5498665448554109, "learning_rate": 1.4094681400588908e-05, "loss": 0.5467, "step": 2439 }, { "epoch": 1.0885334534805136, "grad_norm": 0.5877581554057422, "learning_rate": 1.4090370941285855e-05, "loss": 0.5174, "step": 2440 }, { "epoch": 1.088984005406623, "grad_norm": 0.5277440706177609, "learning_rate": 1.4086059569091349e-05, "loss": 0.5407, "step": 2441 }, { "epoch": 1.0894345573327326, "grad_norm": 0.5695110654107208, "learning_rate": 1.4081747284967602e-05, "loss": 0.5432, "step": 2442 }, { "epoch": 1.0898851092588422, "grad_norm": 0.5683249196113497, "learning_rate": 1.4077434089877038e-05, "loss": 0.5463, "step": 2443 }, { "epoch": 1.0903356611849515, "grad_norm": 0.5557942822937136, "learning_rate": 1.4073119984782273e-05, "loss": 0.5279, "step": 2444 }, { "epoch": 1.090786213111061, "grad_norm": 0.5727664491714661, "learning_rate": 1.4068804970646134e-05, "loss": 0.5313, "step": 2445 }, { "epoch": 1.0912367650371706, "grad_norm": 0.5690281488476074, "learning_rate": 1.4064489048431649e-05, "loss": 0.5475, "step": 2446 }, { "epoch": 1.09168731696328, "grad_norm": 0.5426495347507527, "learning_rate": 1.4060172219102046e-05, "loss": 0.5476, "step": 2447 }, { "epoch": 1.0921378688893895, "grad_norm": 0.5553893840536959, "learning_rate": 1.4055854483620759e-05, "loss": 0.5307, "step": 2448 }, { "epoch": 1.092588420815499, "grad_norm": 0.5383653438706938, "learning_rate": 1.4051535842951422e-05, "loss": 0.4955, "step": 2449 }, { "epoch": 1.0930389727416085, "grad_norm": 0.5636758632667017, "learning_rate": 1.4047216298057872e-05, "loss": 0.5518, "step": 2450 }, { "epoch": 1.0934895246677179, "grad_norm": 0.5165361780420922, "learning_rate": 1.4042895849904152e-05, "loss": 0.5403, "step": 2451 }, { "epoch": 1.0939400765938274, "grad_norm": 0.5671206236176849, "learning_rate": 1.4038574499454496e-05, "loss": 0.5322, "step": 2452 }, { "epoch": 1.094390628519937, "grad_norm": 0.5608306922284574, "learning_rate": 1.4034252247673346e-05, "loss": 0.5869, "step": 2453 }, { "epoch": 1.0948411804460465, "grad_norm": 0.5197936590828006, "learning_rate": 1.4029929095525347e-05, "loss": 0.5064, "step": 2454 }, { "epoch": 1.0952917323721558, "grad_norm": 0.5382221586605132, "learning_rate": 1.4025605043975344e-05, "loss": 0.5624, "step": 2455 }, { "epoch": 1.0957422842982654, "grad_norm": 0.5931092304675467, "learning_rate": 1.402128009398838e-05, "loss": 0.5449, "step": 2456 }, { "epoch": 1.096192836224375, "grad_norm": 0.5288710388625996, "learning_rate": 1.4016954246529697e-05, "loss": 0.5533, "step": 2457 }, { "epoch": 1.0966433881504845, "grad_norm": 0.5298555294123491, "learning_rate": 1.4012627502564743e-05, "loss": 0.4993, "step": 2458 }, { "epoch": 1.0970939400765938, "grad_norm": 0.5499772739369385, "learning_rate": 1.4008299863059165e-05, "loss": 0.5183, "step": 2459 }, { "epoch": 1.0975444920027033, "grad_norm": 0.808028925492155, "learning_rate": 1.4003971328978807e-05, "loss": 0.564, "step": 2460 }, { "epoch": 1.0979950439288129, "grad_norm": 0.584283503017691, "learning_rate": 1.3999641901289712e-05, "loss": 0.5623, "step": 2461 }, { "epoch": 1.0984455958549222, "grad_norm": 0.7004252950477083, "learning_rate": 1.3995311580958124e-05, "loss": 0.5291, "step": 2462 }, { "epoch": 1.0988961477810317, "grad_norm": 0.58763306814028, "learning_rate": 1.3990980368950493e-05, "loss": 0.5549, "step": 2463 }, { "epoch": 1.0993466997071413, "grad_norm": 0.5579164812679118, "learning_rate": 1.3986648266233452e-05, "loss": 0.5341, "step": 2464 }, { "epoch": 1.0997972516332508, "grad_norm": 0.5511544294247197, "learning_rate": 1.3982315273773848e-05, "loss": 0.5544, "step": 2465 }, { "epoch": 1.1002478035593601, "grad_norm": 0.5484489186845902, "learning_rate": 1.3977981392538719e-05, "loss": 0.5521, "step": 2466 }, { "epoch": 1.1006983554854697, "grad_norm": 0.5502062835074665, "learning_rate": 1.3973646623495305e-05, "loss": 0.5718, "step": 2467 }, { "epoch": 1.1011489074115792, "grad_norm": 0.5246715201168951, "learning_rate": 1.3969310967611041e-05, "loss": 0.5304, "step": 2468 }, { "epoch": 1.1015994593376888, "grad_norm": 0.5624988441185822, "learning_rate": 1.3964974425853561e-05, "loss": 0.5139, "step": 2469 }, { "epoch": 1.102050011263798, "grad_norm": 0.6082248540949896, "learning_rate": 1.39606369991907e-05, "loss": 0.55, "step": 2470 }, { "epoch": 1.1025005631899076, "grad_norm": 0.5485799218152857, "learning_rate": 1.3956298688590484e-05, "loss": 0.5385, "step": 2471 }, { "epoch": 1.1029511151160172, "grad_norm": 0.5625350488008032, "learning_rate": 1.395195949502114e-05, "loss": 0.5377, "step": 2472 }, { "epoch": 1.1034016670421267, "grad_norm": 0.5261359409512559, "learning_rate": 1.3947619419451095e-05, "loss": 0.5097, "step": 2473 }, { "epoch": 1.103852218968236, "grad_norm": 0.5531249971891897, "learning_rate": 1.3943278462848966e-05, "loss": 0.5246, "step": 2474 }, { "epoch": 1.1043027708943456, "grad_norm": 0.5493366993368886, "learning_rate": 1.393893662618357e-05, "loss": 0.5056, "step": 2475 }, { "epoch": 1.1047533228204551, "grad_norm": 0.5536066368122587, "learning_rate": 1.3934593910423925e-05, "loss": 0.5097, "step": 2476 }, { "epoch": 1.1052038747465645, "grad_norm": 0.5778653430986637, "learning_rate": 1.3930250316539237e-05, "loss": 0.5399, "step": 2477 }, { "epoch": 1.105654426672674, "grad_norm": 0.5462915034229242, "learning_rate": 1.3925905845498915e-05, "loss": 0.5676, "step": 2478 }, { "epoch": 1.1061049785987835, "grad_norm": 0.5831821108286375, "learning_rate": 1.3921560498272557e-05, "loss": 0.5462, "step": 2479 }, { "epoch": 1.106555530524893, "grad_norm": 0.5536052006903271, "learning_rate": 1.3917214275829962e-05, "loss": 0.4922, "step": 2480 }, { "epoch": 1.1070060824510024, "grad_norm": 0.5346166641131841, "learning_rate": 1.3912867179141122e-05, "loss": 0.5735, "step": 2481 }, { "epoch": 1.107456634377112, "grad_norm": 0.5411865774827244, "learning_rate": 1.3908519209176227e-05, "loss": 0.5453, "step": 2482 }, { "epoch": 1.1079071863032215, "grad_norm": 0.5480473430703549, "learning_rate": 1.3904170366905661e-05, "loss": 0.5423, "step": 2483 }, { "epoch": 1.1083577382293308, "grad_norm": 0.5251987193628572, "learning_rate": 1.3899820653299993e-05, "loss": 0.5453, "step": 2484 }, { "epoch": 1.1088082901554404, "grad_norm": 0.5322518081237972, "learning_rate": 1.3895470069330003e-05, "loss": 0.5639, "step": 2485 }, { "epoch": 1.10925884208155, "grad_norm": 0.5519128228755401, "learning_rate": 1.3891118615966654e-05, "loss": 0.5333, "step": 2486 }, { "epoch": 1.1097093940076594, "grad_norm": 0.5325345266531463, "learning_rate": 1.3886766294181105e-05, "loss": 0.5746, "step": 2487 }, { "epoch": 1.1101599459337688, "grad_norm": 0.5589477204588101, "learning_rate": 1.3882413104944709e-05, "loss": 0.5292, "step": 2488 }, { "epoch": 1.1106104978598783, "grad_norm": 0.538868491080661, "learning_rate": 1.3878059049229018e-05, "loss": 0.5183, "step": 2489 }, { "epoch": 1.1110610497859879, "grad_norm": 0.5660047578259679, "learning_rate": 1.3873704128005767e-05, "loss": 0.5312, "step": 2490 }, { "epoch": 1.1115116017120974, "grad_norm": 0.5709558628726238, "learning_rate": 1.3869348342246894e-05, "loss": 0.5552, "step": 2491 }, { "epoch": 1.1119621536382067, "grad_norm": 0.5459864818621021, "learning_rate": 1.3864991692924524e-05, "loss": 0.5053, "step": 2492 }, { "epoch": 1.1124127055643163, "grad_norm": 0.5760451279581503, "learning_rate": 1.3860634181010979e-05, "loss": 0.5404, "step": 2493 }, { "epoch": 1.1128632574904258, "grad_norm": 0.5472380303590829, "learning_rate": 1.3856275807478767e-05, "loss": 0.5155, "step": 2494 }, { "epoch": 1.1133138094165353, "grad_norm": 0.5886935791162917, "learning_rate": 1.3851916573300597e-05, "loss": 0.5269, "step": 2495 }, { "epoch": 1.1137643613426447, "grad_norm": 0.5412204638431802, "learning_rate": 1.3847556479449362e-05, "loss": 0.544, "step": 2496 }, { "epoch": 1.1142149132687542, "grad_norm": 0.5708377381417326, "learning_rate": 1.384319552689815e-05, "loss": 0.5886, "step": 2497 }, { "epoch": 1.1146654651948638, "grad_norm": 0.6164208436330674, "learning_rate": 1.3838833716620245e-05, "loss": 0.5191, "step": 2498 }, { "epoch": 1.115116017120973, "grad_norm": 0.5453156807378684, "learning_rate": 1.3834471049589117e-05, "loss": 0.5602, "step": 2499 }, { "epoch": 1.1155665690470826, "grad_norm": 0.5607385876103175, "learning_rate": 1.3830107526778429e-05, "loss": 0.5475, "step": 2500 }, { "epoch": 1.1160171209731922, "grad_norm": 0.5389701665113363, "learning_rate": 1.3825743149162029e-05, "loss": 0.5299, "step": 2501 }, { "epoch": 1.1164676728993017, "grad_norm": 0.5579654040938038, "learning_rate": 1.3821377917713969e-05, "loss": 0.5172, "step": 2502 }, { "epoch": 1.116918224825411, "grad_norm": 0.5727217926105537, "learning_rate": 1.381701183340848e-05, "loss": 0.5552, "step": 2503 }, { "epoch": 1.1173687767515206, "grad_norm": 0.5556950175381556, "learning_rate": 1.3812644897219989e-05, "loss": 0.5241, "step": 2504 }, { "epoch": 1.1178193286776301, "grad_norm": 0.5281112222294846, "learning_rate": 1.380827711012311e-05, "loss": 0.5203, "step": 2505 }, { "epoch": 1.1182698806037397, "grad_norm": 0.5359647333593397, "learning_rate": 1.3803908473092648e-05, "loss": 0.5545, "step": 2506 }, { "epoch": 1.118720432529849, "grad_norm": 0.5319469766276254, "learning_rate": 1.37995389871036e-05, "loss": 0.5574, "step": 2507 }, { "epoch": 1.1191709844559585, "grad_norm": 0.5236617683199616, "learning_rate": 1.3795168653131153e-05, "loss": 0.5867, "step": 2508 }, { "epoch": 1.119621536382068, "grad_norm": 0.5538149381385399, "learning_rate": 1.3790797472150671e-05, "loss": 0.5373, "step": 2509 }, { "epoch": 1.1200720883081776, "grad_norm": 0.5440138652851166, "learning_rate": 1.3786425445137727e-05, "loss": 0.5478, "step": 2510 }, { "epoch": 1.120522640234287, "grad_norm": 0.550145281033573, "learning_rate": 1.378205257306807e-05, "loss": 0.5605, "step": 2511 }, { "epoch": 1.1209731921603965, "grad_norm": 0.5251284384588835, "learning_rate": 1.3777678856917637e-05, "loss": 0.5527, "step": 2512 }, { "epoch": 1.121423744086506, "grad_norm": 0.5354244558804825, "learning_rate": 1.377330429766256e-05, "loss": 0.5416, "step": 2513 }, { "epoch": 1.1218742960126153, "grad_norm": 0.5288986754352305, "learning_rate": 1.3768928896279153e-05, "loss": 0.5293, "step": 2514 }, { "epoch": 1.122324847938725, "grad_norm": 0.5418682255698885, "learning_rate": 1.376455265374392e-05, "loss": 0.5248, "step": 2515 }, { "epoch": 1.1227753998648344, "grad_norm": 0.5118090917982545, "learning_rate": 1.3760175571033559e-05, "loss": 0.5034, "step": 2516 }, { "epoch": 1.123225951790944, "grad_norm": 0.5486931625989568, "learning_rate": 1.3755797649124944e-05, "loss": 0.5591, "step": 2517 }, { "epoch": 1.1236765037170533, "grad_norm": 0.5624265887433602, "learning_rate": 1.3751418888995147e-05, "loss": 0.544, "step": 2518 }, { "epoch": 1.1241270556431628, "grad_norm": 0.5290187968849177, "learning_rate": 1.374703929162142e-05, "loss": 0.5542, "step": 2519 }, { "epoch": 1.1245776075692724, "grad_norm": 0.5475537225921762, "learning_rate": 1.3742658857981204e-05, "loss": 0.5248, "step": 2520 }, { "epoch": 1.125028159495382, "grad_norm": 0.5448280078884555, "learning_rate": 1.373827758905213e-05, "loss": 0.5345, "step": 2521 }, { "epoch": 1.1254787114214913, "grad_norm": 0.5689157998386607, "learning_rate": 1.3733895485812005e-05, "loss": 0.5306, "step": 2522 }, { "epoch": 1.1259292633476008, "grad_norm": 0.5607677332135335, "learning_rate": 1.3729512549238835e-05, "loss": 0.5286, "step": 2523 }, { "epoch": 1.1263798152737103, "grad_norm": 0.531437089383485, "learning_rate": 1.3725128780310805e-05, "loss": 0.533, "step": 2524 }, { "epoch": 1.1268303671998199, "grad_norm": 0.5941577771966148, "learning_rate": 1.372074418000629e-05, "loss": 0.5672, "step": 2525 }, { "epoch": 1.1272809191259292, "grad_norm": 0.5412332870989555, "learning_rate": 1.3716358749303842e-05, "loss": 0.5565, "step": 2526 }, { "epoch": 1.1277314710520387, "grad_norm": 0.5432995762463553, "learning_rate": 1.3711972489182208e-05, "loss": 0.5391, "step": 2527 }, { "epoch": 1.1281820229781483, "grad_norm": 0.5952199981732926, "learning_rate": 1.3707585400620316e-05, "loss": 0.5796, "step": 2528 }, { "epoch": 1.1286325749042576, "grad_norm": 0.5537114305107348, "learning_rate": 1.370319748459728e-05, "loss": 0.5518, "step": 2529 }, { "epoch": 1.1290831268303672, "grad_norm": 0.6233036265157511, "learning_rate": 1.3698808742092392e-05, "loss": 0.5682, "step": 2530 }, { "epoch": 1.1295336787564767, "grad_norm": 0.5914993821494369, "learning_rate": 1.3694419174085143e-05, "loss": 0.5546, "step": 2531 }, { "epoch": 1.1299842306825862, "grad_norm": 0.5270101231243076, "learning_rate": 1.369002878155519e-05, "loss": 0.5249, "step": 2532 }, { "epoch": 1.1304347826086956, "grad_norm": 0.565311146776813, "learning_rate": 1.3685637565482392e-05, "loss": 0.5211, "step": 2533 }, { "epoch": 1.130885334534805, "grad_norm": 0.5626457179307485, "learning_rate": 1.3681245526846782e-05, "loss": 0.5354, "step": 2534 }, { "epoch": 1.1313358864609147, "grad_norm": 0.5425368898598708, "learning_rate": 1.3676852666628573e-05, "loss": 0.5494, "step": 2535 }, { "epoch": 1.1317864383870242, "grad_norm": 0.6013331685141253, "learning_rate": 1.3672458985808166e-05, "loss": 0.527, "step": 2536 }, { "epoch": 1.1322369903131335, "grad_norm": 0.5542745401287804, "learning_rate": 1.366806448536615e-05, "loss": 0.519, "step": 2537 }, { "epoch": 1.132687542239243, "grad_norm": 0.5941856762625517, "learning_rate": 1.366366916628329e-05, "loss": 0.5756, "step": 2538 }, { "epoch": 1.1331380941653526, "grad_norm": 0.5466033832052003, "learning_rate": 1.3659273029540536e-05, "loss": 0.551, "step": 2539 }, { "epoch": 1.1335886460914621, "grad_norm": 0.5156879497093713, "learning_rate": 1.3654876076119022e-05, "loss": 0.5153, "step": 2540 }, { "epoch": 1.1340391980175715, "grad_norm": 0.542646337361679, "learning_rate": 1.3650478307000059e-05, "loss": 0.5083, "step": 2541 }, { "epoch": 1.134489749943681, "grad_norm": 0.5336604448259553, "learning_rate": 1.3646079723165148e-05, "loss": 0.5385, "step": 2542 }, { "epoch": 1.1349403018697906, "grad_norm": 0.5645739272025789, "learning_rate": 1.3641680325595962e-05, "loss": 0.5123, "step": 2543 }, { "epoch": 1.1353908537958999, "grad_norm": 0.5489585053583429, "learning_rate": 1.363728011527437e-05, "loss": 0.5488, "step": 2544 }, { "epoch": 1.1358414057220094, "grad_norm": 0.563355313176038, "learning_rate": 1.3632879093182405e-05, "loss": 0.56, "step": 2545 }, { "epoch": 1.136291957648119, "grad_norm": 0.593496287615702, "learning_rate": 1.3628477260302295e-05, "loss": 0.5697, "step": 2546 }, { "epoch": 1.1367425095742285, "grad_norm": 0.5348734123380754, "learning_rate": 1.3624074617616443e-05, "loss": 0.5626, "step": 2547 }, { "epoch": 1.1371930615003378, "grad_norm": 0.5489363145260897, "learning_rate": 1.3619671166107432e-05, "loss": 0.5428, "step": 2548 }, { "epoch": 1.1376436134264474, "grad_norm": 0.568450945597945, "learning_rate": 1.3615266906758025e-05, "loss": 0.521, "step": 2549 }, { "epoch": 1.138094165352557, "grad_norm": 0.5470875252899995, "learning_rate": 1.3610861840551172e-05, "loss": 0.5289, "step": 2550 }, { "epoch": 1.1385447172786665, "grad_norm": 0.5755033912216826, "learning_rate": 1.3606455968469994e-05, "loss": 0.5643, "step": 2551 }, { "epoch": 1.1389952692047758, "grad_norm": 0.5750438821743254, "learning_rate": 1.3602049291497798e-05, "loss": 0.5322, "step": 2552 }, { "epoch": 1.1394458211308853, "grad_norm": 0.5866610942583794, "learning_rate": 1.3597641810618071e-05, "loss": 0.5752, "step": 2553 }, { "epoch": 1.1398963730569949, "grad_norm": 0.5509533756203264, "learning_rate": 1.3593233526814475e-05, "loss": 0.5398, "step": 2554 }, { "epoch": 1.1403469249831044, "grad_norm": 0.5767691577929199, "learning_rate": 1.3588824441070852e-05, "loss": 0.5248, "step": 2555 }, { "epoch": 1.1407974769092137, "grad_norm": 0.5726871960761908, "learning_rate": 1.3584414554371227e-05, "loss": 0.5365, "step": 2556 }, { "epoch": 1.1412480288353233, "grad_norm": 0.6044906171982946, "learning_rate": 1.3580003867699801e-05, "loss": 0.5514, "step": 2557 }, { "epoch": 1.1416985807614328, "grad_norm": 0.541316417788941, "learning_rate": 1.357559238204095e-05, "loss": 0.5602, "step": 2558 }, { "epoch": 1.1421491326875421, "grad_norm": 0.5951940516129202, "learning_rate": 1.357118009837924e-05, "loss": 0.5573, "step": 2559 }, { "epoch": 1.1425996846136517, "grad_norm": 0.6702716409382939, "learning_rate": 1.3566767017699399e-05, "loss": 0.5976, "step": 2560 }, { "epoch": 1.1430502365397612, "grad_norm": 0.5532377994580826, "learning_rate": 1.3562353140986344e-05, "loss": 0.54, "step": 2561 }, { "epoch": 1.1435007884658708, "grad_norm": 0.6055975042471609, "learning_rate": 1.3557938469225167e-05, "loss": 0.5487, "step": 2562 }, { "epoch": 1.14395134039198, "grad_norm": 0.5642772588876559, "learning_rate": 1.3553523003401135e-05, "loss": 0.5249, "step": 2563 }, { "epoch": 1.1444018923180896, "grad_norm": 0.575090720350691, "learning_rate": 1.3549106744499699e-05, "loss": 0.5403, "step": 2564 }, { "epoch": 1.1448524442441992, "grad_norm": 0.5450575953446578, "learning_rate": 1.3544689693506478e-05, "loss": 0.5118, "step": 2565 }, { "epoch": 1.1453029961703085, "grad_norm": 0.5965529637053941, "learning_rate": 1.3540271851407273e-05, "loss": 0.5584, "step": 2566 }, { "epoch": 1.145753548096418, "grad_norm": 0.5466790425955697, "learning_rate": 1.3535853219188064e-05, "loss": 0.5317, "step": 2567 }, { "epoch": 1.1462041000225276, "grad_norm": 0.5650335329780262, "learning_rate": 1.3531433797835001e-05, "loss": 0.5308, "step": 2568 }, { "epoch": 1.1466546519486371, "grad_norm": 0.6018382389409606, "learning_rate": 1.3527013588334415e-05, "loss": 0.5712, "step": 2569 }, { "epoch": 1.1471052038747467, "grad_norm": 0.5784462487568489, "learning_rate": 1.3522592591672805e-05, "loss": 0.5514, "step": 2570 }, { "epoch": 1.147555755800856, "grad_norm": 0.5559824650225849, "learning_rate": 1.3518170808836859e-05, "loss": 0.5269, "step": 2571 }, { "epoch": 1.1480063077269655, "grad_norm": 0.5877629148168636, "learning_rate": 1.3513748240813429e-05, "loss": 0.5443, "step": 2572 }, { "epoch": 1.148456859653075, "grad_norm": 0.5551592820435189, "learning_rate": 1.3509324888589548e-05, "loss": 0.5302, "step": 2573 }, { "epoch": 1.1489074115791844, "grad_norm": 0.5517951875265202, "learning_rate": 1.3504900753152422e-05, "loss": 0.5371, "step": 2574 }, { "epoch": 1.149357963505294, "grad_norm": 0.5700389826480365, "learning_rate": 1.3500475835489432e-05, "loss": 0.5567, "step": 2575 }, { "epoch": 1.1498085154314035, "grad_norm": 0.5596898673831494, "learning_rate": 1.3496050136588135e-05, "loss": 0.5421, "step": 2576 }, { "epoch": 1.150259067357513, "grad_norm": 0.5589614679870831, "learning_rate": 1.349162365743626e-05, "loss": 0.5535, "step": 2577 }, { "epoch": 1.1507096192836224, "grad_norm": 0.5452545083982395, "learning_rate": 1.3487196399021712e-05, "loss": 0.5834, "step": 2578 }, { "epoch": 1.151160171209732, "grad_norm": 0.5813372089683586, "learning_rate": 1.3482768362332568e-05, "loss": 0.5498, "step": 2579 }, { "epoch": 1.1516107231358415, "grad_norm": 0.5334579020852422, "learning_rate": 1.347833954835708e-05, "loss": 0.5241, "step": 2580 }, { "epoch": 1.1520612750619508, "grad_norm": 0.569251617051239, "learning_rate": 1.3473909958083676e-05, "loss": 0.5479, "step": 2581 }, { "epoch": 1.1525118269880603, "grad_norm": 0.5522191932890925, "learning_rate": 1.3469479592500954e-05, "loss": 0.5477, "step": 2582 }, { "epoch": 1.1529623789141699, "grad_norm": 0.5563092118367644, "learning_rate": 1.3465048452597682e-05, "loss": 0.5192, "step": 2583 }, { "epoch": 1.1534129308402794, "grad_norm": 0.5502903716763298, "learning_rate": 1.3460616539362805e-05, "loss": 0.5222, "step": 2584 }, { "epoch": 1.153863482766389, "grad_norm": 0.5574713397466391, "learning_rate": 1.3456183853785445e-05, "loss": 0.5212, "step": 2585 }, { "epoch": 1.1543140346924983, "grad_norm": 0.5588367061054311, "learning_rate": 1.3451750396854887e-05, "loss": 0.5435, "step": 2586 }, { "epoch": 1.1547645866186078, "grad_norm": 0.53571626453021, "learning_rate": 1.3447316169560593e-05, "loss": 0.5663, "step": 2587 }, { "epoch": 1.1552151385447174, "grad_norm": 0.5701956342074129, "learning_rate": 1.3442881172892199e-05, "loss": 0.5541, "step": 2588 }, { "epoch": 1.1556656904708267, "grad_norm": 1.5843767624604892, "learning_rate": 1.343844540783951e-05, "loss": 0.5427, "step": 2589 }, { "epoch": 1.1561162423969362, "grad_norm": 0.5538114083686556, "learning_rate": 1.3434008875392499e-05, "loss": 0.5122, "step": 2590 }, { "epoch": 1.1565667943230458, "grad_norm": 0.5448861563646507, "learning_rate": 1.3429571576541315e-05, "loss": 0.5198, "step": 2591 }, { "epoch": 1.1570173462491553, "grad_norm": 0.5431727856950954, "learning_rate": 1.3425133512276284e-05, "loss": 0.5345, "step": 2592 }, { "epoch": 1.1574678981752646, "grad_norm": 0.5640947837300287, "learning_rate": 1.3420694683587884e-05, "loss": 0.5373, "step": 2593 }, { "epoch": 1.1579184501013742, "grad_norm": 0.5652876968133053, "learning_rate": 1.3416255091466783e-05, "loss": 0.5318, "step": 2594 }, { "epoch": 1.1583690020274837, "grad_norm": 0.5271659023049761, "learning_rate": 1.3411814736903815e-05, "loss": 0.5251, "step": 2595 }, { "epoch": 1.158819553953593, "grad_norm": 0.5591010386337627, "learning_rate": 1.3407373620889974e-05, "loss": 0.5595, "step": 2596 }, { "epoch": 1.1592701058797026, "grad_norm": 0.5390206557857722, "learning_rate": 1.3402931744416432e-05, "loss": 0.5418, "step": 2597 }, { "epoch": 1.1597206578058121, "grad_norm": 0.5302070484024346, "learning_rate": 1.3398489108474533e-05, "loss": 0.5351, "step": 2598 }, { "epoch": 1.1601712097319217, "grad_norm": 0.5708218651887617, "learning_rate": 1.3394045714055785e-05, "loss": 0.5154, "step": 2599 }, { "epoch": 1.160621761658031, "grad_norm": 0.5579771950972244, "learning_rate": 1.3389601562151868e-05, "loss": 0.5366, "step": 2600 }, { "epoch": 1.1610723135841405, "grad_norm": 0.5582583993021651, "learning_rate": 1.338515665375463e-05, "loss": 0.5672, "step": 2601 }, { "epoch": 1.16152286551025, "grad_norm": 0.565848912106485, "learning_rate": 1.3380710989856086e-05, "loss": 0.5194, "step": 2602 }, { "epoch": 1.1619734174363596, "grad_norm": 0.6216807084273037, "learning_rate": 1.3376264571448427e-05, "loss": 0.5542, "step": 2603 }, { "epoch": 1.162423969362469, "grad_norm": 0.5459089798214791, "learning_rate": 1.3371817399524006e-05, "loss": 0.5301, "step": 2604 }, { "epoch": 1.1628745212885785, "grad_norm": 0.5737429421852782, "learning_rate": 1.3367369475075344e-05, "loss": 0.5685, "step": 2605 }, { "epoch": 1.163325073214688, "grad_norm": 0.5413988418379686, "learning_rate": 1.3362920799095131e-05, "loss": 0.5246, "step": 2606 }, { "epoch": 1.1637756251407976, "grad_norm": 0.5616016466156482, "learning_rate": 1.3358471372576229e-05, "loss": 0.5329, "step": 2607 }, { "epoch": 1.164226177066907, "grad_norm": 0.5196067634500796, "learning_rate": 1.3354021196511658e-05, "loss": 0.5724, "step": 2608 }, { "epoch": 1.1646767289930164, "grad_norm": 0.5334581871245484, "learning_rate": 1.3349570271894614e-05, "loss": 0.5373, "step": 2609 }, { "epoch": 1.165127280919126, "grad_norm": 0.5494023711893725, "learning_rate": 1.3345118599718456e-05, "loss": 0.5836, "step": 2610 }, { "epoch": 1.1655778328452353, "grad_norm": 0.5293363055767825, "learning_rate": 1.3340666180976713e-05, "loss": 0.5362, "step": 2611 }, { "epoch": 1.1660283847713449, "grad_norm": 0.5460501533522283, "learning_rate": 1.3336213016663078e-05, "loss": 0.5353, "step": 2612 }, { "epoch": 1.1664789366974544, "grad_norm": 0.5372739072653653, "learning_rate": 1.3331759107771406e-05, "loss": 0.5231, "step": 2613 }, { "epoch": 1.166929488623564, "grad_norm": 0.6034984661725534, "learning_rate": 1.3327304455295731e-05, "loss": 0.5285, "step": 2614 }, { "epoch": 1.1673800405496733, "grad_norm": 0.5411982364434087, "learning_rate": 1.3322849060230239e-05, "loss": 0.5532, "step": 2615 }, { "epoch": 1.1678305924757828, "grad_norm": 0.5510608900114187, "learning_rate": 1.331839292356929e-05, "loss": 0.5336, "step": 2616 }, { "epoch": 1.1682811444018923, "grad_norm": 0.609330262314016, "learning_rate": 1.3313936046307411e-05, "loss": 0.5592, "step": 2617 }, { "epoch": 1.168731696328002, "grad_norm": 0.5539343049250022, "learning_rate": 1.3309478429439284e-05, "loss": 0.5218, "step": 2618 }, { "epoch": 1.1691822482541112, "grad_norm": 0.550082042212073, "learning_rate": 1.3305020073959766e-05, "loss": 0.5663, "step": 2619 }, { "epoch": 1.1696328001802208, "grad_norm": 0.5668803574320931, "learning_rate": 1.3300560980863875e-05, "loss": 0.5501, "step": 2620 }, { "epoch": 1.1700833521063303, "grad_norm": 0.5827338697259022, "learning_rate": 1.3296101151146794e-05, "loss": 0.529, "step": 2621 }, { "epoch": 1.1705339040324398, "grad_norm": 0.5477397292262841, "learning_rate": 1.3291640585803869e-05, "loss": 0.5293, "step": 2622 }, { "epoch": 1.1709844559585492, "grad_norm": 0.5721552071423873, "learning_rate": 1.3287179285830614e-05, "loss": 0.5751, "step": 2623 }, { "epoch": 1.1714350078846587, "grad_norm": 0.5510083639226543, "learning_rate": 1.3282717252222704e-05, "loss": 0.5577, "step": 2624 }, { "epoch": 1.1718855598107683, "grad_norm": 0.5508222683276645, "learning_rate": 1.3278254485975977e-05, "loss": 0.5347, "step": 2625 }, { "epoch": 1.1723361117368776, "grad_norm": 0.5776982419688977, "learning_rate": 1.327379098808644e-05, "loss": 0.5498, "step": 2626 }, { "epoch": 1.1727866636629871, "grad_norm": 0.5444607624667239, "learning_rate": 1.3269326759550252e-05, "loss": 0.5428, "step": 2627 }, { "epoch": 1.1732372155890967, "grad_norm": 0.5640076408567413, "learning_rate": 1.3264861801363749e-05, "loss": 0.5876, "step": 2628 }, { "epoch": 1.1736877675152062, "grad_norm": 0.5727848907050749, "learning_rate": 1.326039611452342e-05, "loss": 0.5481, "step": 2629 }, { "epoch": 1.1741383194413155, "grad_norm": 0.5372438204020575, "learning_rate": 1.325592970002592e-05, "loss": 0.5372, "step": 2630 }, { "epoch": 1.174588871367425, "grad_norm": 0.5413736356796469, "learning_rate": 1.3251462558868067e-05, "loss": 0.5158, "step": 2631 }, { "epoch": 1.1750394232935346, "grad_norm": 0.5189533120382217, "learning_rate": 1.3246994692046837e-05, "loss": 0.5567, "step": 2632 }, { "epoch": 1.1754899752196442, "grad_norm": 0.5563941875695564, "learning_rate": 1.3242526100559374e-05, "loss": 0.568, "step": 2633 }, { "epoch": 1.1759405271457535, "grad_norm": 0.5568152093791827, "learning_rate": 1.3238056785402982e-05, "loss": 0.5438, "step": 2634 }, { "epoch": 1.176391079071863, "grad_norm": 0.5362401455513599, "learning_rate": 1.3233586747575123e-05, "loss": 0.5356, "step": 2635 }, { "epoch": 1.1768416309979726, "grad_norm": 0.5263832921508219, "learning_rate": 1.3229115988073424e-05, "loss": 0.5176, "step": 2636 }, { "epoch": 1.177292182924082, "grad_norm": 0.5610937566512404, "learning_rate": 1.3224644507895672e-05, "loss": 0.5147, "step": 2637 }, { "epoch": 1.1777427348501914, "grad_norm": 0.5515122252994896, "learning_rate": 1.3220172308039812e-05, "loss": 0.5642, "step": 2638 }, { "epoch": 1.178193286776301, "grad_norm": 0.5424212344251159, "learning_rate": 1.3215699389503956e-05, "loss": 0.5369, "step": 2639 }, { "epoch": 1.1786438387024105, "grad_norm": 0.5191662297785912, "learning_rate": 1.3211225753286371e-05, "loss": 0.5135, "step": 2640 }, { "epoch": 1.1790943906285198, "grad_norm": 0.5262801814213129, "learning_rate": 1.3206751400385487e-05, "loss": 0.5453, "step": 2641 }, { "epoch": 1.1795449425546294, "grad_norm": 0.5424037475682956, "learning_rate": 1.320227633179989e-05, "loss": 0.5697, "step": 2642 }, { "epoch": 1.179995494480739, "grad_norm": 0.5327746771216306, "learning_rate": 1.3197800548528333e-05, "loss": 0.5547, "step": 2643 }, { "epoch": 1.1804460464068485, "grad_norm": 0.5251887220539375, "learning_rate": 1.3193324051569717e-05, "loss": 0.5409, "step": 2644 }, { "epoch": 1.1808965983329578, "grad_norm": 0.5560095260938452, "learning_rate": 1.3188846841923117e-05, "loss": 0.4884, "step": 2645 }, { "epoch": 1.1813471502590673, "grad_norm": 0.5451000597507577, "learning_rate": 1.3184368920587756e-05, "loss": 0.5406, "step": 2646 }, { "epoch": 1.1817977021851769, "grad_norm": 0.5374381863331043, "learning_rate": 1.3179890288563015e-05, "loss": 0.5142, "step": 2647 }, { "epoch": 1.1822482541112862, "grad_norm": 0.5464038010284133, "learning_rate": 1.3175410946848446e-05, "loss": 0.5269, "step": 2648 }, { "epoch": 1.1826988060373957, "grad_norm": 0.5610132589954433, "learning_rate": 1.3170930896443745e-05, "loss": 0.5371, "step": 2649 }, { "epoch": 1.1831493579635053, "grad_norm": 0.5648151904540305, "learning_rate": 1.3166450138348775e-05, "loss": 0.559, "step": 2650 }, { "epoch": 1.1835999098896148, "grad_norm": 0.5915561904478516, "learning_rate": 1.3161968673563552e-05, "loss": 0.5282, "step": 2651 }, { "epoch": 1.1840504618157244, "grad_norm": 0.5696957127957695, "learning_rate": 1.3157486503088255e-05, "loss": 0.5124, "step": 2652 }, { "epoch": 1.1845010137418337, "grad_norm": 0.5348317628543351, "learning_rate": 1.3153003627923217e-05, "loss": 0.513, "step": 2653 }, { "epoch": 1.1849515656679432, "grad_norm": 0.5688168372071629, "learning_rate": 1.3148520049068926e-05, "loss": 0.5257, "step": 2654 }, { "epoch": 1.1854021175940528, "grad_norm": 0.5877225655648127, "learning_rate": 1.3144035767526036e-05, "loss": 0.5772, "step": 2655 }, { "epoch": 1.185852669520162, "grad_norm": 0.5540253468362754, "learning_rate": 1.3139550784295343e-05, "loss": 0.517, "step": 2656 }, { "epoch": 1.1863032214462716, "grad_norm": 0.6072596931941426, "learning_rate": 1.3135065100377816e-05, "loss": 0.5429, "step": 2657 }, { "epoch": 1.1867537733723812, "grad_norm": 0.5423728011703263, "learning_rate": 1.3130578716774566e-05, "loss": 0.5461, "step": 2658 }, { "epoch": 1.1872043252984907, "grad_norm": 0.5245067678893597, "learning_rate": 1.3126091634486873e-05, "loss": 0.5445, "step": 2659 }, { "epoch": 1.1876548772246, "grad_norm": 0.5374631966135622, "learning_rate": 1.3121603854516161e-05, "loss": 0.5268, "step": 2660 }, { "epoch": 1.1881054291507096, "grad_norm": 0.5691128042045763, "learning_rate": 1.311711537786402e-05, "loss": 0.5637, "step": 2661 }, { "epoch": 1.1885559810768191, "grad_norm": 0.535724716930054, "learning_rate": 1.3112626205532189e-05, "loss": 0.5123, "step": 2662 }, { "epoch": 1.1890065330029285, "grad_norm": 0.5301403827628056, "learning_rate": 1.3108136338522561e-05, "loss": 0.4992, "step": 2663 }, { "epoch": 1.189457084929038, "grad_norm": 0.5412320250755094, "learning_rate": 1.3103645777837195e-05, "loss": 0.5392, "step": 2664 }, { "epoch": 1.1899076368551476, "grad_norm": 0.5422682833679254, "learning_rate": 1.3099154524478295e-05, "loss": 0.5422, "step": 2665 }, { "epoch": 1.190358188781257, "grad_norm": 0.5342762654789409, "learning_rate": 1.3094662579448217e-05, "loss": 0.5841, "step": 2666 }, { "epoch": 1.1908087407073666, "grad_norm": 0.5510916178975199, "learning_rate": 1.3090169943749475e-05, "loss": 0.5688, "step": 2667 }, { "epoch": 1.191259292633476, "grad_norm": 0.5293219131699066, "learning_rate": 1.3085676618384743e-05, "loss": 0.5172, "step": 2668 }, { "epoch": 1.1917098445595855, "grad_norm": 0.5065980711564443, "learning_rate": 1.3081182604356846e-05, "loss": 0.5386, "step": 2669 }, { "epoch": 1.192160396485695, "grad_norm": 0.5253687533826809, "learning_rate": 1.3076687902668754e-05, "loss": 0.5309, "step": 2670 }, { "epoch": 1.1926109484118044, "grad_norm": 0.5603088554097495, "learning_rate": 1.3072192514323603e-05, "loss": 0.559, "step": 2671 }, { "epoch": 1.193061500337914, "grad_norm": 0.5072063590320225, "learning_rate": 1.3067696440324671e-05, "loss": 0.5132, "step": 2672 }, { "epoch": 1.1935120522640235, "grad_norm": 0.5130500618984914, "learning_rate": 1.3063199681675398e-05, "loss": 0.5314, "step": 2673 }, { "epoch": 1.193962604190133, "grad_norm": 0.5099003968452915, "learning_rate": 1.3058702239379374e-05, "loss": 0.5357, "step": 2674 }, { "epoch": 1.1944131561162423, "grad_norm": 0.538443488284015, "learning_rate": 1.305420411444034e-05, "loss": 0.5347, "step": 2675 }, { "epoch": 1.1948637080423519, "grad_norm": 0.5510200903863639, "learning_rate": 1.304970530786219e-05, "loss": 0.5514, "step": 2676 }, { "epoch": 1.1953142599684614, "grad_norm": 0.557159080450487, "learning_rate": 1.3045205820648969e-05, "loss": 0.5404, "step": 2677 }, { "epoch": 1.1957648118945707, "grad_norm": 0.5589249527589923, "learning_rate": 1.304070565380488e-05, "loss": 0.5804, "step": 2678 }, { "epoch": 1.1962153638206803, "grad_norm": 0.5641641353618894, "learning_rate": 1.3036204808334267e-05, "loss": 0.5648, "step": 2679 }, { "epoch": 1.1966659157467898, "grad_norm": 0.5383011202599708, "learning_rate": 1.3031703285241632e-05, "loss": 0.5277, "step": 2680 }, { "epoch": 1.1971164676728994, "grad_norm": 0.7058308938816441, "learning_rate": 1.3027201085531633e-05, "loss": 0.5548, "step": 2681 }, { "epoch": 1.197567019599009, "grad_norm": 0.5525543233103771, "learning_rate": 1.3022698210209069e-05, "loss": 0.4813, "step": 2682 }, { "epoch": 1.1980175715251182, "grad_norm": 0.5680924712805282, "learning_rate": 1.3018194660278895e-05, "loss": 0.5399, "step": 2683 }, { "epoch": 1.1984681234512278, "grad_norm": 0.5268141452955944, "learning_rate": 1.3013690436746218e-05, "loss": 0.5602, "step": 2684 }, { "epoch": 1.1989186753773373, "grad_norm": 0.5636734699181754, "learning_rate": 1.300918554061629e-05, "loss": 0.5424, "step": 2685 }, { "epoch": 1.1993692273034466, "grad_norm": 0.5269364955873043, "learning_rate": 1.300467997289452e-05, "loss": 0.5147, "step": 2686 }, { "epoch": 1.1998197792295562, "grad_norm": 0.5566951287942556, "learning_rate": 1.3000173734586461e-05, "loss": 0.5433, "step": 2687 }, { "epoch": 1.2002703311556657, "grad_norm": 0.5343655202513623, "learning_rate": 1.299566682669782e-05, "loss": 0.5327, "step": 2688 }, { "epoch": 1.2007208830817753, "grad_norm": 0.5458935888085129, "learning_rate": 1.2991159250234449e-05, "loss": 0.549, "step": 2689 }, { "epoch": 1.2011714350078846, "grad_norm": 0.5696129137150898, "learning_rate": 1.2986651006202353e-05, "loss": 0.565, "step": 2690 }, { "epoch": 1.2016219869339941, "grad_norm": 0.542810512456624, "learning_rate": 1.2982142095607686e-05, "loss": 0.5571, "step": 2691 }, { "epoch": 1.2020725388601037, "grad_norm": 0.5452622515221659, "learning_rate": 1.2977632519456745e-05, "loss": 0.5165, "step": 2692 }, { "epoch": 1.202523090786213, "grad_norm": 0.5420503374035951, "learning_rate": 1.2973122278755983e-05, "loss": 0.5428, "step": 2693 }, { "epoch": 1.2029736427123225, "grad_norm": 0.5796488897247303, "learning_rate": 1.2968611374511999e-05, "loss": 0.5856, "step": 2694 }, { "epoch": 1.203424194638432, "grad_norm": 0.5268454368825681, "learning_rate": 1.2964099807731539e-05, "loss": 0.5656, "step": 2695 }, { "epoch": 1.2038747465645416, "grad_norm": 0.543817089742169, "learning_rate": 1.2959587579421493e-05, "loss": 0.5503, "step": 2696 }, { "epoch": 1.204325298490651, "grad_norm": 0.5516080698375809, "learning_rate": 1.295507469058891e-05, "loss": 0.5207, "step": 2697 }, { "epoch": 1.2047758504167605, "grad_norm": 0.5247700602423896, "learning_rate": 1.2950561142240973e-05, "loss": 0.551, "step": 2698 }, { "epoch": 1.20522640234287, "grad_norm": 0.5450186414023476, "learning_rate": 1.2946046935385023e-05, "loss": 0.5587, "step": 2699 }, { "epoch": 1.2056769542689796, "grad_norm": 0.5564397430436963, "learning_rate": 1.2941532071028541e-05, "loss": 0.535, "step": 2700 }, { "epoch": 1.206127506195089, "grad_norm": 0.5173088518418875, "learning_rate": 1.2937016550179159e-05, "loss": 0.535, "step": 2701 }, { "epoch": 1.2065780581211984, "grad_norm": 0.5464969534462375, "learning_rate": 1.293250037384465e-05, "loss": 0.5297, "step": 2702 }, { "epoch": 1.207028610047308, "grad_norm": 0.5445645077556358, "learning_rate": 1.292798354303294e-05, "loss": 0.5351, "step": 2703 }, { "epoch": 1.2074791619734175, "grad_norm": 0.5547323826648618, "learning_rate": 1.2923466058752097e-05, "loss": 0.5669, "step": 2704 }, { "epoch": 1.2079297138995269, "grad_norm": 0.5293616480854798, "learning_rate": 1.2918947922010336e-05, "loss": 0.523, "step": 2705 }, { "epoch": 1.2083802658256364, "grad_norm": 0.5409186082514323, "learning_rate": 1.2914429133816017e-05, "loss": 0.535, "step": 2706 }, { "epoch": 1.208830817751746, "grad_norm": 0.5392122864524949, "learning_rate": 1.2909909695177647e-05, "loss": 0.5134, "step": 2707 }, { "epoch": 1.2092813696778553, "grad_norm": 0.5264221424179725, "learning_rate": 1.2905389607103875e-05, "loss": 0.55, "step": 2708 }, { "epoch": 1.2097319216039648, "grad_norm": 0.5302092410808068, "learning_rate": 1.2900868870603502e-05, "loss": 0.551, "step": 2709 }, { "epoch": 1.2101824735300744, "grad_norm": 0.5069772529697989, "learning_rate": 1.2896347486685462e-05, "loss": 0.5537, "step": 2710 }, { "epoch": 1.210633025456184, "grad_norm": 0.5310554579923883, "learning_rate": 1.2891825456358844e-05, "loss": 0.5469, "step": 2711 }, { "epoch": 1.2110835773822932, "grad_norm": 0.5206794434796747, "learning_rate": 1.2887302780632876e-05, "loss": 0.5139, "step": 2712 }, { "epoch": 1.2115341293084028, "grad_norm": 0.5636225166616216, "learning_rate": 1.2882779460516935e-05, "loss": 0.5471, "step": 2713 }, { "epoch": 1.2119846812345123, "grad_norm": 0.5375592210962908, "learning_rate": 1.2878255497020533e-05, "loss": 0.5429, "step": 2714 }, { "epoch": 1.2124352331606219, "grad_norm": 0.5152494756830842, "learning_rate": 1.2873730891153335e-05, "loss": 0.536, "step": 2715 }, { "epoch": 1.2128857850867312, "grad_norm": 0.5376835254541351, "learning_rate": 1.2869205643925142e-05, "loss": 0.5328, "step": 2716 }, { "epoch": 1.2133363370128407, "grad_norm": 0.5396376373648636, "learning_rate": 1.2864679756345905e-05, "loss": 0.5381, "step": 2717 }, { "epoch": 1.2137868889389503, "grad_norm": 0.5367219840891512, "learning_rate": 1.2860153229425712e-05, "loss": 0.5405, "step": 2718 }, { "epoch": 1.2142374408650598, "grad_norm": 0.5662402043503111, "learning_rate": 1.2855626064174796e-05, "loss": 0.5372, "step": 2719 }, { "epoch": 1.2146879927911691, "grad_norm": 0.5332445927438019, "learning_rate": 1.2851098261603535e-05, "loss": 0.5744, "step": 2720 }, { "epoch": 1.2151385447172787, "grad_norm": 0.5512351687686222, "learning_rate": 1.2846569822722441e-05, "loss": 0.5312, "step": 2721 }, { "epoch": 1.2155890966433882, "grad_norm": 0.5442970214018428, "learning_rate": 1.284204074854218e-05, "loss": 0.5388, "step": 2722 }, { "epoch": 1.2160396485694975, "grad_norm": 0.532133664676364, "learning_rate": 1.283751104007355e-05, "loss": 0.5673, "step": 2723 }, { "epoch": 1.216490200495607, "grad_norm": 0.5329046027338183, "learning_rate": 1.2832980698327495e-05, "loss": 0.5255, "step": 2724 }, { "epoch": 1.2169407524217166, "grad_norm": 0.5736580450133879, "learning_rate": 1.28284497243151e-05, "loss": 0.5299, "step": 2725 }, { "epoch": 1.2173913043478262, "grad_norm": 0.5239518698223353, "learning_rate": 1.2823918119047591e-05, "loss": 0.5213, "step": 2726 }, { "epoch": 1.2178418562739355, "grad_norm": 0.5309024106566795, "learning_rate": 1.2819385883536332e-05, "loss": 0.5211, "step": 2727 }, { "epoch": 1.218292408200045, "grad_norm": 0.5990650787504634, "learning_rate": 1.281485301879283e-05, "loss": 0.537, "step": 2728 }, { "epoch": 1.2187429601261546, "grad_norm": 0.5249410032091041, "learning_rate": 1.2810319525828737e-05, "loss": 0.5514, "step": 2729 }, { "epoch": 1.2191935120522641, "grad_norm": 0.5624330500125361, "learning_rate": 1.2805785405655833e-05, "loss": 0.5393, "step": 2730 }, { "epoch": 1.2196440639783734, "grad_norm": 0.5747771315258541, "learning_rate": 1.2801250659286054e-05, "loss": 0.5196, "step": 2731 }, { "epoch": 1.220094615904483, "grad_norm": 0.5230017831564547, "learning_rate": 1.2796715287731463e-05, "loss": 0.5205, "step": 2732 }, { "epoch": 1.2205451678305925, "grad_norm": 0.5309602263760007, "learning_rate": 1.2792179292004265e-05, "loss": 0.4947, "step": 2733 }, { "epoch": 1.220995719756702, "grad_norm": 0.5573602311332846, "learning_rate": 1.2787642673116811e-05, "loss": 0.5584, "step": 2734 }, { "epoch": 1.2214462716828114, "grad_norm": 0.6136450388528623, "learning_rate": 1.2783105432081584e-05, "loss": 0.5461, "step": 2735 }, { "epoch": 1.221896823608921, "grad_norm": 0.5369972704741242, "learning_rate": 1.2778567569911209e-05, "loss": 0.5464, "step": 2736 }, { "epoch": 1.2223473755350305, "grad_norm": 0.5449579761658416, "learning_rate": 1.2774029087618448e-05, "loss": 0.5447, "step": 2737 }, { "epoch": 1.2227979274611398, "grad_norm": 0.575396208138024, "learning_rate": 1.2769489986216202e-05, "loss": 0.5467, "step": 2738 }, { "epoch": 1.2232484793872493, "grad_norm": 0.5687406025996558, "learning_rate": 1.2764950266717511e-05, "loss": 0.5313, "step": 2739 }, { "epoch": 1.2236990313133589, "grad_norm": 0.5857655941948733, "learning_rate": 1.2760409930135552e-05, "loss": 0.5544, "step": 2740 }, { "epoch": 1.2241495832394684, "grad_norm": 0.5701469517816236, "learning_rate": 1.275586897748364e-05, "loss": 0.5314, "step": 2741 }, { "epoch": 1.2246001351655778, "grad_norm": 0.6015520692126528, "learning_rate": 1.2751327409775227e-05, "loss": 0.5353, "step": 2742 }, { "epoch": 1.2250506870916873, "grad_norm": 0.5808413471817728, "learning_rate": 1.2746785228023904e-05, "loss": 0.5421, "step": 2743 }, { "epoch": 1.2255012390177968, "grad_norm": 0.5633085736492418, "learning_rate": 1.2742242433243396e-05, "loss": 0.5299, "step": 2744 }, { "epoch": 1.2259517909439062, "grad_norm": 0.552896215788886, "learning_rate": 1.2737699026447573e-05, "loss": 0.5396, "step": 2745 }, { "epoch": 1.2264023428700157, "grad_norm": 0.6251420895482046, "learning_rate": 1.2733155008650426e-05, "loss": 0.5497, "step": 2746 }, { "epoch": 1.2268528947961252, "grad_norm": 0.5493161607478098, "learning_rate": 1.2728610380866097e-05, "loss": 0.5477, "step": 2747 }, { "epoch": 1.2273034467222348, "grad_norm": 0.5283216326519315, "learning_rate": 1.2724065144108858e-05, "loss": 0.531, "step": 2748 }, { "epoch": 1.2277539986483443, "grad_norm": 0.611602913349481, "learning_rate": 1.2719519299393117e-05, "loss": 0.5737, "step": 2749 }, { "epoch": 1.2282045505744537, "grad_norm": 0.532418718854545, "learning_rate": 1.2714972847733418e-05, "loss": 0.5235, "step": 2750 }, { "epoch": 1.2286551025005632, "grad_norm": 0.5658471425409108, "learning_rate": 1.2710425790144445e-05, "loss": 0.5631, "step": 2751 }, { "epoch": 1.2291056544266727, "grad_norm": 0.543379889494761, "learning_rate": 1.2705878127641007e-05, "loss": 0.5427, "step": 2752 }, { "epoch": 1.229556206352782, "grad_norm": 0.5526777020154706, "learning_rate": 1.2701329861238057e-05, "loss": 0.5451, "step": 2753 }, { "epoch": 1.2300067582788916, "grad_norm": 0.5714126630679512, "learning_rate": 1.2696780991950681e-05, "loss": 0.539, "step": 2754 }, { "epoch": 1.2304573102050012, "grad_norm": 0.5243905648901731, "learning_rate": 1.2692231520794094e-05, "loss": 0.5512, "step": 2755 }, { "epoch": 1.2309078621311107, "grad_norm": 0.5832474961799542, "learning_rate": 1.2687681448783655e-05, "loss": 0.5663, "step": 2756 }, { "epoch": 1.23135841405722, "grad_norm": 0.5661067946269102, "learning_rate": 1.268313077693485e-05, "loss": 0.5228, "step": 2757 }, { "epoch": 1.2318089659833296, "grad_norm": 0.5596901861661625, "learning_rate": 1.2678579506263299e-05, "loss": 0.5296, "step": 2758 }, { "epoch": 1.232259517909439, "grad_norm": 0.5646818608921428, "learning_rate": 1.2674027637784759e-05, "loss": 0.5487, "step": 2759 }, { "epoch": 1.2327100698355484, "grad_norm": 0.5780219073370139, "learning_rate": 1.2669475172515115e-05, "loss": 0.5238, "step": 2760 }, { "epoch": 1.233160621761658, "grad_norm": 0.604396291108298, "learning_rate": 1.2664922111470396e-05, "loss": 0.5585, "step": 2761 }, { "epoch": 1.2336111736877675, "grad_norm": 0.5578203296628594, "learning_rate": 1.2660368455666752e-05, "loss": 0.5428, "step": 2762 }, { "epoch": 1.234061725613877, "grad_norm": 0.5580375657466478, "learning_rate": 1.2655814206120472e-05, "loss": 0.596, "step": 2763 }, { "epoch": 1.2345122775399866, "grad_norm": 0.5310686077555594, "learning_rate": 1.2651259363847976e-05, "loss": 0.503, "step": 2764 }, { "epoch": 1.234962829466096, "grad_norm": 0.5327331014113506, "learning_rate": 1.2646703929865817e-05, "loss": 0.5221, "step": 2765 }, { "epoch": 1.2354133813922055, "grad_norm": 0.5516843776881915, "learning_rate": 1.2642147905190677e-05, "loss": 0.5346, "step": 2766 }, { "epoch": 1.235863933318315, "grad_norm": 0.5673304578151085, "learning_rate": 1.2637591290839377e-05, "loss": 0.5473, "step": 2767 }, { "epoch": 1.2363144852444243, "grad_norm": 0.5416369063824926, "learning_rate": 1.2633034087828859e-05, "loss": 0.5462, "step": 2768 }, { "epoch": 1.2367650371705339, "grad_norm": 0.5757826264680216, "learning_rate": 1.2628476297176206e-05, "loss": 0.5409, "step": 2769 }, { "epoch": 1.2372155890966434, "grad_norm": 0.5161796375652606, "learning_rate": 1.2623917919898632e-05, "loss": 0.5428, "step": 2770 }, { "epoch": 1.237666141022753, "grad_norm": 0.543164648388002, "learning_rate": 1.261935895701347e-05, "loss": 0.5358, "step": 2771 }, { "epoch": 1.2381166929488623, "grad_norm": 0.610672365225943, "learning_rate": 1.26147994095382e-05, "loss": 0.5504, "step": 2772 }, { "epoch": 1.2385672448749718, "grad_norm": 0.5451949613336983, "learning_rate": 1.2610239278490416e-05, "loss": 0.5303, "step": 2773 }, { "epoch": 1.2390177968010814, "grad_norm": 0.5496302646828318, "learning_rate": 1.2605678564887862e-05, "loss": 0.5412, "step": 2774 }, { "epoch": 1.2394683487271907, "grad_norm": 0.5205694312338793, "learning_rate": 1.2601117269748391e-05, "loss": 0.5295, "step": 2775 }, { "epoch": 1.2394683487271907, "eval_loss": 0.6407710909843445, "eval_runtime": 24.3917, "eval_samples_per_second": 11.438, "eval_steps_per_second": 0.492, "step": 2775 }, { "epoch": 1.2399189006533002, "grad_norm": 0.5585188356270934, "learning_rate": 1.2596555394089998e-05, "loss": 0.5586, "step": 2776 }, { "epoch": 1.2403694525794098, "grad_norm": 0.5237927846107138, "learning_rate": 1.2591992938930808e-05, "loss": 0.548, "step": 2777 }, { "epoch": 1.2408200045055193, "grad_norm": 0.5917796679814796, "learning_rate": 1.258742990528907e-05, "loss": 0.5309, "step": 2778 }, { "epoch": 1.2412705564316289, "grad_norm": 0.5383551949456513, "learning_rate": 1.2582866294183167e-05, "loss": 0.5248, "step": 2779 }, { "epoch": 1.2417211083577382, "grad_norm": 0.5898388313133788, "learning_rate": 1.2578302106631606e-05, "loss": 0.5417, "step": 2780 }, { "epoch": 1.2421716602838477, "grad_norm": 0.5299781702574662, "learning_rate": 1.2573737343653026e-05, "loss": 0.5338, "step": 2781 }, { "epoch": 1.2426222122099573, "grad_norm": 0.5431526502344307, "learning_rate": 1.2569172006266192e-05, "loss": 0.5247, "step": 2782 }, { "epoch": 1.2430727641360666, "grad_norm": 0.5459088567245328, "learning_rate": 1.2564606095490001e-05, "loss": 0.5452, "step": 2783 }, { "epoch": 1.2435233160621761, "grad_norm": 0.5592315355582396, "learning_rate": 1.2560039612343477e-05, "loss": 0.5517, "step": 2784 }, { "epoch": 1.2439738679882857, "grad_norm": 0.5756797216862264, "learning_rate": 1.2555472557845767e-05, "loss": 0.5551, "step": 2785 }, { "epoch": 1.2444244199143952, "grad_norm": 0.5857822210323945, "learning_rate": 1.2550904933016152e-05, "loss": 0.5798, "step": 2786 }, { "epoch": 1.2448749718405046, "grad_norm": 0.5576812098758511, "learning_rate": 1.2546336738874037e-05, "loss": 0.5205, "step": 2787 }, { "epoch": 1.245325523766614, "grad_norm": 0.5739022872979528, "learning_rate": 1.254176797643895e-05, "loss": 0.521, "step": 2788 }, { "epoch": 1.2457760756927236, "grad_norm": 0.5576615760248046, "learning_rate": 1.2537198646730554e-05, "loss": 0.5514, "step": 2789 }, { "epoch": 1.246226627618833, "grad_norm": 0.5530630738125171, "learning_rate": 1.2532628750768635e-05, "loss": 0.5529, "step": 2790 }, { "epoch": 1.2466771795449425, "grad_norm": 0.552505765246583, "learning_rate": 1.2528058289573102e-05, "loss": 0.5266, "step": 2791 }, { "epoch": 1.247127731471052, "grad_norm": 0.5323079753833205, "learning_rate": 1.2523487264163997e-05, "loss": 0.5265, "step": 2792 }, { "epoch": 1.2475782833971616, "grad_norm": 0.5226824241138399, "learning_rate": 1.2518915675561482e-05, "loss": 0.5489, "step": 2793 }, { "epoch": 1.248028835323271, "grad_norm": 0.5782616471031384, "learning_rate": 1.2514343524785848e-05, "loss": 0.511, "step": 2794 }, { "epoch": 1.2484793872493805, "grad_norm": 0.5625338075278479, "learning_rate": 1.2509770812857509e-05, "loss": 0.5521, "step": 2795 }, { "epoch": 1.24892993917549, "grad_norm": 0.5442424253586947, "learning_rate": 1.2505197540797006e-05, "loss": 0.5541, "step": 2796 }, { "epoch": 1.2493804911015995, "grad_norm": 0.5326767509331735, "learning_rate": 1.2500623709625008e-05, "loss": 0.5614, "step": 2797 }, { "epoch": 1.2498310430277089, "grad_norm": 0.563205511278612, "learning_rate": 1.24960493203623e-05, "loss": 0.5481, "step": 2798 }, { "epoch": 1.2502815949538184, "grad_norm": 0.57093496266155, "learning_rate": 1.2491474374029805e-05, "loss": 0.5459, "step": 2799 }, { "epoch": 1.250732146879928, "grad_norm": 0.5230938312949769, "learning_rate": 1.2486898871648552e-05, "loss": 0.5337, "step": 2800 }, { "epoch": 1.2511826988060375, "grad_norm": 0.565838624186354, "learning_rate": 1.2482322814239707e-05, "loss": 0.563, "step": 2801 }, { "epoch": 1.2516332507321468, "grad_norm": 0.5583498338867298, "learning_rate": 1.2477746202824563e-05, "loss": 0.5491, "step": 2802 }, { "epoch": 1.2520838026582564, "grad_norm": 0.535043386486007, "learning_rate": 1.2473169038424526e-05, "loss": 0.5394, "step": 2803 }, { "epoch": 1.252534354584366, "grad_norm": 0.5765008834761635, "learning_rate": 1.2468591322061132e-05, "loss": 0.5639, "step": 2804 }, { "epoch": 1.2529849065104752, "grad_norm": 0.54955193047871, "learning_rate": 1.2464013054756037e-05, "loss": 0.5466, "step": 2805 }, { "epoch": 1.2534354584365848, "grad_norm": 0.5629640164503728, "learning_rate": 1.2459434237531023e-05, "loss": 0.5197, "step": 2806 }, { "epoch": 1.2538860103626943, "grad_norm": 0.5664437322084154, "learning_rate": 1.2454854871407993e-05, "loss": 0.5747, "step": 2807 }, { "epoch": 1.2543365622888039, "grad_norm": 0.5192989458010204, "learning_rate": 1.2450274957408973e-05, "loss": 0.5434, "step": 2808 }, { "epoch": 1.2547871142149134, "grad_norm": 0.5570811459866294, "learning_rate": 1.2445694496556108e-05, "loss": 0.5403, "step": 2809 }, { "epoch": 1.2552376661410227, "grad_norm": 0.5382019725736075, "learning_rate": 1.2441113489871675e-05, "loss": 0.5137, "step": 2810 }, { "epoch": 1.2556882180671323, "grad_norm": 0.5741100948365891, "learning_rate": 1.2436531938378058e-05, "loss": 0.5456, "step": 2811 }, { "epoch": 1.2561387699932416, "grad_norm": 0.5274020407820713, "learning_rate": 1.2431949843097776e-05, "loss": 0.5259, "step": 2812 }, { "epoch": 1.2565893219193511, "grad_norm": 0.5390107031254662, "learning_rate": 1.2427367205053458e-05, "loss": 0.5541, "step": 2813 }, { "epoch": 1.2570398738454607, "grad_norm": 0.547328727700666, "learning_rate": 1.2422784025267864e-05, "loss": 0.5338, "step": 2814 }, { "epoch": 1.2574904257715702, "grad_norm": 0.5674388536821203, "learning_rate": 1.2418200304763871e-05, "loss": 0.5681, "step": 2815 }, { "epoch": 1.2579409776976798, "grad_norm": 0.5411642069683669, "learning_rate": 1.2413616044564478e-05, "loss": 0.5297, "step": 2816 }, { "epoch": 1.258391529623789, "grad_norm": 0.5454370352510366, "learning_rate": 1.2409031245692798e-05, "loss": 0.5585, "step": 2817 }, { "epoch": 1.2588420815498986, "grad_norm": 0.5541670859453669, "learning_rate": 1.2404445909172074e-05, "loss": 0.5619, "step": 2818 }, { "epoch": 1.2592926334760082, "grad_norm": 0.5148378838423551, "learning_rate": 1.239986003602566e-05, "loss": 0.5144, "step": 2819 }, { "epoch": 1.2597431854021175, "grad_norm": 0.531635265998524, "learning_rate": 1.2395273627277036e-05, "loss": 0.5369, "step": 2820 }, { "epoch": 1.260193737328227, "grad_norm": 0.4992690169069533, "learning_rate": 1.2390686683949799e-05, "loss": 0.5092, "step": 2821 }, { "epoch": 1.2606442892543366, "grad_norm": 0.5471826691985632, "learning_rate": 1.238609920706767e-05, "loss": 0.5531, "step": 2822 }, { "epoch": 1.2610948411804461, "grad_norm": 0.5244002157374751, "learning_rate": 1.238151119765448e-05, "loss": 0.5284, "step": 2823 }, { "epoch": 1.2615453931065557, "grad_norm": 0.5254558555560961, "learning_rate": 1.2376922656734182e-05, "loss": 0.5324, "step": 2824 }, { "epoch": 1.261995945032665, "grad_norm": 0.5354607435249603, "learning_rate": 1.2372333585330853e-05, "loss": 0.5726, "step": 2825 }, { "epoch": 1.2624464969587745, "grad_norm": 0.5651519864939345, "learning_rate": 1.2367743984468686e-05, "loss": 0.5518, "step": 2826 }, { "epoch": 1.2628970488848839, "grad_norm": 0.5350855976727646, "learning_rate": 1.2363153855171985e-05, "loss": 0.5182, "step": 2827 }, { "epoch": 1.2633476008109934, "grad_norm": 0.5096561329991277, "learning_rate": 1.2358563198465184e-05, "loss": 0.5264, "step": 2828 }, { "epoch": 1.263798152737103, "grad_norm": 0.5722517086104103, "learning_rate": 1.2353972015372825e-05, "loss": 0.5645, "step": 2829 }, { "epoch": 1.2642487046632125, "grad_norm": 0.5368897129450148, "learning_rate": 1.2349380306919574e-05, "loss": 0.5632, "step": 2830 }, { "epoch": 1.264699256589322, "grad_norm": 0.5486310263140914, "learning_rate": 1.2344788074130207e-05, "loss": 0.5381, "step": 2831 }, { "epoch": 1.2651498085154314, "grad_norm": 0.5487137980451916, "learning_rate": 1.2340195318029623e-05, "loss": 0.5647, "step": 2832 }, { "epoch": 1.265600360441541, "grad_norm": 0.5791146135905902, "learning_rate": 1.233560203964284e-05, "loss": 0.539, "step": 2833 }, { "epoch": 1.2660509123676504, "grad_norm": 0.5310217134003614, "learning_rate": 1.2331008239994986e-05, "loss": 0.5587, "step": 2834 }, { "epoch": 1.2665014642937598, "grad_norm": 0.5666582382466298, "learning_rate": 1.2326413920111304e-05, "loss": 0.5491, "step": 2835 }, { "epoch": 1.2669520162198693, "grad_norm": 0.5363688652321479, "learning_rate": 1.2321819081017161e-05, "loss": 0.5296, "step": 2836 }, { "epoch": 1.2674025681459788, "grad_norm": 0.5618771648995419, "learning_rate": 1.2317223723738036e-05, "loss": 0.56, "step": 2837 }, { "epoch": 1.2678531200720884, "grad_norm": 0.5652343448894805, "learning_rate": 1.2312627849299523e-05, "loss": 0.5227, "step": 2838 }, { "epoch": 1.2683036719981977, "grad_norm": 0.5432276609715341, "learning_rate": 1.2308031458727331e-05, "loss": 0.5628, "step": 2839 }, { "epoch": 1.2687542239243073, "grad_norm": 0.5226556566357317, "learning_rate": 1.2303434553047289e-05, "loss": 0.5279, "step": 2840 }, { "epoch": 1.2692047758504168, "grad_norm": 0.5422132459897447, "learning_rate": 1.2298837133285331e-05, "loss": 0.5774, "step": 2841 }, { "epoch": 1.2696553277765261, "grad_norm": 0.5443949081529654, "learning_rate": 1.2294239200467516e-05, "loss": 0.5507, "step": 2842 }, { "epoch": 1.2701058797026357, "grad_norm": 0.5336670630824856, "learning_rate": 1.2289640755620016e-05, "loss": 0.5332, "step": 2843 }, { "epoch": 1.2705564316287452, "grad_norm": 0.5422108623792434, "learning_rate": 1.2285041799769109e-05, "loss": 0.5412, "step": 2844 }, { "epoch": 1.2710069835548548, "grad_norm": 0.5244904890304235, "learning_rate": 1.2280442333941195e-05, "loss": 0.5334, "step": 2845 }, { "epoch": 1.2714575354809643, "grad_norm": 0.5520309370854738, "learning_rate": 1.2275842359162785e-05, "loss": 0.5254, "step": 2846 }, { "epoch": 1.2719080874070736, "grad_norm": 0.5430996525611371, "learning_rate": 1.2271241876460507e-05, "loss": 0.5678, "step": 2847 }, { "epoch": 1.2723586393331832, "grad_norm": 0.5572833923844616, "learning_rate": 1.2266640886861097e-05, "loss": 0.5476, "step": 2848 }, { "epoch": 1.2728091912592927, "grad_norm": 0.5391201629318152, "learning_rate": 1.2262039391391405e-05, "loss": 0.5509, "step": 2849 }, { "epoch": 1.273259743185402, "grad_norm": 0.6346707228281973, "learning_rate": 1.22574373910784e-05, "loss": 0.5574, "step": 2850 }, { "epoch": 1.2737102951115116, "grad_norm": 0.5486265841334957, "learning_rate": 1.2252834886949155e-05, "loss": 0.5444, "step": 2851 }, { "epoch": 1.2741608470376211, "grad_norm": 0.5381935377801568, "learning_rate": 1.2248231880030861e-05, "loss": 0.5482, "step": 2852 }, { "epoch": 1.2746113989637307, "grad_norm": 0.530072769606086, "learning_rate": 1.2243628371350822e-05, "loss": 0.5317, "step": 2853 }, { "epoch": 1.27506195088984, "grad_norm": 0.5480987077746934, "learning_rate": 1.223902436193645e-05, "loss": 0.5386, "step": 2854 }, { "epoch": 1.2755125028159495, "grad_norm": 0.5276970831683134, "learning_rate": 1.223441985281527e-05, "loss": 0.5677, "step": 2855 }, { "epoch": 1.275963054742059, "grad_norm": 0.5488998976127041, "learning_rate": 1.2229814845014918e-05, "loss": 0.5673, "step": 2856 }, { "epoch": 1.2764136066681684, "grad_norm": 0.5367667019755298, "learning_rate": 1.2225209339563144e-05, "loss": 0.5218, "step": 2857 }, { "epoch": 1.276864158594278, "grad_norm": 0.5241224526065539, "learning_rate": 1.222060333748781e-05, "loss": 0.575, "step": 2858 }, { "epoch": 1.2773147105203875, "grad_norm": 0.5492120042721081, "learning_rate": 1.2215996839816886e-05, "loss": 0.5707, "step": 2859 }, { "epoch": 1.277765262446497, "grad_norm": 0.5371003312174816, "learning_rate": 1.221138984757845e-05, "loss": 0.5249, "step": 2860 }, { "epoch": 1.2782158143726066, "grad_norm": 0.5267123804599224, "learning_rate": 1.2206782361800691e-05, "loss": 0.5388, "step": 2861 }, { "epoch": 1.2786663662987159, "grad_norm": 0.5501682359632158, "learning_rate": 1.2202174383511916e-05, "loss": 0.5348, "step": 2862 }, { "epoch": 1.2791169182248254, "grad_norm": 0.5382659162527933, "learning_rate": 1.2197565913740531e-05, "loss": 0.5497, "step": 2863 }, { "epoch": 1.279567470150935, "grad_norm": 0.5391027687293202, "learning_rate": 1.2192956953515065e-05, "loss": 0.5268, "step": 2864 }, { "epoch": 1.2800180220770443, "grad_norm": 0.5563236885176092, "learning_rate": 1.2188347503864142e-05, "loss": 0.5409, "step": 2865 }, { "epoch": 1.2804685740031538, "grad_norm": 0.5529301764231546, "learning_rate": 1.2183737565816502e-05, "loss": 0.5544, "step": 2866 }, { "epoch": 1.2809191259292634, "grad_norm": 0.5212527837472056, "learning_rate": 1.2179127140400997e-05, "loss": 0.5398, "step": 2867 }, { "epoch": 1.281369677855373, "grad_norm": 0.5689210336680888, "learning_rate": 1.2174516228646582e-05, "loss": 0.5603, "step": 2868 }, { "epoch": 1.2818202297814822, "grad_norm": 0.5385779593298883, "learning_rate": 1.2169904831582324e-05, "loss": 0.556, "step": 2869 }, { "epoch": 1.2822707817075918, "grad_norm": 0.5791838958886656, "learning_rate": 1.21652929502374e-05, "loss": 0.5333, "step": 2870 }, { "epoch": 1.2827213336337013, "grad_norm": 0.5435203131513179, "learning_rate": 1.2160680585641084e-05, "loss": 0.5289, "step": 2871 }, { "epoch": 1.2831718855598107, "grad_norm": 0.5366743787770428, "learning_rate": 1.2156067738822777e-05, "loss": 0.5893, "step": 2872 }, { "epoch": 1.2836224374859202, "grad_norm": 0.5600690858325665, "learning_rate": 1.2151454410811968e-05, "loss": 0.5299, "step": 2873 }, { "epoch": 1.2840729894120297, "grad_norm": 0.5585888952152408, "learning_rate": 1.2146840602638268e-05, "loss": 0.516, "step": 2874 }, { "epoch": 1.2845235413381393, "grad_norm": 0.5177457258931797, "learning_rate": 1.2142226315331387e-05, "loss": 0.5119, "step": 2875 }, { "epoch": 1.2849740932642488, "grad_norm": 0.540722312687846, "learning_rate": 1.2137611549921147e-05, "loss": 0.5741, "step": 2876 }, { "epoch": 1.2854246451903582, "grad_norm": 0.5893122627554512, "learning_rate": 1.213299630743747e-05, "loss": 0.5453, "step": 2877 }, { "epoch": 1.2858751971164677, "grad_norm": 0.5669038946956618, "learning_rate": 1.2128380588910391e-05, "loss": 0.5901, "step": 2878 }, { "epoch": 1.2863257490425772, "grad_norm": 0.5714321718604483, "learning_rate": 1.212376439537005e-05, "loss": 0.5522, "step": 2879 }, { "epoch": 1.2867763009686866, "grad_norm": 0.5492676303089635, "learning_rate": 1.211914772784669e-05, "loss": 0.5399, "step": 2880 }, { "epoch": 1.287226852894796, "grad_norm": 0.5231603331379724, "learning_rate": 1.2114530587370662e-05, "loss": 0.515, "step": 2881 }, { "epoch": 1.2876774048209056, "grad_norm": 0.5597215156709046, "learning_rate": 1.2109912974972424e-05, "loss": 0.5396, "step": 2882 }, { "epoch": 1.2881279567470152, "grad_norm": 0.5454299825532166, "learning_rate": 1.2105294891682533e-05, "loss": 0.5271, "step": 2883 }, { "epoch": 1.2885785086731245, "grad_norm": 0.5382107718174441, "learning_rate": 1.2100676338531661e-05, "loss": 0.5751, "step": 2884 }, { "epoch": 1.289029060599234, "grad_norm": 0.5381429042398873, "learning_rate": 1.2096057316550576e-05, "loss": 0.5966, "step": 2885 }, { "epoch": 1.2894796125253436, "grad_norm": 0.5258170790485999, "learning_rate": 1.2091437826770154e-05, "loss": 0.542, "step": 2886 }, { "epoch": 1.289930164451453, "grad_norm": 0.5339340577364331, "learning_rate": 1.2086817870221376e-05, "loss": 0.5386, "step": 2887 }, { "epoch": 1.2903807163775625, "grad_norm": 0.5596129000230871, "learning_rate": 1.2082197447935328e-05, "loss": 0.5274, "step": 2888 }, { "epoch": 1.290831268303672, "grad_norm": 0.5417771806525132, "learning_rate": 1.20775765609432e-05, "loss": 0.5684, "step": 2889 }, { "epoch": 1.2912818202297816, "grad_norm": 0.5489254850433792, "learning_rate": 1.2072955210276281e-05, "loss": 0.5152, "step": 2890 }, { "epoch": 1.291732372155891, "grad_norm": 0.5390755874441743, "learning_rate": 1.2068333396965968e-05, "loss": 0.5222, "step": 2891 }, { "epoch": 1.2921829240820004, "grad_norm": 0.5442436005252642, "learning_rate": 1.2063711122043759e-05, "loss": 0.5619, "step": 2892 }, { "epoch": 1.29263347600811, "grad_norm": 0.5411959083602635, "learning_rate": 1.205908838654126e-05, "loss": 0.551, "step": 2893 }, { "epoch": 1.2930840279342193, "grad_norm": 0.5381352348782724, "learning_rate": 1.2054465191490172e-05, "loss": 0.5339, "step": 2894 }, { "epoch": 1.2935345798603288, "grad_norm": 0.5120723811327035, "learning_rate": 1.2049841537922307e-05, "loss": 0.5498, "step": 2895 }, { "epoch": 1.2939851317864384, "grad_norm": 0.5242929602102896, "learning_rate": 1.2045217426869566e-05, "loss": 0.5595, "step": 2896 }, { "epoch": 1.294435683712548, "grad_norm": 0.5559781623922975, "learning_rate": 1.204059285936397e-05, "loss": 0.5363, "step": 2897 }, { "epoch": 1.2948862356386575, "grad_norm": 0.5174228322399117, "learning_rate": 1.2035967836437625e-05, "loss": 0.514, "step": 2898 }, { "epoch": 1.2953367875647668, "grad_norm": 0.5226699387321099, "learning_rate": 1.2031342359122755e-05, "loss": 0.5411, "step": 2899 }, { "epoch": 1.2957873394908763, "grad_norm": 0.5659108414236881, "learning_rate": 1.202671642845167e-05, "loss": 0.5925, "step": 2900 }, { "epoch": 1.2962378914169859, "grad_norm": 0.5482214495850869, "learning_rate": 1.202209004545679e-05, "loss": 0.5464, "step": 2901 }, { "epoch": 1.2966884433430952, "grad_norm": 0.5299311583460597, "learning_rate": 1.2017463211170635e-05, "loss": 0.5294, "step": 2902 }, { "epoch": 1.2971389952692047, "grad_norm": 0.5662560185718251, "learning_rate": 1.2012835926625823e-05, "loss": 0.574, "step": 2903 }, { "epoch": 1.2975895471953143, "grad_norm": 0.538650889975412, "learning_rate": 1.2008208192855077e-05, "loss": 0.6012, "step": 2904 }, { "epoch": 1.2980400991214238, "grad_norm": 0.5418783887749242, "learning_rate": 1.2003580010891214e-05, "loss": 0.5047, "step": 2905 }, { "epoch": 1.2984906510475334, "grad_norm": 0.5612374930152574, "learning_rate": 1.1998951381767157e-05, "loss": 0.5528, "step": 2906 }, { "epoch": 1.2989412029736427, "grad_norm": 0.5518642509542633, "learning_rate": 1.1994322306515926e-05, "loss": 0.5681, "step": 2907 }, { "epoch": 1.2993917548997522, "grad_norm": 0.5359433586569251, "learning_rate": 1.1989692786170636e-05, "loss": 0.5059, "step": 2908 }, { "epoch": 1.2998423068258615, "grad_norm": 0.5335990885388519, "learning_rate": 1.1985062821764515e-05, "loss": 0.5434, "step": 2909 }, { "epoch": 1.300292858751971, "grad_norm": 0.5934651403947064, "learning_rate": 1.1980432414330873e-05, "loss": 0.5593, "step": 2910 }, { "epoch": 1.3007434106780806, "grad_norm": 0.5535977373216939, "learning_rate": 1.197580156490313e-05, "loss": 0.5551, "step": 2911 }, { "epoch": 1.3011939626041902, "grad_norm": 0.5357124703454401, "learning_rate": 1.1971170274514802e-05, "loss": 0.5505, "step": 2912 }, { "epoch": 1.3016445145302997, "grad_norm": 0.5667587143943307, "learning_rate": 1.1966538544199506e-05, "loss": 0.553, "step": 2913 }, { "epoch": 1.302095066456409, "grad_norm": 0.5656556048314922, "learning_rate": 1.1961906374990952e-05, "loss": 0.5079, "step": 2914 }, { "epoch": 1.3025456183825186, "grad_norm": 0.5313939878413356, "learning_rate": 1.195727376792295e-05, "loss": 0.5286, "step": 2915 }, { "epoch": 1.3029961703086281, "grad_norm": 0.5433858321133005, "learning_rate": 1.1952640724029407e-05, "loss": 0.5348, "step": 2916 }, { "epoch": 1.3034467222347375, "grad_norm": 0.5556183122555424, "learning_rate": 1.1948007244344334e-05, "loss": 0.5275, "step": 2917 }, { "epoch": 1.303897274160847, "grad_norm": 0.5705298139485353, "learning_rate": 1.1943373329901824e-05, "loss": 0.5408, "step": 2918 }, { "epoch": 1.3043478260869565, "grad_norm": 0.5394232137426886, "learning_rate": 1.1938738981736084e-05, "loss": 0.5446, "step": 2919 }, { "epoch": 1.304798378013066, "grad_norm": 0.542865056843831, "learning_rate": 1.193410420088141e-05, "loss": 0.533, "step": 2920 }, { "epoch": 1.3052489299391756, "grad_norm": 0.6631365537700317, "learning_rate": 1.1929468988372191e-05, "loss": 0.5492, "step": 2921 }, { "epoch": 1.305699481865285, "grad_norm": 0.7073685836148406, "learning_rate": 1.1924833345242921e-05, "loss": 0.552, "step": 2922 }, { "epoch": 1.3061500337913945, "grad_norm": 0.539861398498514, "learning_rate": 1.1920197272528185e-05, "loss": 0.5514, "step": 2923 }, { "epoch": 1.3066005857175038, "grad_norm": 0.5307532089708791, "learning_rate": 1.1915560771262664e-05, "loss": 0.5207, "step": 2924 }, { "epoch": 1.3070511376436134, "grad_norm": 0.5542122348017018, "learning_rate": 1.1910923842481134e-05, "loss": 0.5234, "step": 2925 }, { "epoch": 1.307501689569723, "grad_norm": 0.5602606621027156, "learning_rate": 1.190628648721847e-05, "loss": 0.5432, "step": 2926 }, { "epoch": 1.3079522414958324, "grad_norm": 0.5417965157810973, "learning_rate": 1.1901648706509637e-05, "loss": 0.5582, "step": 2927 }, { "epoch": 1.308402793421942, "grad_norm": 0.5366478806900754, "learning_rate": 1.1897010501389698e-05, "loss": 0.5448, "step": 2928 }, { "epoch": 1.3088533453480513, "grad_norm": 0.5612695630874537, "learning_rate": 1.1892371872893812e-05, "loss": 0.5615, "step": 2929 }, { "epoch": 1.3093038972741609, "grad_norm": 0.52121026888872, "learning_rate": 1.1887732822057234e-05, "loss": 0.5584, "step": 2930 }, { "epoch": 1.3097544492002704, "grad_norm": 0.5380877409680623, "learning_rate": 1.1883093349915305e-05, "loss": 0.5454, "step": 2931 }, { "epoch": 1.3102050011263797, "grad_norm": 0.5440052625380027, "learning_rate": 1.1878453457503465e-05, "loss": 0.5622, "step": 2932 }, { "epoch": 1.3106555530524893, "grad_norm": 0.532044508113527, "learning_rate": 1.187381314585725e-05, "loss": 0.5232, "step": 2933 }, { "epoch": 1.3111061049785988, "grad_norm": 0.5504382849178293, "learning_rate": 1.1869172416012285e-05, "loss": 0.5742, "step": 2934 }, { "epoch": 1.3115566569047084, "grad_norm": 0.5314427435425049, "learning_rate": 1.1864531269004294e-05, "loss": 0.5302, "step": 2935 }, { "epoch": 1.3120072088308177, "grad_norm": 0.5530845379971511, "learning_rate": 1.1859889705869092e-05, "loss": 0.5271, "step": 2936 }, { "epoch": 1.3124577607569272, "grad_norm": 0.5461372252016856, "learning_rate": 1.185524772764258e-05, "loss": 0.5392, "step": 2937 }, { "epoch": 1.3129083126830368, "grad_norm": 0.5504821595905118, "learning_rate": 1.1850605335360766e-05, "loss": 0.5804, "step": 2938 }, { "epoch": 1.313358864609146, "grad_norm": 0.5879685934940109, "learning_rate": 1.1845962530059734e-05, "loss": 0.5512, "step": 2939 }, { "epoch": 1.3138094165352556, "grad_norm": 0.5689990531445419, "learning_rate": 1.1841319312775672e-05, "loss": 0.5741, "step": 2940 }, { "epoch": 1.3142599684613652, "grad_norm": 0.5483115481378813, "learning_rate": 1.1836675684544856e-05, "loss": 0.5596, "step": 2941 }, { "epoch": 1.3147105203874747, "grad_norm": 0.5806350888203295, "learning_rate": 1.1832031646403654e-05, "loss": 0.5361, "step": 2942 }, { "epoch": 1.3151610723135843, "grad_norm": 0.5566573522364622, "learning_rate": 1.1827387199388524e-05, "loss": 0.5325, "step": 2943 }, { "epoch": 1.3156116242396936, "grad_norm": 0.5455027612294823, "learning_rate": 1.1822742344536017e-05, "loss": 0.5172, "step": 2944 }, { "epoch": 1.3160621761658031, "grad_norm": 0.5805112367213028, "learning_rate": 1.1818097082882773e-05, "loss": 0.5421, "step": 2945 }, { "epoch": 1.3165127280919127, "grad_norm": 0.5619414418038399, "learning_rate": 1.1813451415465526e-05, "loss": 0.5551, "step": 2946 }, { "epoch": 1.316963280018022, "grad_norm": 0.5464443098217037, "learning_rate": 1.1808805343321102e-05, "loss": 0.5467, "step": 2947 }, { "epoch": 1.3174138319441315, "grad_norm": 0.5955786515274301, "learning_rate": 1.1804158867486406e-05, "loss": 0.5604, "step": 2948 }, { "epoch": 1.317864383870241, "grad_norm": 0.5931576020459974, "learning_rate": 1.1799511988998449e-05, "loss": 0.5416, "step": 2949 }, { "epoch": 1.3183149357963506, "grad_norm": 0.5828009265017366, "learning_rate": 1.1794864708894318e-05, "loss": 0.5232, "step": 2950 }, { "epoch": 1.31876548772246, "grad_norm": 0.5908444806099634, "learning_rate": 1.1790217028211202e-05, "loss": 0.5569, "step": 2951 }, { "epoch": 1.3192160396485695, "grad_norm": 0.5456903796129847, "learning_rate": 1.1785568947986368e-05, "loss": 0.5369, "step": 2952 }, { "epoch": 1.319666591574679, "grad_norm": 0.5610876775106571, "learning_rate": 1.178092046925718e-05, "loss": 0.543, "step": 2953 }, { "epoch": 1.3201171435007883, "grad_norm": 0.5342948404352477, "learning_rate": 1.1776271593061089e-05, "loss": 0.5239, "step": 2954 }, { "epoch": 1.320567695426898, "grad_norm": 0.5195946568615741, "learning_rate": 1.1771622320435631e-05, "loss": 0.5269, "step": 2955 }, { "epoch": 1.3210182473530074, "grad_norm": 0.5473222039051013, "learning_rate": 1.1766972652418438e-05, "loss": 0.5155, "step": 2956 }, { "epoch": 1.321468799279117, "grad_norm": 0.5481271382205317, "learning_rate": 1.176232259004722e-05, "loss": 0.5571, "step": 2957 }, { "epoch": 1.3219193512052265, "grad_norm": 0.578312451517863, "learning_rate": 1.1757672134359784e-05, "loss": 0.5556, "step": 2958 }, { "epoch": 1.3223699031313358, "grad_norm": 0.5264211328189307, "learning_rate": 1.1753021286394021e-05, "loss": 0.5684, "step": 2959 }, { "epoch": 1.3228204550574454, "grad_norm": 0.515124059811903, "learning_rate": 1.174837004718791e-05, "loss": 0.5114, "step": 2960 }, { "epoch": 1.323271006983555, "grad_norm": 0.545647656710088, "learning_rate": 1.1743718417779518e-05, "loss": 0.4937, "step": 2961 }, { "epoch": 1.3237215589096643, "grad_norm": 0.5542664354458303, "learning_rate": 1.1739066399206997e-05, "loss": 0.5814, "step": 2962 }, { "epoch": 1.3241721108357738, "grad_norm": 0.5355802109933703, "learning_rate": 1.173441399250859e-05, "loss": 0.5413, "step": 2963 }, { "epoch": 1.3246226627618833, "grad_norm": 0.5267880657801834, "learning_rate": 1.1729761198722622e-05, "loss": 0.5284, "step": 2964 }, { "epoch": 1.3250732146879929, "grad_norm": 0.5403010309259784, "learning_rate": 1.1725108018887507e-05, "loss": 0.5359, "step": 2965 }, { "epoch": 1.3255237666141022, "grad_norm": 0.5503486899019877, "learning_rate": 1.1720454454041745e-05, "loss": 0.5506, "step": 2966 }, { "epoch": 1.3259743185402117, "grad_norm": 0.5561200578566118, "learning_rate": 1.1715800505223918e-05, "loss": 0.5841, "step": 2967 }, { "epoch": 1.3264248704663213, "grad_norm": 0.556661460103497, "learning_rate": 1.1711146173472701e-05, "loss": 0.5501, "step": 2968 }, { "epoch": 1.3268754223924306, "grad_norm": 0.5554806267220642, "learning_rate": 1.1706491459826847e-05, "loss": 0.5167, "step": 2969 }, { "epoch": 1.3273259743185402, "grad_norm": 0.5294746551608028, "learning_rate": 1.1701836365325204e-05, "loss": 0.4996, "step": 2970 }, { "epoch": 1.3277765262446497, "grad_norm": 0.554955767168855, "learning_rate": 1.169718089100669e-05, "loss": 0.5422, "step": 2971 }, { "epoch": 1.3282270781707592, "grad_norm": 0.5633950393894789, "learning_rate": 1.1692525037910325e-05, "loss": 0.5464, "step": 2972 }, { "epoch": 1.3286776300968688, "grad_norm": 0.534286713535088, "learning_rate": 1.1687868807075197e-05, "loss": 0.5364, "step": 2973 }, { "epoch": 1.329128182022978, "grad_norm": 0.5955316054826013, "learning_rate": 1.1683212199540494e-05, "loss": 0.5266, "step": 2974 }, { "epoch": 1.3295787339490877, "grad_norm": 0.5303622087530439, "learning_rate": 1.1678555216345478e-05, "loss": 0.5449, "step": 2975 }, { "epoch": 1.3300292858751972, "grad_norm": 0.5867972152179158, "learning_rate": 1.1673897858529495e-05, "loss": 0.5425, "step": 2976 }, { "epoch": 1.3304798378013065, "grad_norm": 0.5746850202085582, "learning_rate": 1.166924012713198e-05, "loss": 0.5613, "step": 2977 }, { "epoch": 1.330930389727416, "grad_norm": 0.5549361216722061, "learning_rate": 1.1664582023192447e-05, "loss": 0.4826, "step": 2978 }, { "epoch": 1.3313809416535256, "grad_norm": 0.5891226841981271, "learning_rate": 1.1659923547750494e-05, "loss": 0.547, "step": 2979 }, { "epoch": 1.3318314935796352, "grad_norm": 0.5531028718040403, "learning_rate": 1.1655264701845801e-05, "loss": 0.5449, "step": 2980 }, { "epoch": 1.3322820455057445, "grad_norm": 0.5825258317285276, "learning_rate": 1.1650605486518134e-05, "loss": 0.5254, "step": 2981 }, { "epoch": 1.332732597431854, "grad_norm": 0.5185966601046113, "learning_rate": 1.164594590280734e-05, "loss": 0.5428, "step": 2982 }, { "epoch": 1.3331831493579636, "grad_norm": 0.5696421429744927, "learning_rate": 1.1641285951753347e-05, "loss": 0.5689, "step": 2983 }, { "epoch": 1.3336337012840729, "grad_norm": 0.52378696832765, "learning_rate": 1.1636625634396166e-05, "loss": 0.5525, "step": 2984 }, { "epoch": 1.3340842532101824, "grad_norm": 0.5397755242173563, "learning_rate": 1.1631964951775887e-05, "loss": 0.5459, "step": 2985 }, { "epoch": 1.334534805136292, "grad_norm": 0.5497264290023932, "learning_rate": 1.1627303904932687e-05, "loss": 0.5429, "step": 2986 }, { "epoch": 1.3349853570624015, "grad_norm": 0.5516459594983769, "learning_rate": 1.1622642494906819e-05, "loss": 0.5656, "step": 2987 }, { "epoch": 1.335435908988511, "grad_norm": 0.5281444445688066, "learning_rate": 1.1617980722738623e-05, "loss": 0.54, "step": 2988 }, { "epoch": 1.3358864609146204, "grad_norm": 0.5377934878406503, "learning_rate": 1.1613318589468512e-05, "loss": 0.5275, "step": 2989 }, { "epoch": 1.33633701284073, "grad_norm": 0.532997452078875, "learning_rate": 1.1608656096136985e-05, "loss": 0.5154, "step": 2990 }, { "epoch": 1.3367875647668392, "grad_norm": 0.5287159122378852, "learning_rate": 1.1603993243784624e-05, "loss": 0.528, "step": 2991 }, { "epoch": 1.3372381166929488, "grad_norm": 0.5375059354706785, "learning_rate": 1.1599330033452078e-05, "loss": 0.5141, "step": 2992 }, { "epoch": 1.3376886686190583, "grad_norm": 0.5416890099151227, "learning_rate": 1.1594666466180096e-05, "loss": 0.5555, "step": 2993 }, { "epoch": 1.3381392205451679, "grad_norm": 0.5423213996623887, "learning_rate": 1.1590002543009486e-05, "loss": 0.5274, "step": 2994 }, { "epoch": 1.3385897724712774, "grad_norm": 0.5568811594579056, "learning_rate": 1.1585338264981152e-05, "loss": 0.5891, "step": 2995 }, { "epoch": 1.3390403243973867, "grad_norm": 0.5471839765342834, "learning_rate": 1.1580673633136066e-05, "loss": 0.5181, "step": 2996 }, { "epoch": 1.3394908763234963, "grad_norm": 0.5355763870917528, "learning_rate": 1.1576008648515286e-05, "loss": 0.5345, "step": 2997 }, { "epoch": 1.3399414282496058, "grad_norm": 0.5598439869877851, "learning_rate": 1.1571343312159949e-05, "loss": 0.5381, "step": 2998 }, { "epoch": 1.3403919801757151, "grad_norm": 0.5454770160693929, "learning_rate": 1.156667762511126e-05, "loss": 0.5412, "step": 2999 }, { "epoch": 1.3408425321018247, "grad_norm": 0.5185486105556288, "learning_rate": 1.1562011588410513e-05, "loss": 0.5821, "step": 3000 }, { "epoch": 1.3412930840279342, "grad_norm": 0.5917407974494979, "learning_rate": 1.1557345203099082e-05, "loss": 0.5329, "step": 3001 }, { "epoch": 1.3417436359540438, "grad_norm": 0.5611389042270911, "learning_rate": 1.1552678470218406e-05, "loss": 0.5423, "step": 3002 }, { "epoch": 1.3421941878801533, "grad_norm": 0.5452896363322842, "learning_rate": 1.1548011390810016e-05, "loss": 0.5443, "step": 3003 }, { "epoch": 1.3426447398062626, "grad_norm": 0.5797377697330107, "learning_rate": 1.1543343965915508e-05, "loss": 0.5427, "step": 3004 }, { "epoch": 1.3430952917323722, "grad_norm": 0.5446885252464955, "learning_rate": 1.1538676196576563e-05, "loss": 0.5485, "step": 3005 }, { "epoch": 1.3435458436584815, "grad_norm": 0.5839608904941844, "learning_rate": 1.1534008083834937e-05, "loss": 0.5514, "step": 3006 }, { "epoch": 1.343996395584591, "grad_norm": 0.5130720000182243, "learning_rate": 1.1529339628732462e-05, "loss": 0.5529, "step": 3007 }, { "epoch": 1.3444469475107006, "grad_norm": 0.5606522593953582, "learning_rate": 1.1524670832311045e-05, "loss": 0.5713, "step": 3008 }, { "epoch": 1.3448974994368101, "grad_norm": 0.5420162471597086, "learning_rate": 1.1520001695612675e-05, "loss": 0.5404, "step": 3009 }, { "epoch": 1.3453480513629197, "grad_norm": 0.5022134468875697, "learning_rate": 1.1515332219679405e-05, "loss": 0.5812, "step": 3010 }, { "epoch": 1.345798603289029, "grad_norm": 0.5513428899130748, "learning_rate": 1.1510662405553379e-05, "loss": 0.542, "step": 3011 }, { "epoch": 1.3462491552151385, "grad_norm": 0.5503736141755354, "learning_rate": 1.1505992254276808e-05, "loss": 0.5582, "step": 3012 }, { "epoch": 1.346699707141248, "grad_norm": 0.5369567869285253, "learning_rate": 1.1501321766891977e-05, "loss": 0.5342, "step": 3013 }, { "epoch": 1.3471502590673574, "grad_norm": 0.5526794956315513, "learning_rate": 1.1496650944441248e-05, "loss": 0.5294, "step": 3014 }, { "epoch": 1.347600810993467, "grad_norm": 0.5298015526079309, "learning_rate": 1.149197978796706e-05, "loss": 0.5237, "step": 3015 }, { "epoch": 1.3480513629195765, "grad_norm": 0.5189618880956837, "learning_rate": 1.1487308298511922e-05, "loss": 0.5185, "step": 3016 }, { "epoch": 1.348501914845686, "grad_norm": 0.5292576790277431, "learning_rate": 1.148263647711842e-05, "loss": 0.5289, "step": 3017 }, { "epoch": 1.3489524667717954, "grad_norm": 0.5463722980130149, "learning_rate": 1.1477964324829216e-05, "loss": 0.5206, "step": 3018 }, { "epoch": 1.349403018697905, "grad_norm": 0.5157941964855953, "learning_rate": 1.1473291842687044e-05, "loss": 0.5209, "step": 3019 }, { "epoch": 1.3498535706240145, "grad_norm": 0.528360250211852, "learning_rate": 1.1468619031734709e-05, "loss": 0.5528, "step": 3020 }, { "epoch": 1.3503041225501238, "grad_norm": 0.5555738402409276, "learning_rate": 1.1463945893015092e-05, "loss": 0.5892, "step": 3021 }, { "epoch": 1.3507546744762333, "grad_norm": 0.5147530438411099, "learning_rate": 1.1459272427571148e-05, "loss": 0.5569, "step": 3022 }, { "epoch": 1.3512052264023429, "grad_norm": 0.5009868443523515, "learning_rate": 1.1454598636445905e-05, "loss": 0.5246, "step": 3023 }, { "epoch": 1.3516557783284524, "grad_norm": 0.5525948137856207, "learning_rate": 1.144992452068246e-05, "loss": 0.5369, "step": 3024 }, { "epoch": 1.352106330254562, "grad_norm": 0.5181912134144052, "learning_rate": 1.1445250081323986e-05, "loss": 0.5309, "step": 3025 }, { "epoch": 1.3525568821806713, "grad_norm": 0.5306096259032357, "learning_rate": 1.144057531941373e-05, "loss": 0.538, "step": 3026 }, { "epoch": 1.3530074341067808, "grad_norm": 0.49710171929277885, "learning_rate": 1.1435900235995004e-05, "loss": 0.5549, "step": 3027 }, { "epoch": 1.3534579860328904, "grad_norm": 0.5257750464654984, "learning_rate": 1.1431224832111197e-05, "loss": 0.5261, "step": 3028 }, { "epoch": 1.3539085379589997, "grad_norm": 0.5679709687928713, "learning_rate": 1.1426549108805769e-05, "loss": 0.5694, "step": 3029 }, { "epoch": 1.3543590898851092, "grad_norm": 0.5291201447128783, "learning_rate": 1.142187306712225e-05, "loss": 0.5636, "step": 3030 }, { "epoch": 1.3548096418112188, "grad_norm": 0.5332055812512405, "learning_rate": 1.1417196708104244e-05, "loss": 0.5159, "step": 3031 }, { "epoch": 1.3552601937373283, "grad_norm": 0.5682672082180679, "learning_rate": 1.141252003279542e-05, "loss": 0.5879, "step": 3032 }, { "epoch": 1.3557107456634376, "grad_norm": 0.5299774149057963, "learning_rate": 1.1407843042239524e-05, "loss": 0.5358, "step": 3033 }, { "epoch": 1.3561612975895472, "grad_norm": 0.5613357039405146, "learning_rate": 1.1403165737480368e-05, "loss": 0.5127, "step": 3034 }, { "epoch": 1.3566118495156567, "grad_norm": 0.5579005902212911, "learning_rate": 1.1398488119561836e-05, "loss": 0.5909, "step": 3035 }, { "epoch": 1.357062401441766, "grad_norm": 0.5265863333297789, "learning_rate": 1.1393810189527886e-05, "loss": 0.5698, "step": 3036 }, { "epoch": 1.3575129533678756, "grad_norm": 0.5919081406192709, "learning_rate": 1.1389131948422534e-05, "loss": 0.5506, "step": 3037 }, { "epoch": 1.3579635052939851, "grad_norm": 0.5372882760676473, "learning_rate": 1.1384453397289876e-05, "loss": 0.5378, "step": 3038 }, { "epoch": 1.3584140572200947, "grad_norm": 0.5453856643523407, "learning_rate": 1.137977453717408e-05, "loss": 0.5281, "step": 3039 }, { "epoch": 1.3588646091462042, "grad_norm": 0.5390358776350703, "learning_rate": 1.1375095369119364e-05, "loss": 0.5375, "step": 3040 }, { "epoch": 1.3593151610723135, "grad_norm": 0.5326518678826324, "learning_rate": 1.1370415894170037e-05, "loss": 0.5407, "step": 3041 }, { "epoch": 1.359765712998423, "grad_norm": 0.5332976861288188, "learning_rate": 1.1365736113370463e-05, "loss": 0.5634, "step": 3042 }, { "epoch": 1.3602162649245326, "grad_norm": 0.5472162188487171, "learning_rate": 1.1361056027765081e-05, "loss": 0.5584, "step": 3043 }, { "epoch": 1.360666816850642, "grad_norm": 0.5511265087614469, "learning_rate": 1.1356375638398392e-05, "loss": 0.5581, "step": 3044 }, { "epoch": 1.3611173687767515, "grad_norm": 0.5570812696121242, "learning_rate": 1.135169494631497e-05, "loss": 0.5652, "step": 3045 }, { "epoch": 1.361567920702861, "grad_norm": 0.5461489481902119, "learning_rate": 1.1347013952559457e-05, "loss": 0.5854, "step": 3046 }, { "epoch": 1.3620184726289706, "grad_norm": 0.5587696825834436, "learning_rate": 1.1342332658176556e-05, "loss": 0.5914, "step": 3047 }, { "epoch": 1.36246902455508, "grad_norm": 0.5254156750103294, "learning_rate": 1.1337651064211044e-05, "loss": 0.5502, "step": 3048 }, { "epoch": 1.3629195764811894, "grad_norm": 0.5495638274643088, "learning_rate": 1.133296917170776e-05, "loss": 0.5417, "step": 3049 }, { "epoch": 1.363370128407299, "grad_norm": 0.5263279821035115, "learning_rate": 1.1328286981711614e-05, "loss": 0.5549, "step": 3050 }, { "epoch": 1.3638206803334083, "grad_norm": 0.5359016443155626, "learning_rate": 1.132360449526758e-05, "loss": 0.5226, "step": 3051 }, { "epoch": 1.3642712322595179, "grad_norm": 0.5414175446406784, "learning_rate": 1.1318921713420691e-05, "loss": 0.5089, "step": 3052 }, { "epoch": 1.3647217841856274, "grad_norm": 0.5894106440199794, "learning_rate": 1.1314238637216062e-05, "loss": 0.5682, "step": 3053 }, { "epoch": 1.365172336111737, "grad_norm": 0.5345186626145446, "learning_rate": 1.1309555267698862e-05, "loss": 0.5447, "step": 3054 }, { "epoch": 1.3656228880378465, "grad_norm": 0.5467460120240238, "learning_rate": 1.1304871605914326e-05, "loss": 0.5444, "step": 3055 }, { "epoch": 1.3660734399639558, "grad_norm": 0.5359585126460032, "learning_rate": 1.130018765290776e-05, "loss": 0.5282, "step": 3056 }, { "epoch": 1.3665239918900653, "grad_norm": 0.5360876141336998, "learning_rate": 1.1295503409724526e-05, "loss": 0.5293, "step": 3057 }, { "epoch": 1.366974543816175, "grad_norm": 0.5498359793166326, "learning_rate": 1.1290818877410064e-05, "loss": 0.5452, "step": 3058 }, { "epoch": 1.3674250957422842, "grad_norm": 0.5231934242721565, "learning_rate": 1.1286134057009862e-05, "loss": 0.543, "step": 3059 }, { "epoch": 1.3678756476683938, "grad_norm": 0.51439355431773, "learning_rate": 1.1281448949569487e-05, "loss": 0.5507, "step": 3060 }, { "epoch": 1.3683261995945033, "grad_norm": 0.5529966013803075, "learning_rate": 1.1276763556134566e-05, "loss": 0.5406, "step": 3061 }, { "epoch": 1.3687767515206128, "grad_norm": 0.5129687306241666, "learning_rate": 1.1272077877750782e-05, "loss": 0.5766, "step": 3062 }, { "epoch": 1.3692273034467222, "grad_norm": 0.5590999965568503, "learning_rate": 1.126739191546389e-05, "loss": 0.548, "step": 3063 }, { "epoch": 1.3696778553728317, "grad_norm": 0.5856166549616894, "learning_rate": 1.1262705670319706e-05, "loss": 0.5563, "step": 3064 }, { "epoch": 1.3701284072989413, "grad_norm": 0.523456489022085, "learning_rate": 1.1258019143364112e-05, "loss": 0.5394, "step": 3065 }, { "epoch": 1.3705789592250506, "grad_norm": 0.5333570507042964, "learning_rate": 1.1253332335643043e-05, "loss": 0.5208, "step": 3066 }, { "epoch": 1.3710295111511601, "grad_norm": 0.5449397413770075, "learning_rate": 1.124864524820251e-05, "loss": 0.5283, "step": 3067 }, { "epoch": 1.3714800630772697, "grad_norm": 0.5154139389002123, "learning_rate": 1.1243957882088577e-05, "loss": 0.499, "step": 3068 }, { "epoch": 1.3719306150033792, "grad_norm": 0.5333904989695668, "learning_rate": 1.1239270238347372e-05, "loss": 0.5535, "step": 3069 }, { "epoch": 1.3723811669294887, "grad_norm": 0.5550582280483329, "learning_rate": 1.123458231802509e-05, "loss": 0.5308, "step": 3070 }, { "epoch": 1.372831718855598, "grad_norm": 0.5299899894280401, "learning_rate": 1.1229894122167981e-05, "loss": 0.5353, "step": 3071 }, { "epoch": 1.3732822707817076, "grad_norm": 0.534227169534007, "learning_rate": 1.1225205651822359e-05, "loss": 0.542, "step": 3072 }, { "epoch": 1.3737328227078172, "grad_norm": 0.5424650385376613, "learning_rate": 1.1220516908034602e-05, "loss": 0.5725, "step": 3073 }, { "epoch": 1.3741833746339265, "grad_norm": 0.525382311242503, "learning_rate": 1.1215827891851147e-05, "loss": 0.5546, "step": 3074 }, { "epoch": 1.374633926560036, "grad_norm": 0.5650567875229094, "learning_rate": 1.121113860431849e-05, "loss": 0.5359, "step": 3075 }, { "epoch": 1.3750844784861456, "grad_norm": 0.5308815605080655, "learning_rate": 1.1206449046483188e-05, "loss": 0.5259, "step": 3076 }, { "epoch": 1.375535030412255, "grad_norm": 0.5377715385652944, "learning_rate": 1.1201759219391858e-05, "loss": 0.5756, "step": 3077 }, { "epoch": 1.3759855823383644, "grad_norm": 0.5387742518573936, "learning_rate": 1.1197069124091182e-05, "loss": 0.5469, "step": 3078 }, { "epoch": 1.376436134264474, "grad_norm": 0.5376721201598182, "learning_rate": 1.1192378761627897e-05, "loss": 0.5905, "step": 3079 }, { "epoch": 1.3768866861905835, "grad_norm": 0.5784496419169027, "learning_rate": 1.1187688133048801e-05, "loss": 0.5416, "step": 3080 }, { "epoch": 1.3773372381166928, "grad_norm": 0.5617183550008432, "learning_rate": 1.1182997239400752e-05, "loss": 0.5339, "step": 3081 }, { "epoch": 1.3777877900428024, "grad_norm": 0.5426261260195673, "learning_rate": 1.1178306081730666e-05, "loss": 0.5737, "step": 3082 }, { "epoch": 1.378238341968912, "grad_norm": 0.5695271425259898, "learning_rate": 1.1173614661085516e-05, "loss": 0.5632, "step": 3083 }, { "epoch": 1.3786888938950215, "grad_norm": 0.5228812137451225, "learning_rate": 1.116892297851234e-05, "loss": 0.5599, "step": 3084 }, { "epoch": 1.379139445821131, "grad_norm": 0.5565179010686773, "learning_rate": 1.1164231035058228e-05, "loss": 0.5504, "step": 3085 }, { "epoch": 1.3795899977472403, "grad_norm": 0.5622565116933912, "learning_rate": 1.1159538831770333e-05, "loss": 0.5829, "step": 3086 }, { "epoch": 1.3800405496733499, "grad_norm": 0.554014666759377, "learning_rate": 1.1154846369695864e-05, "loss": 0.5235, "step": 3087 }, { "epoch": 1.3804911015994592, "grad_norm": 0.529599321009182, "learning_rate": 1.1150153649882083e-05, "loss": 0.5414, "step": 3088 }, { "epoch": 1.3809416535255687, "grad_norm": 0.5984565753965113, "learning_rate": 1.1145460673376317e-05, "loss": 0.5416, "step": 3089 }, { "epoch": 1.3813922054516783, "grad_norm": 0.5961167084391201, "learning_rate": 1.1140767441225945e-05, "loss": 0.5535, "step": 3090 }, { "epoch": 1.3818427573777878, "grad_norm": 0.5208364972898278, "learning_rate": 1.113607395447841e-05, "loss": 0.5261, "step": 3091 }, { "epoch": 1.3822933093038974, "grad_norm": 0.5633599105339133, "learning_rate": 1.1131380214181205e-05, "loss": 0.5617, "step": 3092 }, { "epoch": 1.3827438612300067, "grad_norm": 0.5211977549718251, "learning_rate": 1.112668622138188e-05, "loss": 0.5279, "step": 3093 }, { "epoch": 1.3831944131561162, "grad_norm": 0.5219503336928906, "learning_rate": 1.1121991977128046e-05, "loss": 0.5402, "step": 3094 }, { "epoch": 1.3836449650822258, "grad_norm": 0.5474099616206514, "learning_rate": 1.1117297482467366e-05, "loss": 0.5618, "step": 3095 }, { "epoch": 1.384095517008335, "grad_norm": 0.5552630149512693, "learning_rate": 1.1112602738447558e-05, "loss": 0.5721, "step": 3096 }, { "epoch": 1.3845460689344447, "grad_norm": 0.5470839792151483, "learning_rate": 1.1107907746116402e-05, "loss": 0.54, "step": 3097 }, { "epoch": 1.3849966208605542, "grad_norm": 0.5232660287604355, "learning_rate": 1.1103212506521728e-05, "loss": 0.5262, "step": 3098 }, { "epoch": 1.3854471727866637, "grad_norm": 0.5446254483469453, "learning_rate": 1.1098517020711421e-05, "loss": 0.4966, "step": 3099 }, { "epoch": 1.3858977247127733, "grad_norm": 0.5511636439827349, "learning_rate": 1.1093821289733421e-05, "loss": 0.5223, "step": 3100 }, { "epoch": 1.3863482766388826, "grad_norm": 0.5584904880235143, "learning_rate": 1.1089125314635727e-05, "loss": 0.5526, "step": 3101 }, { "epoch": 1.3867988285649921, "grad_norm": 0.569418891243029, "learning_rate": 1.108442909646639e-05, "loss": 0.5634, "step": 3102 }, { "epoch": 1.3872493804911015, "grad_norm": 0.552220380407952, "learning_rate": 1.107973263627351e-05, "loss": 0.5594, "step": 3103 }, { "epoch": 1.387699932417211, "grad_norm": 0.5496796404185176, "learning_rate": 1.1075035935105252e-05, "loss": 0.5357, "step": 3104 }, { "epoch": 1.3881504843433206, "grad_norm": 0.539183571901781, "learning_rate": 1.1070338994009823e-05, "loss": 0.5259, "step": 3105 }, { "epoch": 1.38860103626943, "grad_norm": 0.5383447977369761, "learning_rate": 1.1065641814035495e-05, "loss": 0.5102, "step": 3106 }, { "epoch": 1.3890515881955396, "grad_norm": 0.5833234493273448, "learning_rate": 1.1060944396230583e-05, "loss": 0.5428, "step": 3107 }, { "epoch": 1.389502140121649, "grad_norm": 0.5464842956111294, "learning_rate": 1.105624674164346e-05, "loss": 0.5459, "step": 3108 }, { "epoch": 1.3899526920477585, "grad_norm": 0.5468626599417615, "learning_rate": 1.1051548851322553e-05, "loss": 0.5224, "step": 3109 }, { "epoch": 1.390403243973868, "grad_norm": 0.5573977875092487, "learning_rate": 1.1046850726316338e-05, "loss": 0.5828, "step": 3110 }, { "epoch": 1.3908537958999774, "grad_norm": 0.5518449604394909, "learning_rate": 1.104215236767335e-05, "loss": 0.5468, "step": 3111 }, { "epoch": 1.391304347826087, "grad_norm": 0.605092227951293, "learning_rate": 1.1037453776442164e-05, "loss": 0.5879, "step": 3112 }, { "epoch": 1.3917548997521965, "grad_norm": 0.5217774214621915, "learning_rate": 1.103275495367142e-05, "loss": 0.5211, "step": 3113 }, { "epoch": 1.392205451678306, "grad_norm": 0.5065816598830478, "learning_rate": 1.1028055900409805e-05, "loss": 0.5374, "step": 3114 }, { "epoch": 1.3926560036044153, "grad_norm": 0.5389892091868566, "learning_rate": 1.1023356617706051e-05, "loss": 0.5567, "step": 3115 }, { "epoch": 1.3931065555305249, "grad_norm": 0.532925259807497, "learning_rate": 1.1018657106608952e-05, "loss": 0.5247, "step": 3116 }, { "epoch": 1.3935571074566344, "grad_norm": 0.5211130634936322, "learning_rate": 1.1013957368167343e-05, "loss": 0.5318, "step": 3117 }, { "epoch": 1.3940076593827437, "grad_norm": 0.5219143024962033, "learning_rate": 1.100925740343012e-05, "loss": 0.5289, "step": 3118 }, { "epoch": 1.3944582113088533, "grad_norm": 0.5367219196335788, "learning_rate": 1.1004557213446221e-05, "loss": 0.5297, "step": 3119 }, { "epoch": 1.3949087632349628, "grad_norm": 0.5508605511967831, "learning_rate": 1.0999856799264635e-05, "loss": 0.5744, "step": 3120 }, { "epoch": 1.3953593151610724, "grad_norm": 0.5526861874667716, "learning_rate": 1.0995156161934408e-05, "loss": 0.5397, "step": 3121 }, { "epoch": 1.395809867087182, "grad_norm": 0.5496150291763835, "learning_rate": 1.099045530250463e-05, "loss": 0.5533, "step": 3122 }, { "epoch": 1.3962604190132912, "grad_norm": 0.5388053582724116, "learning_rate": 1.0985754222024437e-05, "loss": 0.5932, "step": 3123 }, { "epoch": 1.3967109709394008, "grad_norm": 0.56177782598653, "learning_rate": 1.0981052921543023e-05, "loss": 0.5514, "step": 3124 }, { "epoch": 1.3971615228655103, "grad_norm": 0.5513044253672698, "learning_rate": 1.0976351402109627e-05, "loss": 0.5471, "step": 3125 }, { "epoch": 1.3976120747916196, "grad_norm": 0.5694095297102434, "learning_rate": 1.0971649664773537e-05, "loss": 0.5501, "step": 3126 }, { "epoch": 1.3980626267177292, "grad_norm": 0.5586948919013983, "learning_rate": 1.0966947710584086e-05, "loss": 0.5397, "step": 3127 }, { "epoch": 1.3985131786438387, "grad_norm": 0.5578807685300102, "learning_rate": 1.0962245540590663e-05, "loss": 0.5376, "step": 3128 }, { "epoch": 1.3989637305699483, "grad_norm": 0.5917199729649338, "learning_rate": 1.0957543155842703e-05, "loss": 0.544, "step": 3129 }, { "epoch": 1.3994142824960576, "grad_norm": 0.5472743305477089, "learning_rate": 1.0952840557389681e-05, "loss": 0.5401, "step": 3130 }, { "epoch": 1.3998648344221671, "grad_norm": 0.5433423438683486, "learning_rate": 1.0948137746281128e-05, "loss": 0.5128, "step": 3131 }, { "epoch": 1.4003153863482767, "grad_norm": 0.5882112566878308, "learning_rate": 1.0943434723566624e-05, "loss": 0.5318, "step": 3132 }, { "epoch": 1.400765938274386, "grad_norm": 0.5304746363904278, "learning_rate": 1.0938731490295788e-05, "loss": 0.5477, "step": 3133 }, { "epoch": 1.4012164902004955, "grad_norm": 0.5361402966827719, "learning_rate": 1.0934028047518295e-05, "loss": 0.5242, "step": 3134 }, { "epoch": 1.401667042126605, "grad_norm": 0.5366499223996493, "learning_rate": 1.0929324396283856e-05, "loss": 0.5359, "step": 3135 }, { "epoch": 1.4021175940527146, "grad_norm": 0.5337175383468888, "learning_rate": 1.0924620537642237e-05, "loss": 0.5314, "step": 3136 }, { "epoch": 1.4025681459788242, "grad_norm": 0.5452236543349297, "learning_rate": 1.091991647264325e-05, "loss": 0.5441, "step": 3137 }, { "epoch": 1.4030186979049335, "grad_norm": 0.5207978424087838, "learning_rate": 1.0915212202336748e-05, "loss": 0.5124, "step": 3138 }, { "epoch": 1.403469249831043, "grad_norm": 0.5534000601153357, "learning_rate": 1.0910507727772637e-05, "loss": 0.5308, "step": 3139 }, { "epoch": 1.4039198017571526, "grad_norm": 0.5582159072440535, "learning_rate": 1.0905803050000863e-05, "loss": 0.5411, "step": 3140 }, { "epoch": 1.404370353683262, "grad_norm": 0.5429695559764163, "learning_rate": 1.0901098170071416e-05, "loss": 0.5607, "step": 3141 }, { "epoch": 1.4048209056093715, "grad_norm": 0.5462682961602555, "learning_rate": 1.0896393089034336e-05, "loss": 0.5802, "step": 3142 }, { "epoch": 1.405271457535481, "grad_norm": 0.520178947597596, "learning_rate": 1.0891687807939707e-05, "loss": 0.5476, "step": 3143 }, { "epoch": 1.4057220094615905, "grad_norm": 0.5731855242928562, "learning_rate": 1.0886982327837655e-05, "loss": 0.5451, "step": 3144 }, { "epoch": 1.4061725613876999, "grad_norm": 0.5089029471776547, "learning_rate": 1.0882276649778352e-05, "loss": 0.5275, "step": 3145 }, { "epoch": 1.4066231133138094, "grad_norm": 0.5338257370620586, "learning_rate": 1.0877570774812015e-05, "loss": 0.5464, "step": 3146 }, { "epoch": 1.407073665239919, "grad_norm": 0.5556075220036932, "learning_rate": 1.0872864703988903e-05, "loss": 0.5683, "step": 3147 }, { "epoch": 1.4075242171660283, "grad_norm": 0.5532864845706331, "learning_rate": 1.086815843835932e-05, "loss": 0.5528, "step": 3148 }, { "epoch": 1.4079747690921378, "grad_norm": 0.5225634104241677, "learning_rate": 1.0863451978973614e-05, "loss": 0.5712, "step": 3149 }, { "epoch": 1.4084253210182474, "grad_norm": 0.543445090371272, "learning_rate": 1.0858745326882172e-05, "loss": 0.5431, "step": 3150 }, { "epoch": 1.408875872944357, "grad_norm": 0.5485531519152792, "learning_rate": 1.0854038483135432e-05, "loss": 0.5632, "step": 3151 }, { "epoch": 1.4093264248704664, "grad_norm": 0.5243463826122251, "learning_rate": 1.0849331448783869e-05, "loss": 0.559, "step": 3152 }, { "epoch": 1.4097769767965758, "grad_norm": 0.5263953136528211, "learning_rate": 1.0844624224878e-05, "loss": 0.5598, "step": 3153 }, { "epoch": 1.4102275287226853, "grad_norm": 0.5422695365544143, "learning_rate": 1.0839916812468387e-05, "loss": 0.5406, "step": 3154 }, { "epoch": 1.4106780806487949, "grad_norm": 0.5478661882873174, "learning_rate": 1.0835209212605633e-05, "loss": 0.5491, "step": 3155 }, { "epoch": 1.4111286325749042, "grad_norm": 0.5071395474306585, "learning_rate": 1.0830501426340383e-05, "loss": 0.535, "step": 3156 }, { "epoch": 1.4115791845010137, "grad_norm": 0.5468356173406818, "learning_rate": 1.0825793454723325e-05, "loss": 0.5461, "step": 3157 }, { "epoch": 1.4120297364271233, "grad_norm": 0.5395312360742613, "learning_rate": 1.0821085298805185e-05, "loss": 0.5347, "step": 3158 }, { "epoch": 1.4124802883532328, "grad_norm": 0.5258105672893403, "learning_rate": 1.0816376959636734e-05, "loss": 0.5313, "step": 3159 }, { "epoch": 1.4129308402793421, "grad_norm": 0.5296566952758149, "learning_rate": 1.0811668438268778e-05, "loss": 0.5669, "step": 3160 }, { "epoch": 1.4133813922054517, "grad_norm": 0.5576736028490358, "learning_rate": 1.0806959735752174e-05, "loss": 0.5411, "step": 3161 }, { "epoch": 1.4138319441315612, "grad_norm": 0.5337551234347468, "learning_rate": 1.0802250853137808e-05, "loss": 0.5387, "step": 3162 }, { "epoch": 1.4142824960576705, "grad_norm": 0.5215523300751781, "learning_rate": 1.0797541791476614e-05, "loss": 0.5156, "step": 3163 }, { "epoch": 1.41473304798378, "grad_norm": 0.54759080100421, "learning_rate": 1.0792832551819558e-05, "loss": 0.5321, "step": 3164 }, { "epoch": 1.4151835999098896, "grad_norm": 0.551850810098317, "learning_rate": 1.078812313521766e-05, "loss": 0.5428, "step": 3165 }, { "epoch": 1.4156341518359992, "grad_norm": 0.5548557901825617, "learning_rate": 1.0783413542721963e-05, "loss": 0.506, "step": 3166 }, { "epoch": 1.4160847037621087, "grad_norm": 0.5223210211607124, "learning_rate": 1.0778703775383559e-05, "loss": 0.5579, "step": 3167 }, { "epoch": 1.416535255688218, "grad_norm": 0.5431419262134098, "learning_rate": 1.0773993834253578e-05, "loss": 0.5286, "step": 3168 }, { "epoch": 1.4169858076143276, "grad_norm": 0.5028295541335944, "learning_rate": 1.0769283720383186e-05, "loss": 0.5181, "step": 3169 }, { "epoch": 1.4174363595404371, "grad_norm": 0.5571024515694986, "learning_rate": 1.076457343482359e-05, "loss": 0.6055, "step": 3170 }, { "epoch": 1.4178869114665464, "grad_norm": 0.5427706072083441, "learning_rate": 1.0759862978626032e-05, "loss": 0.5271, "step": 3171 }, { "epoch": 1.418337463392656, "grad_norm": 0.5571972934767185, "learning_rate": 1.0755152352841798e-05, "loss": 0.563, "step": 3172 }, { "epoch": 1.4187880153187655, "grad_norm": 0.5608816749778625, "learning_rate": 1.0750441558522206e-05, "loss": 0.5318, "step": 3173 }, { "epoch": 1.419238567244875, "grad_norm": 0.5488300031394374, "learning_rate": 1.0745730596718612e-05, "loss": 0.5475, "step": 3174 }, { "epoch": 1.4196891191709844, "grad_norm": 0.5315450562389028, "learning_rate": 1.0741019468482417e-05, "loss": 0.5438, "step": 3175 }, { "epoch": 1.420139671097094, "grad_norm": 0.544471939960507, "learning_rate": 1.0736308174865049e-05, "loss": 0.5164, "step": 3176 }, { "epoch": 1.4205902230232035, "grad_norm": 0.5589730206385497, "learning_rate": 1.0731596716917978e-05, "loss": 0.553, "step": 3177 }, { "epoch": 1.4210407749493128, "grad_norm": 0.508865770745587, "learning_rate": 1.0726885095692712e-05, "loss": 0.5057, "step": 3178 }, { "epoch": 1.4214913268754223, "grad_norm": 0.5441717960425515, "learning_rate": 1.072217331224079e-05, "loss": 0.5453, "step": 3179 }, { "epoch": 1.4219418788015319, "grad_norm": 0.5378617341691994, "learning_rate": 1.0717461367613794e-05, "loss": 0.5558, "step": 3180 }, { "epoch": 1.4223924307276414, "grad_norm": 0.5404494894498594, "learning_rate": 1.0712749262863335e-05, "loss": 0.5038, "step": 3181 }, { "epoch": 1.422842982653751, "grad_norm": 0.5397139155533853, "learning_rate": 1.0708036999041072e-05, "loss": 0.5721, "step": 3182 }, { "epoch": 1.4232935345798603, "grad_norm": 0.5335185631980471, "learning_rate": 1.070332457719868e-05, "loss": 0.5572, "step": 3183 }, { "epoch": 1.4237440865059698, "grad_norm": 0.5337306370947902, "learning_rate": 1.0698611998387885e-05, "loss": 0.5324, "step": 3184 }, { "epoch": 1.4241946384320792, "grad_norm": 0.5411763669743642, "learning_rate": 1.0693899263660442e-05, "loss": 0.5434, "step": 3185 }, { "epoch": 1.4246451903581887, "grad_norm": 0.5474007167990518, "learning_rate": 1.0689186374068143e-05, "loss": 0.5477, "step": 3186 }, { "epoch": 1.4250957422842983, "grad_norm": 0.5429084548904292, "learning_rate": 1.0684473330662815e-05, "loss": 0.5299, "step": 3187 }, { "epoch": 1.4255462942104078, "grad_norm": 0.573485542761778, "learning_rate": 1.0679760134496316e-05, "loss": 0.556, "step": 3188 }, { "epoch": 1.4259968461365173, "grad_norm": 0.5999555950435546, "learning_rate": 1.0675046786620538e-05, "loss": 0.5574, "step": 3189 }, { "epoch": 1.4264473980626267, "grad_norm": 0.5247192399679141, "learning_rate": 1.0670333288087414e-05, "loss": 0.5058, "step": 3190 }, { "epoch": 1.4268979499887362, "grad_norm": 0.5450865761384506, "learning_rate": 1.0665619639948901e-05, "loss": 0.5428, "step": 3191 }, { "epoch": 1.4273485019148457, "grad_norm": 0.5743912122356701, "learning_rate": 1.0660905843256995e-05, "loss": 0.5587, "step": 3192 }, { "epoch": 1.427799053840955, "grad_norm": 0.5351855562830212, "learning_rate": 1.0656191899063727e-05, "loss": 0.5594, "step": 3193 }, { "epoch": 1.4282496057670646, "grad_norm": 0.5529006222790535, "learning_rate": 1.0651477808421152e-05, "loss": 0.5329, "step": 3194 }, { "epoch": 1.4287001576931742, "grad_norm": 0.5538979667154692, "learning_rate": 1.064676357238137e-05, "loss": 0.5749, "step": 3195 }, { "epoch": 1.4291507096192837, "grad_norm": 0.5748511436762952, "learning_rate": 1.0642049191996502e-05, "loss": 0.5412, "step": 3196 }, { "epoch": 1.4296012615453932, "grad_norm": 0.5337032365778173, "learning_rate": 1.0637334668318708e-05, "loss": 0.5557, "step": 3197 }, { "epoch": 1.4300518134715026, "grad_norm": 0.5468245498292831, "learning_rate": 1.0632620002400178e-05, "loss": 0.5517, "step": 3198 }, { "epoch": 1.430502365397612, "grad_norm": 0.6014777652022035, "learning_rate": 1.0627905195293135e-05, "loss": 0.5398, "step": 3199 }, { "epoch": 1.4309529173237214, "grad_norm": 0.553251089365043, "learning_rate": 1.0623190248049832e-05, "loss": 0.5526, "step": 3200 }, { "epoch": 1.431403469249831, "grad_norm": 0.5234360513023786, "learning_rate": 1.0618475161722554e-05, "loss": 0.5382, "step": 3201 }, { "epoch": 1.4318540211759405, "grad_norm": 0.5776426186936429, "learning_rate": 1.0613759937363617e-05, "loss": 0.5392, "step": 3202 }, { "epoch": 1.43230457310205, "grad_norm": 0.5467958219315262, "learning_rate": 1.0609044576025364e-05, "loss": 0.6059, "step": 3203 }, { "epoch": 1.4327551250281596, "grad_norm": 0.5262912260720514, "learning_rate": 1.0604329078760178e-05, "loss": 0.5325, "step": 3204 }, { "epoch": 1.433205676954269, "grad_norm": 0.5455159210904394, "learning_rate": 1.0599613446620463e-05, "loss": 0.5312, "step": 3205 }, { "epoch": 1.4336562288803785, "grad_norm": 0.5599515942409516, "learning_rate": 1.0594897680658657e-05, "loss": 0.5257, "step": 3206 }, { "epoch": 1.434106780806488, "grad_norm": 0.5571184168333919, "learning_rate": 1.0590181781927229e-05, "loss": 0.5595, "step": 3207 }, { "epoch": 1.4345573327325973, "grad_norm": 0.544237646598947, "learning_rate": 1.0585465751478674e-05, "loss": 0.4937, "step": 3208 }, { "epoch": 1.4350078846587069, "grad_norm": 0.5932067230107071, "learning_rate": 1.0580749590365519e-05, "loss": 0.5548, "step": 3209 }, { "epoch": 1.4354584365848164, "grad_norm": 0.5464275074099638, "learning_rate": 1.0576033299640323e-05, "loss": 0.5629, "step": 3210 }, { "epoch": 1.435908988510926, "grad_norm": 0.5375890618287631, "learning_rate": 1.0571316880355664e-05, "loss": 0.5127, "step": 3211 }, { "epoch": 1.4363595404370353, "grad_norm": 0.5644381523430149, "learning_rate": 1.0566600333564163e-05, "loss": 0.5512, "step": 3212 }, { "epoch": 1.4368100923631448, "grad_norm": 0.5451081803436602, "learning_rate": 1.0561883660318456e-05, "loss": 0.5149, "step": 3213 }, { "epoch": 1.4372606442892544, "grad_norm": 0.5748431277530672, "learning_rate": 1.0557166861671213e-05, "loss": 0.5787, "step": 3214 }, { "epoch": 1.4377111962153637, "grad_norm": 0.5713876028671953, "learning_rate": 1.0552449938675136e-05, "loss": 0.521, "step": 3215 }, { "epoch": 1.4381617481414732, "grad_norm": 0.5395794068282266, "learning_rate": 1.0547732892382949e-05, "loss": 0.5298, "step": 3216 }, { "epoch": 1.4386123000675828, "grad_norm": 0.5334114560123814, "learning_rate": 1.0543015723847402e-05, "loss": 0.5417, "step": 3217 }, { "epoch": 1.4390628519936923, "grad_norm": 0.5591648452591061, "learning_rate": 1.0538298434121284e-05, "loss": 0.5308, "step": 3218 }, { "epoch": 1.4395134039198019, "grad_norm": 0.5238134690986201, "learning_rate": 1.0533581024257394e-05, "loss": 0.5404, "step": 3219 }, { "epoch": 1.4399639558459112, "grad_norm": 0.5406431182970631, "learning_rate": 1.0528863495308568e-05, "loss": 0.5311, "step": 3220 }, { "epoch": 1.4404145077720207, "grad_norm": 0.5527477357636743, "learning_rate": 1.0524145848327667e-05, "loss": 0.5154, "step": 3221 }, { "epoch": 1.4408650596981303, "grad_norm": 0.5677093731258293, "learning_rate": 1.0519428084367583e-05, "loss": 0.5551, "step": 3222 }, { "epoch": 1.4413156116242396, "grad_norm": 0.5378260565242562, "learning_rate": 1.0514710204481223e-05, "loss": 0.5125, "step": 3223 }, { "epoch": 1.4417661635503491, "grad_norm": 0.56224793463416, "learning_rate": 1.0509992209721528e-05, "loss": 0.5127, "step": 3224 }, { "epoch": 1.4422167154764587, "grad_norm": 0.524021369231612, "learning_rate": 1.0505274101141466e-05, "loss": 0.5297, "step": 3225 }, { "epoch": 1.4426672674025682, "grad_norm": 0.5706255789959995, "learning_rate": 1.0500555879794028e-05, "loss": 0.5603, "step": 3226 }, { "epoch": 1.4431178193286776, "grad_norm": 0.5514508466758242, "learning_rate": 1.0495837546732224e-05, "loss": 0.5649, "step": 3227 }, { "epoch": 1.443568371254787, "grad_norm": 0.5362502353470717, "learning_rate": 1.0491119103009097e-05, "loss": 0.5683, "step": 3228 }, { "epoch": 1.4440189231808966, "grad_norm": 0.5540661796945128, "learning_rate": 1.0486400549677713e-05, "loss": 0.5338, "step": 3229 }, { "epoch": 1.444469475107006, "grad_norm": 0.5593483668741404, "learning_rate": 1.0481681887791164e-05, "loss": 0.545, "step": 3230 }, { "epoch": 1.4449200270331155, "grad_norm": 0.571333238231604, "learning_rate": 1.0476963118402558e-05, "loss": 0.5727, "step": 3231 }, { "epoch": 1.445370578959225, "grad_norm": 0.5596425978180333, "learning_rate": 1.0472244242565035e-05, "loss": 0.5316, "step": 3232 }, { "epoch": 1.4458211308853346, "grad_norm": 0.5974461943353191, "learning_rate": 1.046752526133176e-05, "loss": 0.5737, "step": 3233 }, { "epoch": 1.4462716828114441, "grad_norm": 0.5514614779938711, "learning_rate": 1.046280617575591e-05, "loss": 0.5437, "step": 3234 }, { "epoch": 1.4467222347375535, "grad_norm": 0.5489953326988541, "learning_rate": 1.0458086986890703e-05, "loss": 0.5285, "step": 3235 }, { "epoch": 1.447172786663663, "grad_norm": 0.5604821852860827, "learning_rate": 1.0453367695789365e-05, "loss": 0.5431, "step": 3236 }, { "epoch": 1.4476233385897725, "grad_norm": 0.5421628930467164, "learning_rate": 1.044864830350515e-05, "loss": 0.5346, "step": 3237 }, { "epoch": 1.4480738905158819, "grad_norm": 0.5845752044672471, "learning_rate": 1.0443928811091337e-05, "loss": 0.5397, "step": 3238 }, { "epoch": 1.4485244424419914, "grad_norm": 0.5263227782139785, "learning_rate": 1.0439209219601224e-05, "loss": 0.4942, "step": 3239 }, { "epoch": 1.448974994368101, "grad_norm": 0.5632180481671187, "learning_rate": 1.0434489530088134e-05, "loss": 0.5293, "step": 3240 }, { "epoch": 1.4494255462942105, "grad_norm": 0.5473669254534983, "learning_rate": 1.0429769743605406e-05, "loss": 0.5557, "step": 3241 }, { "epoch": 1.4498760982203198, "grad_norm": 0.5140880687441477, "learning_rate": 1.042504986120641e-05, "loss": 0.5666, "step": 3242 }, { "epoch": 1.4503266501464294, "grad_norm": 0.5255569754450206, "learning_rate": 1.042032988394453e-05, "loss": 0.5418, "step": 3243 }, { "epoch": 1.450777202072539, "grad_norm": 0.5400740346835345, "learning_rate": 1.0415609812873173e-05, "loss": 0.5368, "step": 3244 }, { "epoch": 1.4512277539986482, "grad_norm": 0.54776058351081, "learning_rate": 1.0410889649045766e-05, "loss": 0.5643, "step": 3245 }, { "epoch": 1.4516783059247578, "grad_norm": 0.5407894934964158, "learning_rate": 1.040616939351576e-05, "loss": 0.5341, "step": 3246 }, { "epoch": 1.4521288578508673, "grad_norm": 0.5398477798631732, "learning_rate": 1.0401449047336622e-05, "loss": 0.5199, "step": 3247 }, { "epoch": 1.4525794097769769, "grad_norm": 0.5705315514905202, "learning_rate": 1.0396728611561843e-05, "loss": 0.5225, "step": 3248 }, { "epoch": 1.4530299617030864, "grad_norm": 0.5483134693640976, "learning_rate": 1.0392008087244936e-05, "loss": 0.5547, "step": 3249 }, { "epoch": 1.4534805136291957, "grad_norm": 0.5281069239426585, "learning_rate": 1.0387287475439426e-05, "loss": 0.5391, "step": 3250 }, { "epoch": 1.4539310655553053, "grad_norm": 0.55448805702396, "learning_rate": 1.0382566777198863e-05, "loss": 0.5421, "step": 3251 }, { "epoch": 1.4543816174814148, "grad_norm": 0.581085931027513, "learning_rate": 1.0377845993576819e-05, "loss": 0.5411, "step": 3252 }, { "epoch": 1.4548321694075241, "grad_norm": 0.5127988565199618, "learning_rate": 1.0373125125626877e-05, "loss": 0.5403, "step": 3253 }, { "epoch": 1.4552827213336337, "grad_norm": 0.5226126965044833, "learning_rate": 1.0368404174402644e-05, "loss": 0.5288, "step": 3254 }, { "epoch": 1.4557332732597432, "grad_norm": 0.5575815659424733, "learning_rate": 1.0363683140957745e-05, "loss": 0.5566, "step": 3255 }, { "epoch": 1.4561838251858528, "grad_norm": 0.5409829773120454, "learning_rate": 1.0358962026345824e-05, "loss": 0.5363, "step": 3256 }, { "epoch": 1.456634377111962, "grad_norm": 0.5430268833061941, "learning_rate": 1.0354240831620542e-05, "loss": 0.5678, "step": 3257 }, { "epoch": 1.4570849290380716, "grad_norm": 0.5258184647513546, "learning_rate": 1.0349519557835574e-05, "loss": 0.543, "step": 3258 }, { "epoch": 1.4575354809641812, "grad_norm": 0.5166190632188518, "learning_rate": 1.0344798206044624e-05, "loss": 0.5481, "step": 3259 }, { "epoch": 1.4579860328902905, "grad_norm": 0.5452465806715522, "learning_rate": 1.0340076777301399e-05, "loss": 0.555, "step": 3260 }, { "epoch": 1.4584365848164, "grad_norm": 0.5367325134672669, "learning_rate": 1.0335355272659638e-05, "loss": 0.5555, "step": 3261 }, { "epoch": 1.4588871367425096, "grad_norm": 0.5142639567576175, "learning_rate": 1.0330633693173083e-05, "loss": 0.5806, "step": 3262 }, { "epoch": 1.4593376886686191, "grad_norm": 0.5177498264236214, "learning_rate": 1.0325912039895501e-05, "loss": 0.5336, "step": 3263 }, { "epoch": 1.4597882405947287, "grad_norm": 0.5546668450007692, "learning_rate": 1.0321190313880674e-05, "loss": 0.5698, "step": 3264 }, { "epoch": 1.460238792520838, "grad_norm": 0.5362846302900495, "learning_rate": 1.0316468516182396e-05, "loss": 0.5398, "step": 3265 }, { "epoch": 1.4606893444469475, "grad_norm": 0.5379047105662441, "learning_rate": 1.031174664785449e-05, "loss": 0.5531, "step": 3266 }, { "epoch": 1.4611398963730569, "grad_norm": 0.5242234920714969, "learning_rate": 1.0307024709950775e-05, "loss": 0.5419, "step": 3267 }, { "epoch": 1.4615904482991664, "grad_norm": 0.551608084716723, "learning_rate": 1.03023027035251e-05, "loss": 0.5674, "step": 3268 }, { "epoch": 1.462041000225276, "grad_norm": 0.5180422225176871, "learning_rate": 1.0297580629631324e-05, "loss": 0.5261, "step": 3269 }, { "epoch": 1.4624915521513855, "grad_norm": 0.5724643937947521, "learning_rate": 1.0292858489323327e-05, "loss": 0.5448, "step": 3270 }, { "epoch": 1.462942104077495, "grad_norm": 0.5519899642326985, "learning_rate": 1.0288136283654996e-05, "loss": 0.561, "step": 3271 }, { "epoch": 1.4633926560036044, "grad_norm": 0.5408508848673764, "learning_rate": 1.0283414013680233e-05, "loss": 0.5381, "step": 3272 }, { "epoch": 1.463843207929714, "grad_norm": 0.539818708993755, "learning_rate": 1.0278691680452958e-05, "loss": 0.5133, "step": 3273 }, { "epoch": 1.4642937598558234, "grad_norm": 0.5388713370397608, "learning_rate": 1.027396928502711e-05, "loss": 0.5332, "step": 3274 }, { "epoch": 1.4647443117819328, "grad_norm": 0.5402600519281123, "learning_rate": 1.026924682845663e-05, "loss": 0.5425, "step": 3275 }, { "epoch": 1.4651948637080423, "grad_norm": 0.5320077245715737, "learning_rate": 1.0264524311795478e-05, "loss": 0.5374, "step": 3276 }, { "epoch": 1.4656454156341518, "grad_norm": 0.5611321495118435, "learning_rate": 1.0259801736097634e-05, "loss": 0.519, "step": 3277 }, { "epoch": 1.4660959675602614, "grad_norm": 0.5343264024898595, "learning_rate": 1.0255079102417083e-05, "loss": 0.5157, "step": 3278 }, { "epoch": 1.466546519486371, "grad_norm": 0.5228626596011162, "learning_rate": 1.0250356411807821e-05, "loss": 0.5041, "step": 3279 }, { "epoch": 1.4669970714124803, "grad_norm": 0.5306447905086282, "learning_rate": 1.0245633665323864e-05, "loss": 0.5415, "step": 3280 }, { "epoch": 1.4674476233385898, "grad_norm": 0.5517069823453434, "learning_rate": 1.0240910864019237e-05, "loss": 0.5565, "step": 3281 }, { "epoch": 1.4678981752646991, "grad_norm": 0.5191465565347976, "learning_rate": 1.023618800894798e-05, "loss": 0.524, "step": 3282 }, { "epoch": 1.4683487271908087, "grad_norm": 0.5547392035396004, "learning_rate": 1.023146510116414e-05, "loss": 0.5695, "step": 3283 }, { "epoch": 1.4687992791169182, "grad_norm": 0.5361878725441811, "learning_rate": 1.022674214172178e-05, "loss": 0.5467, "step": 3284 }, { "epoch": 1.4692498310430278, "grad_norm": 0.5357870047219502, "learning_rate": 1.022201913167497e-05, "loss": 0.5691, "step": 3285 }, { "epoch": 1.4697003829691373, "grad_norm": 0.5395772297234219, "learning_rate": 1.0217296072077798e-05, "loss": 0.5285, "step": 3286 }, { "epoch": 1.4701509348952466, "grad_norm": 0.5371562021262571, "learning_rate": 1.0212572963984358e-05, "loss": 0.5597, "step": 3287 }, { "epoch": 1.4706014868213562, "grad_norm": 0.5461934411299995, "learning_rate": 1.0207849808448753e-05, "loss": 0.5697, "step": 3288 }, { "epoch": 1.4710520387474657, "grad_norm": 0.5492235649980818, "learning_rate": 1.0203126606525104e-05, "loss": 0.5392, "step": 3289 }, { "epoch": 1.471502590673575, "grad_norm": 0.5332231453774631, "learning_rate": 1.0198403359267538e-05, "loss": 0.5484, "step": 3290 }, { "epoch": 1.4719531425996846, "grad_norm": 0.5762406308067755, "learning_rate": 1.0193680067730192e-05, "loss": 0.5662, "step": 3291 }, { "epoch": 1.4724036945257941, "grad_norm": 0.5276954374969142, "learning_rate": 1.0188956732967208e-05, "loss": 0.5416, "step": 3292 }, { "epoch": 1.4728542464519037, "grad_norm": 0.5524807552564, "learning_rate": 1.0184233356032747e-05, "loss": 0.5325, "step": 3293 }, { "epoch": 1.4733047983780132, "grad_norm": 0.5534630362019121, "learning_rate": 1.0179509937980973e-05, "loss": 0.5488, "step": 3294 }, { "epoch": 1.4737553503041225, "grad_norm": 0.5397691285182232, "learning_rate": 1.0174786479866065e-05, "loss": 0.5361, "step": 3295 }, { "epoch": 1.474205902230232, "grad_norm": 0.510588337691447, "learning_rate": 1.0170062982742207e-05, "loss": 0.5065, "step": 3296 }, { "epoch": 1.4746564541563414, "grad_norm": 0.5558428005123731, "learning_rate": 1.0165339447663586e-05, "loss": 0.5594, "step": 3297 }, { "epoch": 1.475107006082451, "grad_norm": 0.5308762359587658, "learning_rate": 1.016061587568441e-05, "loss": 0.5437, "step": 3298 }, { "epoch": 1.4755575580085605, "grad_norm": 0.5257102007053271, "learning_rate": 1.0155892267858884e-05, "loss": 0.5102, "step": 3299 }, { "epoch": 1.47600810993467, "grad_norm": 0.5421796605382041, "learning_rate": 1.015116862524123e-05, "loss": 0.5795, "step": 3300 }, { "epoch": 1.4764586618607796, "grad_norm": 0.5592900360372284, "learning_rate": 1.014644494888567e-05, "loss": 0.5444, "step": 3301 }, { "epoch": 1.4769092137868889, "grad_norm": 0.544098930086657, "learning_rate": 1.0141721239846436e-05, "loss": 0.5537, "step": 3302 }, { "epoch": 1.4773597657129984, "grad_norm": 0.5550408965588395, "learning_rate": 1.0136997499177773e-05, "loss": 0.5258, "step": 3303 }, { "epoch": 1.477810317639108, "grad_norm": 0.5363466564859481, "learning_rate": 1.0132273727933925e-05, "loss": 0.5561, "step": 3304 }, { "epoch": 1.4782608695652173, "grad_norm": 0.5515386554412877, "learning_rate": 1.0127549927169147e-05, "loss": 0.5243, "step": 3305 }, { "epoch": 1.4787114214913268, "grad_norm": 0.5597419083995395, "learning_rate": 1.0122826097937699e-05, "loss": 0.5498, "step": 3306 }, { "epoch": 1.4791619734174364, "grad_norm": 0.5533509076230411, "learning_rate": 1.0118102241293848e-05, "loss": 0.5678, "step": 3307 }, { "epoch": 1.479612525343546, "grad_norm": 0.5299279283694236, "learning_rate": 1.0113378358291864e-05, "loss": 0.5703, "step": 3308 }, { "epoch": 1.4800630772696552, "grad_norm": 0.532031570179923, "learning_rate": 1.0108654449986032e-05, "loss": 0.5397, "step": 3309 }, { "epoch": 1.4805136291957648, "grad_norm": 0.5244762774810477, "learning_rate": 1.0103930517430635e-05, "loss": 0.5555, "step": 3310 }, { "epoch": 1.4809641811218743, "grad_norm": 0.5238536509177996, "learning_rate": 1.0099206561679964e-05, "loss": 0.5192, "step": 3311 }, { "epoch": 1.4814147330479837, "grad_norm": 0.5665888785096148, "learning_rate": 1.0094482583788311e-05, "loss": 0.5589, "step": 3312 }, { "epoch": 1.4818652849740932, "grad_norm": 0.562399663879378, "learning_rate": 1.008975858480998e-05, "loss": 0.5517, "step": 3313 }, { "epoch": 1.4823158369002027, "grad_norm": 0.5432794610449833, "learning_rate": 1.0085034565799275e-05, "loss": 0.5412, "step": 3314 }, { "epoch": 1.4827663888263123, "grad_norm": 0.5130790157369214, "learning_rate": 1.00803105278105e-05, "loss": 0.5602, "step": 3315 }, { "epoch": 1.4832169407524218, "grad_norm": 0.5547307815266541, "learning_rate": 1.0075586471897976e-05, "loss": 0.5025, "step": 3316 }, { "epoch": 1.4836674926785312, "grad_norm": 0.520779375711625, "learning_rate": 1.0070862399116016e-05, "loss": 0.5249, "step": 3317 }, { "epoch": 1.4841180446046407, "grad_norm": 0.5364187953379472, "learning_rate": 1.0066138310518942e-05, "loss": 0.564, "step": 3318 }, { "epoch": 1.4845685965307502, "grad_norm": 0.5359203757624423, "learning_rate": 1.0061414207161082e-05, "loss": 0.5501, "step": 3319 }, { "epoch": 1.4850191484568596, "grad_norm": 0.5327235020382645, "learning_rate": 1.0056690090096761e-05, "loss": 0.5248, "step": 3320 }, { "epoch": 1.485469700382969, "grad_norm": 0.5272117803240027, "learning_rate": 1.0051965960380312e-05, "loss": 0.5268, "step": 3321 }, { "epoch": 1.4859202523090786, "grad_norm": 0.6236979607251093, "learning_rate": 1.0047241819066069e-05, "loss": 0.5617, "step": 3322 }, { "epoch": 1.4863708042351882, "grad_norm": 0.5174836734635095, "learning_rate": 1.004251766720837e-05, "loss": 0.5452, "step": 3323 }, { "epoch": 1.4868213561612975, "grad_norm": 0.5549156456180365, "learning_rate": 1.0037793505861549e-05, "loss": 0.5586, "step": 3324 }, { "epoch": 1.487271908087407, "grad_norm": 0.5803305824330035, "learning_rate": 1.0033069336079952e-05, "loss": 0.5419, "step": 3325 }, { "epoch": 1.4877224600135166, "grad_norm": 0.5611651877317816, "learning_rate": 1.0028345158917923e-05, "loss": 0.5291, "step": 3326 }, { "epoch": 1.488173011939626, "grad_norm": 0.5562578376446087, "learning_rate": 1.0023620975429803e-05, "loss": 0.5526, "step": 3327 }, { "epoch": 1.4886235638657355, "grad_norm": 0.5573164885730664, "learning_rate": 1.0018896786669936e-05, "loss": 0.5419, "step": 3328 }, { "epoch": 1.489074115791845, "grad_norm": 0.5552736995894058, "learning_rate": 1.0014172593692675e-05, "loss": 0.5385, "step": 3329 }, { "epoch": 1.4895246677179546, "grad_norm": 0.5266522589575242, "learning_rate": 1.0009448397552367e-05, "loss": 0.5302, "step": 3330 }, { "epoch": 1.4895246677179546, "eval_loss": 0.6350666284561157, "eval_runtime": 24.4007, "eval_samples_per_second": 11.434, "eval_steps_per_second": 0.492, "step": 3330 }, { "epoch": 1.489975219644064, "grad_norm": 0.5577854364348317, "learning_rate": 1.000472419930336e-05, "loss": 0.5495, "step": 3331 }, { "epoch": 1.4904257715701734, "grad_norm": 0.5501365990440772, "learning_rate": 1e-05, "loss": 0.5401, "step": 3332 }, { "epoch": 1.490876323496283, "grad_norm": 0.571524264681957, "learning_rate": 9.995275800696642e-06, "loss": 0.5496, "step": 3333 }, { "epoch": 1.4913268754223925, "grad_norm": 0.5716797279736268, "learning_rate": 9.990551602447635e-06, "loss": 0.558, "step": 3334 }, { "epoch": 1.4917774273485018, "grad_norm": 0.5398162459667425, "learning_rate": 9.985827406307325e-06, "loss": 0.5754, "step": 3335 }, { "epoch": 1.4922279792746114, "grad_norm": 0.5793542203473507, "learning_rate": 9.981103213330067e-06, "loss": 0.5441, "step": 3336 }, { "epoch": 1.492678531200721, "grad_norm": 0.5092668474944843, "learning_rate": 9.976379024570202e-06, "loss": 0.5288, "step": 3337 }, { "epoch": 1.4931290831268305, "grad_norm": 0.5651395284240474, "learning_rate": 9.97165484108208e-06, "loss": 0.5575, "step": 3338 }, { "epoch": 1.4935796350529398, "grad_norm": 0.5451159845107784, "learning_rate": 9.96693066392005e-06, "loss": 0.5503, "step": 3339 }, { "epoch": 1.4940301869790493, "grad_norm": 0.5585238448917081, "learning_rate": 9.962206494138454e-06, "loss": 0.5548, "step": 3340 }, { "epoch": 1.4944807389051589, "grad_norm": 0.5294319587594065, "learning_rate": 9.957482332791632e-06, "loss": 0.567, "step": 3341 }, { "epoch": 1.4949312908312682, "grad_norm": 0.535957006259404, "learning_rate": 9.952758180933933e-06, "loss": 0.5283, "step": 3342 }, { "epoch": 1.4953818427573777, "grad_norm": 0.5506854442964498, "learning_rate": 9.948034039619688e-06, "loss": 0.5232, "step": 3343 }, { "epoch": 1.4958323946834873, "grad_norm": 0.5365774662987888, "learning_rate": 9.94330990990324e-06, "loss": 0.5408, "step": 3344 }, { "epoch": 1.4962829466095968, "grad_norm": 0.5280665487778572, "learning_rate": 9.938585792838918e-06, "loss": 0.5507, "step": 3345 }, { "epoch": 1.4967334985357064, "grad_norm": 0.5623463376924319, "learning_rate": 9.93386168948106e-06, "loss": 0.5617, "step": 3346 }, { "epoch": 1.4971840504618157, "grad_norm": 0.5338890254085382, "learning_rate": 9.929137600883986e-06, "loss": 0.568, "step": 3347 }, { "epoch": 1.4976346023879252, "grad_norm": 0.543012653078492, "learning_rate": 9.924413528102029e-06, "loss": 0.5403, "step": 3348 }, { "epoch": 1.4980851543140348, "grad_norm": 0.5179242294930263, "learning_rate": 9.919689472189502e-06, "loss": 0.5254, "step": 3349 }, { "epoch": 1.498535706240144, "grad_norm": 0.5589183259923212, "learning_rate": 9.914965434200729e-06, "loss": 0.5136, "step": 3350 }, { "epoch": 1.4989862581662536, "grad_norm": 0.531111607094991, "learning_rate": 9.910241415190022e-06, "loss": 0.5517, "step": 3351 }, { "epoch": 1.4994368100923632, "grad_norm": 0.530826565543791, "learning_rate": 9.90551741621169e-06, "loss": 0.5746, "step": 3352 }, { "epoch": 1.4998873620184727, "grad_norm": 0.5409603704579996, "learning_rate": 9.900793438320037e-06, "loss": 0.517, "step": 3353 }, { "epoch": 1.5003379139445823, "grad_norm": 0.5093686522794134, "learning_rate": 9.896069482569366e-06, "loss": 0.5205, "step": 3354 }, { "epoch": 1.5007884658706916, "grad_norm": 0.5236852990328352, "learning_rate": 9.891345550013967e-06, "loss": 0.5074, "step": 3355 }, { "epoch": 1.5012390177968011, "grad_norm": 0.5419673849220279, "learning_rate": 9.886621641708138e-06, "loss": 0.568, "step": 3356 }, { "epoch": 1.5016895697229105, "grad_norm": 0.5366631073155058, "learning_rate": 9.881897758706155e-06, "loss": 0.5595, "step": 3357 }, { "epoch": 1.50214012164902, "grad_norm": 0.5365976888081132, "learning_rate": 9.877173902062307e-06, "loss": 0.5317, "step": 3358 }, { "epoch": 1.5025906735751295, "grad_norm": 0.519342849890789, "learning_rate": 9.872450072830856e-06, "loss": 0.5585, "step": 3359 }, { "epoch": 1.503041225501239, "grad_norm": 0.5452177929110267, "learning_rate": 9.86772627206608e-06, "loss": 0.5594, "step": 3360 }, { "epoch": 1.5034917774273486, "grad_norm": 0.5167767412402221, "learning_rate": 9.86300250082223e-06, "loss": 0.5171, "step": 3361 }, { "epoch": 1.503942329353458, "grad_norm": 0.5738300799284956, "learning_rate": 9.858278760153567e-06, "loss": 0.5668, "step": 3362 }, { "epoch": 1.5043928812795675, "grad_norm": 0.5328632953781776, "learning_rate": 9.853555051114334e-06, "loss": 0.5167, "step": 3363 }, { "epoch": 1.5048434332056768, "grad_norm": 0.5502438533445116, "learning_rate": 9.848831374758776e-06, "loss": 0.5284, "step": 3364 }, { "epoch": 1.5052939851317864, "grad_norm": 0.5225623637462185, "learning_rate": 9.844107732141119e-06, "loss": 0.5724, "step": 3365 }, { "epoch": 1.505744537057896, "grad_norm": 0.5223079100760126, "learning_rate": 9.839384124315596e-06, "loss": 0.534, "step": 3366 }, { "epoch": 1.5061950889840054, "grad_norm": 0.5484931586285127, "learning_rate": 9.834660552336415e-06, "loss": 0.5436, "step": 3367 }, { "epoch": 1.506645640910115, "grad_norm": 0.5495487012519575, "learning_rate": 9.829937017257798e-06, "loss": 0.5216, "step": 3368 }, { "epoch": 1.5070961928362245, "grad_norm": 0.5346861265987605, "learning_rate": 9.825213520133937e-06, "loss": 0.5378, "step": 3369 }, { "epoch": 1.5075467447623339, "grad_norm": 0.536941443274154, "learning_rate": 9.82049006201903e-06, "loss": 0.5442, "step": 3370 }, { "epoch": 1.5079972966884432, "grad_norm": 0.5462326295625984, "learning_rate": 9.815766643967256e-06, "loss": 0.527, "step": 3371 }, { "epoch": 1.5084478486145527, "grad_norm": 0.5398007464202508, "learning_rate": 9.811043267032797e-06, "loss": 0.5264, "step": 3372 }, { "epoch": 1.5088984005406623, "grad_norm": 0.5194630293856699, "learning_rate": 9.806319932269812e-06, "loss": 0.5589, "step": 3373 }, { "epoch": 1.5093489524667718, "grad_norm": 0.5070709440985893, "learning_rate": 9.801596640732466e-06, "loss": 0.5264, "step": 3374 }, { "epoch": 1.5097995043928814, "grad_norm": 0.5440871943724762, "learning_rate": 9.796873393474899e-06, "loss": 0.5775, "step": 3375 }, { "epoch": 1.510250056318991, "grad_norm": 0.5337215122097322, "learning_rate": 9.79215019155125e-06, "loss": 0.5627, "step": 3376 }, { "epoch": 1.5107006082451002, "grad_norm": 0.547922887470037, "learning_rate": 9.787427036015647e-06, "loss": 0.5457, "step": 3377 }, { "epoch": 1.5111511601712098, "grad_norm": 0.5185694673170826, "learning_rate": 9.782703927922207e-06, "loss": 0.5707, "step": 3378 }, { "epoch": 1.511601712097319, "grad_norm": 0.5372143479517506, "learning_rate": 9.777980868325032e-06, "loss": 0.5041, "step": 3379 }, { "epoch": 1.5120522640234286, "grad_norm": 0.5337927260144876, "learning_rate": 9.773257858278225e-06, "loss": 0.5285, "step": 3380 }, { "epoch": 1.5125028159495382, "grad_norm": 0.5398940169344427, "learning_rate": 9.768534898835864e-06, "loss": 0.5357, "step": 3381 }, { "epoch": 1.5129533678756477, "grad_norm": 0.5319845012791196, "learning_rate": 9.763811991052021e-06, "loss": 0.5303, "step": 3382 }, { "epoch": 1.5134039198017573, "grad_norm": 0.5398065763973207, "learning_rate": 9.759089135980765e-06, "loss": 0.5737, "step": 3383 }, { "epoch": 1.5138544717278668, "grad_norm": 0.5263218855325105, "learning_rate": 9.75436633467614e-06, "loss": 0.5474, "step": 3384 }, { "epoch": 1.5143050236539761, "grad_norm": 0.5308932139586218, "learning_rate": 9.749643588192182e-06, "loss": 0.5389, "step": 3385 }, { "epoch": 1.5147555755800854, "grad_norm": 0.5415794248589006, "learning_rate": 9.744920897582922e-06, "loss": 0.5477, "step": 3386 }, { "epoch": 1.515206127506195, "grad_norm": 0.522690217076062, "learning_rate": 9.74019826390237e-06, "loss": 0.5378, "step": 3387 }, { "epoch": 1.5156566794323045, "grad_norm": 0.5442222308778198, "learning_rate": 9.735475688204522e-06, "loss": 0.4931, "step": 3388 }, { "epoch": 1.516107231358414, "grad_norm": 0.5500630671708753, "learning_rate": 9.730753171543374e-06, "loss": 0.5473, "step": 3389 }, { "epoch": 1.5165577832845236, "grad_norm": 0.5240596517333772, "learning_rate": 9.726030714972891e-06, "loss": 0.514, "step": 3390 }, { "epoch": 1.5170083352106332, "grad_norm": 0.5527654492358468, "learning_rate": 9.721308319547043e-06, "loss": 0.5666, "step": 3391 }, { "epoch": 1.5174588871367425, "grad_norm": 0.5198000818537272, "learning_rate": 9.716585986319769e-06, "loss": 0.5187, "step": 3392 }, { "epoch": 1.517909439062852, "grad_norm": 0.5455386532399019, "learning_rate": 9.711863716345008e-06, "loss": 0.5458, "step": 3393 }, { "epoch": 1.5183599909889613, "grad_norm": 0.558666665356048, "learning_rate": 9.707141510676673e-06, "loss": 0.5441, "step": 3394 }, { "epoch": 1.518810542915071, "grad_norm": 0.5063635109293575, "learning_rate": 9.702419370368677e-06, "loss": 0.5625, "step": 3395 }, { "epoch": 1.5192610948411804, "grad_norm": 0.5641520139690246, "learning_rate": 9.6976972964749e-06, "loss": 0.5433, "step": 3396 }, { "epoch": 1.51971164676729, "grad_norm": 0.5269940153185281, "learning_rate": 9.692975290049228e-06, "loss": 0.552, "step": 3397 }, { "epoch": 1.5201621986933995, "grad_norm": 0.5180616034398902, "learning_rate": 9.688253352145513e-06, "loss": 0.5272, "step": 3398 }, { "epoch": 1.5206127506195088, "grad_norm": 0.5336595935870573, "learning_rate": 9.683531483817606e-06, "loss": 0.5334, "step": 3399 }, { "epoch": 1.5210633025456184, "grad_norm": 0.5484751300988384, "learning_rate": 9.678809686119328e-06, "loss": 0.5343, "step": 3400 }, { "epoch": 1.5215138544717277, "grad_norm": 0.5241646823733758, "learning_rate": 9.674087960104502e-06, "loss": 0.5247, "step": 3401 }, { "epoch": 1.5219644063978373, "grad_norm": 0.5347798252521765, "learning_rate": 9.669366306826919e-06, "loss": 0.5448, "step": 3402 }, { "epoch": 1.5224149583239468, "grad_norm": 0.5056508584384437, "learning_rate": 9.664644727340366e-06, "loss": 0.5175, "step": 3403 }, { "epoch": 1.5228655102500563, "grad_norm": 0.4995861582484772, "learning_rate": 9.6599232226986e-06, "loss": 0.5241, "step": 3404 }, { "epoch": 1.5233160621761659, "grad_norm": 0.523223714806493, "learning_rate": 9.65520179395538e-06, "loss": 0.5301, "step": 3405 }, { "epoch": 1.5237666141022754, "grad_norm": 0.5122196323769199, "learning_rate": 9.650480442164426e-06, "loss": 0.5, "step": 3406 }, { "epoch": 1.5242171660283848, "grad_norm": 0.5411843474778472, "learning_rate": 9.645759168379463e-06, "loss": 0.556, "step": 3407 }, { "epoch": 1.5246677179544943, "grad_norm": 0.5240355480054075, "learning_rate": 9.641037973654179e-06, "loss": 0.5097, "step": 3408 }, { "epoch": 1.5251182698806036, "grad_norm": 0.5604142892587748, "learning_rate": 9.636316859042258e-06, "loss": 0.5585, "step": 3409 }, { "epoch": 1.5255688218067132, "grad_norm": 0.545421289483462, "learning_rate": 9.63159582559736e-06, "loss": 0.5566, "step": 3410 }, { "epoch": 1.5260193737328227, "grad_norm": 0.5176524441803115, "learning_rate": 9.626874874373126e-06, "loss": 0.5339, "step": 3411 }, { "epoch": 1.5264699256589322, "grad_norm": 0.5434001685730753, "learning_rate": 9.622154006423185e-06, "loss": 0.547, "step": 3412 }, { "epoch": 1.5269204775850418, "grad_norm": 0.532272765151279, "learning_rate": 9.61743322280114e-06, "loss": 0.58, "step": 3413 }, { "epoch": 1.5273710295111511, "grad_norm": 0.5151896159153183, "learning_rate": 9.612712524560577e-06, "loss": 0.5236, "step": 3414 }, { "epoch": 1.5278215814372607, "grad_norm": 0.5320424152549652, "learning_rate": 9.60799191275507e-06, "loss": 0.5513, "step": 3415 }, { "epoch": 1.52827213336337, "grad_norm": 0.5261030599399169, "learning_rate": 9.603271388438158e-06, "loss": 0.5201, "step": 3416 }, { "epoch": 1.5287226852894795, "grad_norm": 0.5176572217303904, "learning_rate": 9.598550952663383e-06, "loss": 0.5327, "step": 3417 }, { "epoch": 1.529173237215589, "grad_norm": 0.5461544761403052, "learning_rate": 9.593830606484245e-06, "loss": 0.5282, "step": 3418 }, { "epoch": 1.5296237891416986, "grad_norm": 0.5225212288509463, "learning_rate": 9.589110350954241e-06, "loss": 0.5515, "step": 3419 }, { "epoch": 1.5300743410678082, "grad_norm": 0.5302014029283473, "learning_rate": 9.58439018712683e-06, "loss": 0.5155, "step": 3420 }, { "epoch": 1.5305248929939177, "grad_norm": 0.5518476145278758, "learning_rate": 9.579670116055474e-06, "loss": 0.5532, "step": 3421 }, { "epoch": 1.530975444920027, "grad_norm": 0.5166820227961368, "learning_rate": 9.574950138793593e-06, "loss": 0.5136, "step": 3422 }, { "epoch": 1.5314259968461366, "grad_norm": 0.5434102015248088, "learning_rate": 9.570230256394595e-06, "loss": 0.5154, "step": 3423 }, { "epoch": 1.5318765487722459, "grad_norm": 0.5329223001138227, "learning_rate": 9.565510469911869e-06, "loss": 0.5409, "step": 3424 }, { "epoch": 1.5323271006983554, "grad_norm": 0.5283757343433625, "learning_rate": 9.56079078039878e-06, "loss": 0.5595, "step": 3425 }, { "epoch": 1.532777652624465, "grad_norm": 0.556681688993417, "learning_rate": 9.556071188908665e-06, "loss": 0.5335, "step": 3426 }, { "epoch": 1.5332282045505745, "grad_norm": 0.5397663156474232, "learning_rate": 9.551351696494854e-06, "loss": 0.5581, "step": 3427 }, { "epoch": 1.533678756476684, "grad_norm": 0.5330524362969593, "learning_rate": 9.546632304210638e-06, "loss": 0.5254, "step": 3428 }, { "epoch": 1.5341293084027934, "grad_norm": 0.5647563622763564, "learning_rate": 9.541913013109302e-06, "loss": 0.5451, "step": 3429 }, { "epoch": 1.534579860328903, "grad_norm": 0.5233819843389015, "learning_rate": 9.537193824244092e-06, "loss": 0.5317, "step": 3430 }, { "epoch": 1.5350304122550122, "grad_norm": 0.5093100113511895, "learning_rate": 9.532474738668247e-06, "loss": 0.537, "step": 3431 }, { "epoch": 1.5354809641811218, "grad_norm": 0.5610523287813575, "learning_rate": 9.527755757434968e-06, "loss": 0.5521, "step": 3432 }, { "epoch": 1.5359315161072313, "grad_norm": 0.5179579668458281, "learning_rate": 9.523036881597445e-06, "loss": 0.553, "step": 3433 }, { "epoch": 1.5363820680333409, "grad_norm": 0.5119782956504325, "learning_rate": 9.518318112208839e-06, "loss": 0.5368, "step": 3434 }, { "epoch": 1.5368326199594504, "grad_norm": 0.530904250058871, "learning_rate": 9.513599450322287e-06, "loss": 0.506, "step": 3435 }, { "epoch": 1.53728317188556, "grad_norm": 0.5276429213658014, "learning_rate": 9.508880896990905e-06, "loss": 0.506, "step": 3436 }, { "epoch": 1.5377337238116693, "grad_norm": 0.548536775839654, "learning_rate": 9.504162453267776e-06, "loss": 0.5496, "step": 3437 }, { "epoch": 1.5381842757377788, "grad_norm": 0.5150837839756238, "learning_rate": 9.499444120205976e-06, "loss": 0.544, "step": 3438 }, { "epoch": 1.5386348276638881, "grad_norm": 0.5399966289713397, "learning_rate": 9.494725898858534e-06, "loss": 0.5403, "step": 3439 }, { "epoch": 1.5390853795899977, "grad_norm": 0.5337331704347228, "learning_rate": 9.490007790278473e-06, "loss": 0.5327, "step": 3440 }, { "epoch": 1.5395359315161072, "grad_norm": 0.5389522123442064, "learning_rate": 9.485289795518779e-06, "loss": 0.5027, "step": 3441 }, { "epoch": 1.5399864834422168, "grad_norm": 0.5145809490807257, "learning_rate": 9.480571915632422e-06, "loss": 0.5166, "step": 3442 }, { "epoch": 1.5404370353683263, "grad_norm": 0.5438928580222671, "learning_rate": 9.475854151672333e-06, "loss": 0.5556, "step": 3443 }, { "epoch": 1.5408875872944356, "grad_norm": 0.5200259509392715, "learning_rate": 9.471136504691436e-06, "loss": 0.511, "step": 3444 }, { "epoch": 1.5413381392205452, "grad_norm": 0.5309648519170527, "learning_rate": 9.46641897574261e-06, "loss": 0.5504, "step": 3445 }, { "epoch": 1.5417886911466545, "grad_norm": 0.5560596435617634, "learning_rate": 9.46170156587872e-06, "loss": 0.5733, "step": 3446 }, { "epoch": 1.542239243072764, "grad_norm": 0.52603878594227, "learning_rate": 9.456984276152598e-06, "loss": 0.5343, "step": 3447 }, { "epoch": 1.5426897949988736, "grad_norm": 0.5122587274784657, "learning_rate": 9.452267107617055e-06, "loss": 0.5634, "step": 3448 }, { "epoch": 1.5431403469249831, "grad_norm": 0.5315933411063675, "learning_rate": 9.447550061324864e-06, "loss": 0.5144, "step": 3449 }, { "epoch": 1.5435908988510927, "grad_norm": 0.5441459777310433, "learning_rate": 9.442833138328789e-06, "loss": 0.5602, "step": 3450 }, { "epoch": 1.5440414507772022, "grad_norm": 0.5424357062848506, "learning_rate": 9.438116339681546e-06, "loss": 0.5308, "step": 3451 }, { "epoch": 1.5444920027033116, "grad_norm": 0.5069545277093026, "learning_rate": 9.43339966643584e-06, "loss": 0.5216, "step": 3452 }, { "epoch": 1.544942554629421, "grad_norm": 0.5395164832835809, "learning_rate": 9.428683119644336e-06, "loss": 0.5632, "step": 3453 }, { "epoch": 1.5453931065555304, "grad_norm": 0.5570322321952528, "learning_rate": 9.42396670035968e-06, "loss": 0.5593, "step": 3454 }, { "epoch": 1.54584365848164, "grad_norm": 0.4940118910661828, "learning_rate": 9.41925040963448e-06, "loss": 0.497, "step": 3455 }, { "epoch": 1.5462942104077495, "grad_norm": 0.5317662358047933, "learning_rate": 9.41453424852133e-06, "loss": 0.5535, "step": 3456 }, { "epoch": 1.546744762333859, "grad_norm": 0.5283850849559262, "learning_rate": 9.409818218072774e-06, "loss": 0.5275, "step": 3457 }, { "epoch": 1.5471953142599686, "grad_norm": 0.5355280513181421, "learning_rate": 9.405102319341345e-06, "loss": 0.553, "step": 3458 }, { "epoch": 1.547645866186078, "grad_norm": 0.5307513324238763, "learning_rate": 9.400386553379539e-06, "loss": 0.5352, "step": 3459 }, { "epoch": 1.5480964181121875, "grad_norm": 0.5146889238151733, "learning_rate": 9.395670921239827e-06, "loss": 0.5167, "step": 3460 }, { "epoch": 1.5485469700382968, "grad_norm": 0.5303980631225341, "learning_rate": 9.390955423974638e-06, "loss": 0.5275, "step": 3461 }, { "epoch": 1.5489975219644063, "grad_norm": 0.5339921705000789, "learning_rate": 9.386240062636388e-06, "loss": 0.5377, "step": 3462 }, { "epoch": 1.5494480738905159, "grad_norm": 0.5300876028026034, "learning_rate": 9.381524838277448e-06, "loss": 0.5584, "step": 3463 }, { "epoch": 1.5498986258166254, "grad_norm": 0.5232068609560713, "learning_rate": 9.376809751950173e-06, "loss": 0.5485, "step": 3464 }, { "epoch": 1.550349177742735, "grad_norm": 0.506898868488482, "learning_rate": 9.372094804706867e-06, "loss": 0.5331, "step": 3465 }, { "epoch": 1.5507997296688445, "grad_norm": 0.537512969393655, "learning_rate": 9.367379997599825e-06, "loss": 0.5239, "step": 3466 }, { "epoch": 1.5512502815949538, "grad_norm": 0.5382344547597553, "learning_rate": 9.362665331681294e-06, "loss": 0.5472, "step": 3467 }, { "epoch": 1.5517008335210631, "grad_norm": 0.5172619007971095, "learning_rate": 9.357950808003503e-06, "loss": 0.5277, "step": 3468 }, { "epoch": 1.5521513854471727, "grad_norm": 0.5517688904559488, "learning_rate": 9.353236427618633e-06, "loss": 0.5615, "step": 3469 }, { "epoch": 1.5526019373732822, "grad_norm": 0.5403057730590337, "learning_rate": 9.34852219157885e-06, "loss": 0.5213, "step": 3470 }, { "epoch": 1.5530524892993918, "grad_norm": 0.5755253932652752, "learning_rate": 9.343808100936277e-06, "loss": 0.5468, "step": 3471 }, { "epoch": 1.5535030412255013, "grad_norm": 0.5644880396699266, "learning_rate": 9.339094156743007e-06, "loss": 0.5336, "step": 3472 }, { "epoch": 1.5539535931516109, "grad_norm": 0.5283529569048463, "learning_rate": 9.334380360051102e-06, "loss": 0.5168, "step": 3473 }, { "epoch": 1.5544041450777202, "grad_norm": 0.5257107823042894, "learning_rate": 9.329666711912591e-06, "loss": 0.5539, "step": 3474 }, { "epoch": 1.5548546970038297, "grad_norm": 0.522693056535645, "learning_rate": 9.324953213379464e-06, "loss": 0.5367, "step": 3475 }, { "epoch": 1.555305248929939, "grad_norm": 0.5165494470299271, "learning_rate": 9.32023986550369e-06, "loss": 0.5383, "step": 3476 }, { "epoch": 1.5557558008560486, "grad_norm": 0.5193618917107499, "learning_rate": 9.315526669337189e-06, "loss": 0.5101, "step": 3477 }, { "epoch": 1.5562063527821581, "grad_norm": 0.5250609705130147, "learning_rate": 9.310813625931862e-06, "loss": 0.5652, "step": 3478 }, { "epoch": 1.5566569047082677, "grad_norm": 0.5408585200762198, "learning_rate": 9.30610073633956e-06, "loss": 0.5478, "step": 3479 }, { "epoch": 1.5571074566343772, "grad_norm": 0.5215743400071752, "learning_rate": 9.30138800161212e-06, "loss": 0.5521, "step": 3480 }, { "epoch": 1.5575580085604868, "grad_norm": 0.5384461094865121, "learning_rate": 9.296675422801324e-06, "loss": 0.6104, "step": 3481 }, { "epoch": 1.558008560486596, "grad_norm": 0.5162784553067684, "learning_rate": 9.291963000958932e-06, "loss": 0.5193, "step": 3482 }, { "epoch": 1.5584591124127054, "grad_norm": 0.5244429854261026, "learning_rate": 9.287250737136667e-06, "loss": 0.5596, "step": 3483 }, { "epoch": 1.558909664338815, "grad_norm": 0.5121151592425232, "learning_rate": 9.282538632386208e-06, "loss": 0.5672, "step": 3484 }, { "epoch": 1.5593602162649245, "grad_norm": 0.5278085911797777, "learning_rate": 9.277826687759212e-06, "loss": 0.5584, "step": 3485 }, { "epoch": 1.559810768191034, "grad_norm": 0.5394725346071617, "learning_rate": 9.27311490430729e-06, "loss": 0.5345, "step": 3486 }, { "epoch": 1.5602613201171436, "grad_norm": 0.5467926721873975, "learning_rate": 9.268403283082025e-06, "loss": 0.5647, "step": 3487 }, { "epoch": 1.5607118720432531, "grad_norm": 0.5303102977208621, "learning_rate": 9.263691825134951e-06, "loss": 0.5687, "step": 3488 }, { "epoch": 1.5611624239693624, "grad_norm": 0.5446111322469855, "learning_rate": 9.258980531517585e-06, "loss": 0.5668, "step": 3489 }, { "epoch": 1.561612975895472, "grad_norm": 0.5203344643019839, "learning_rate": 9.254269403281387e-06, "loss": 0.5148, "step": 3490 }, { "epoch": 1.5620635278215813, "grad_norm": 0.5419760647815505, "learning_rate": 9.249558441477798e-06, "loss": 0.5484, "step": 3491 }, { "epoch": 1.5625140797476909, "grad_norm": 0.5252564833140662, "learning_rate": 9.244847647158203e-06, "loss": 0.5753, "step": 3492 }, { "epoch": 1.5629646316738004, "grad_norm": 0.5523317448008309, "learning_rate": 9.24013702137397e-06, "loss": 0.5367, "step": 3493 }, { "epoch": 1.56341518359991, "grad_norm": 0.5366619700314543, "learning_rate": 9.235426565176413e-06, "loss": 0.5281, "step": 3494 }, { "epoch": 1.5638657355260195, "grad_norm": 0.5332191852242877, "learning_rate": 9.230716279616818e-06, "loss": 0.5405, "step": 3495 }, { "epoch": 1.5643162874521288, "grad_norm": 0.5397445247073133, "learning_rate": 9.226006165746422e-06, "loss": 0.5618, "step": 3496 }, { "epoch": 1.5647668393782384, "grad_norm": 0.5356915399727518, "learning_rate": 9.221296224616443e-06, "loss": 0.5539, "step": 3497 }, { "epoch": 1.5652173913043477, "grad_norm": 0.5131662585515318, "learning_rate": 9.216586457278037e-06, "loss": 0.503, "step": 3498 }, { "epoch": 1.5656679432304572, "grad_norm": 0.5469676396942313, "learning_rate": 9.211876864782343e-06, "loss": 0.538, "step": 3499 }, { "epoch": 1.5661184951565668, "grad_norm": 0.5121154263175121, "learning_rate": 9.20716744818044e-06, "loss": 0.5132, "step": 3500 }, { "epoch": 1.5665690470826763, "grad_norm": 0.5421399004841787, "learning_rate": 9.20245820852339e-06, "loss": 0.522, "step": 3501 }, { "epoch": 1.5670195990087858, "grad_norm": 0.5609444896211448, "learning_rate": 9.197749146862193e-06, "loss": 0.5496, "step": 3502 }, { "epoch": 1.5674701509348954, "grad_norm": 0.5078672069605872, "learning_rate": 9.19304026424783e-06, "loss": 0.5302, "step": 3503 }, { "epoch": 1.5679207028610047, "grad_norm": 0.5189560261510741, "learning_rate": 9.18833156173122e-06, "loss": 0.5513, "step": 3504 }, { "epoch": 1.5683712547871143, "grad_norm": 0.5351526208489393, "learning_rate": 9.18362304036327e-06, "loss": 0.5412, "step": 3505 }, { "epoch": 1.5688218067132236, "grad_norm": 0.539541823639698, "learning_rate": 9.178914701194817e-06, "loss": 0.5282, "step": 3506 }, { "epoch": 1.5692723586393331, "grad_norm": 0.526518671444541, "learning_rate": 9.174206545276678e-06, "loss": 0.5456, "step": 3507 }, { "epoch": 1.5697229105654427, "grad_norm": 0.5352191717782239, "learning_rate": 9.16949857365962e-06, "loss": 0.555, "step": 3508 }, { "epoch": 1.5701734624915522, "grad_norm": 0.5560074934351198, "learning_rate": 9.164790787394372e-06, "loss": 0.5625, "step": 3509 }, { "epoch": 1.5706240144176618, "grad_norm": 0.6047358059364158, "learning_rate": 9.160083187531616e-06, "loss": 0.55, "step": 3510 }, { "epoch": 1.571074566343771, "grad_norm": 0.5435290270634919, "learning_rate": 9.155375775122007e-06, "loss": 0.562, "step": 3511 }, { "epoch": 1.5715251182698806, "grad_norm": 0.5638706851427744, "learning_rate": 9.150668551216134e-06, "loss": 0.5689, "step": 3512 }, { "epoch": 1.57197567019599, "grad_norm": 0.5487582254451313, "learning_rate": 9.145961516864572e-06, "loss": 0.5574, "step": 3513 }, { "epoch": 1.5724262221220995, "grad_norm": 0.5662802977562239, "learning_rate": 9.14125467311783e-06, "loss": 0.5235, "step": 3514 }, { "epoch": 1.572876774048209, "grad_norm": 0.560929339136504, "learning_rate": 9.136548021026391e-06, "loss": 0.5453, "step": 3515 }, { "epoch": 1.5733273259743186, "grad_norm": 0.5534062796417462, "learning_rate": 9.131841561640681e-06, "loss": 0.54, "step": 3516 }, { "epoch": 1.5737778779004281, "grad_norm": 0.5207017703320681, "learning_rate": 9.127135296011102e-06, "loss": 0.5272, "step": 3517 }, { "epoch": 1.5742284298265377, "grad_norm": 0.5531951748692119, "learning_rate": 9.122429225187989e-06, "loss": 0.5643, "step": 3518 }, { "epoch": 1.574678981752647, "grad_norm": 0.5232633185919932, "learning_rate": 9.117723350221651e-06, "loss": 0.5269, "step": 3519 }, { "epoch": 1.5751295336787565, "grad_norm": 0.5631346388723624, "learning_rate": 9.113017672162347e-06, "loss": 0.5901, "step": 3520 }, { "epoch": 1.5755800856048658, "grad_norm": 0.5653998722304711, "learning_rate": 9.108312192060298e-06, "loss": 0.5388, "step": 3521 }, { "epoch": 1.5760306375309754, "grad_norm": 0.5094112525858567, "learning_rate": 9.103606910965666e-06, "loss": 0.5517, "step": 3522 }, { "epoch": 1.576481189457085, "grad_norm": 0.5359695155542318, "learning_rate": 9.098901829928589e-06, "loss": 0.5684, "step": 3523 }, { "epoch": 1.5769317413831945, "grad_norm": 0.563637650642966, "learning_rate": 9.09419694999914e-06, "loss": 0.573, "step": 3524 }, { "epoch": 1.577382293309304, "grad_norm": 0.5313758735677061, "learning_rate": 9.089492272227366e-06, "loss": 0.5173, "step": 3525 }, { "epoch": 1.5778328452354133, "grad_norm": 0.5136450699014264, "learning_rate": 9.084787797663253e-06, "loss": 0.5225, "step": 3526 }, { "epoch": 1.5782833971615229, "grad_norm": 0.5287493572258616, "learning_rate": 9.080083527356755e-06, "loss": 0.4919, "step": 3527 }, { "epoch": 1.5787339490876322, "grad_norm": 0.526009002739225, "learning_rate": 9.075379462357766e-06, "loss": 0.5315, "step": 3528 }, { "epoch": 1.5791845010137417, "grad_norm": 0.5221626265796353, "learning_rate": 9.070675603716151e-06, "loss": 0.5224, "step": 3529 }, { "epoch": 1.5796350529398513, "grad_norm": 0.5351920955449153, "learning_rate": 9.065971952481708e-06, "loss": 0.5461, "step": 3530 }, { "epoch": 1.5800856048659608, "grad_norm": 0.5324889159016702, "learning_rate": 9.061268509704214e-06, "loss": 0.5369, "step": 3531 }, { "epoch": 1.5805361567920704, "grad_norm": 0.5505077514793295, "learning_rate": 9.056565276433378e-06, "loss": 0.5591, "step": 3532 }, { "epoch": 1.58098670871818, "grad_norm": 0.5282471126305307, "learning_rate": 9.051862253718872e-06, "loss": 0.5424, "step": 3533 }, { "epoch": 1.5814372606442892, "grad_norm": 0.5292129506229737, "learning_rate": 9.047159442610322e-06, "loss": 0.5288, "step": 3534 }, { "epoch": 1.5818878125703988, "grad_norm": 0.5330920052158512, "learning_rate": 9.042456844157299e-06, "loss": 0.5418, "step": 3535 }, { "epoch": 1.582338364496508, "grad_norm": 0.5324234033679687, "learning_rate": 9.037754459409338e-06, "loss": 0.5082, "step": 3536 }, { "epoch": 1.5827889164226177, "grad_norm": 0.553220598805102, "learning_rate": 9.033052289415914e-06, "loss": 0.5513, "step": 3537 }, { "epoch": 1.5832394683487272, "grad_norm": 0.5505790297743111, "learning_rate": 9.028350335226467e-06, "loss": 0.549, "step": 3538 }, { "epoch": 1.5836900202748367, "grad_norm": 0.531494574650489, "learning_rate": 9.023648597890373e-06, "loss": 0.5168, "step": 3539 }, { "epoch": 1.5841405722009463, "grad_norm": 0.5540303031344803, "learning_rate": 9.01894707845698e-06, "loss": 0.5422, "step": 3540 }, { "epoch": 1.5845911241270556, "grad_norm": 0.5630131471067009, "learning_rate": 9.014245777975565e-06, "loss": 0.519, "step": 3541 }, { "epoch": 1.5850416760531651, "grad_norm": 0.560756088504712, "learning_rate": 9.009544697495373e-06, "loss": 0.5295, "step": 3542 }, { "epoch": 1.5854922279792745, "grad_norm": 0.5530715200610674, "learning_rate": 9.004843838065594e-06, "loss": 0.5519, "step": 3543 }, { "epoch": 1.585942779905384, "grad_norm": 0.5225143403158826, "learning_rate": 9.000143200735367e-06, "loss": 0.5443, "step": 3544 }, { "epoch": 1.5863933318314936, "grad_norm": 0.550437428489755, "learning_rate": 8.99544278655378e-06, "loss": 0.5626, "step": 3545 }, { "epoch": 1.586843883757603, "grad_norm": 0.5216162743048263, "learning_rate": 8.990742596569884e-06, "loss": 0.5476, "step": 3546 }, { "epoch": 1.5872944356837126, "grad_norm": 0.5060268495271576, "learning_rate": 8.986042631832656e-06, "loss": 0.5394, "step": 3547 }, { "epoch": 1.5877449876098222, "grad_norm": 0.5191208830949, "learning_rate": 8.981342893391052e-06, "loss": 0.5722, "step": 3548 }, { "epoch": 1.5881955395359315, "grad_norm": 0.5443834709675097, "learning_rate": 8.97664338229395e-06, "loss": 0.5183, "step": 3549 }, { "epoch": 1.588646091462041, "grad_norm": 0.5129354251212539, "learning_rate": 8.9719440995902e-06, "loss": 0.5451, "step": 3550 }, { "epoch": 1.5890966433881504, "grad_norm": 0.5310075234816879, "learning_rate": 8.96724504632858e-06, "loss": 0.5784, "step": 3551 }, { "epoch": 1.58954719531426, "grad_norm": 0.5306421765375744, "learning_rate": 8.962546223557838e-06, "loss": 0.4927, "step": 3552 }, { "epoch": 1.5899977472403695, "grad_norm": 0.5263989197400039, "learning_rate": 8.957847632326656e-06, "loss": 0.5396, "step": 3553 }, { "epoch": 1.590448299166479, "grad_norm": 0.536929087802485, "learning_rate": 8.953149273683665e-06, "loss": 0.5387, "step": 3554 }, { "epoch": 1.5908988510925886, "grad_norm": 0.5410405194263593, "learning_rate": 8.94845114867745e-06, "loss": 0.5199, "step": 3555 }, { "epoch": 1.5913494030186979, "grad_norm": 0.5006855163039842, "learning_rate": 8.943753258356545e-06, "loss": 0.5214, "step": 3556 }, { "epoch": 1.5917999549448074, "grad_norm": 0.5314879484225002, "learning_rate": 8.93905560376942e-06, "loss": 0.5537, "step": 3557 }, { "epoch": 1.5922505068709167, "grad_norm": 0.5328857965293596, "learning_rate": 8.934358185964512e-06, "loss": 0.5319, "step": 3558 }, { "epoch": 1.5927010587970263, "grad_norm": 0.508740565391694, "learning_rate": 8.929661005990178e-06, "loss": 0.5325, "step": 3559 }, { "epoch": 1.5931516107231358, "grad_norm": 0.5093061708227212, "learning_rate": 8.924964064894753e-06, "loss": 0.5192, "step": 3560 }, { "epoch": 1.5936021626492454, "grad_norm": 0.514711869172408, "learning_rate": 8.920267363726493e-06, "loss": 0.5381, "step": 3561 }, { "epoch": 1.594052714575355, "grad_norm": 0.5491426429681163, "learning_rate": 8.915570903533615e-06, "loss": 0.5365, "step": 3562 }, { "epoch": 1.5945032665014645, "grad_norm": 0.5201475183378923, "learning_rate": 8.910874685364275e-06, "loss": 0.5274, "step": 3563 }, { "epoch": 1.5949538184275738, "grad_norm": 0.5204593139528768, "learning_rate": 8.906178710266584e-06, "loss": 0.5535, "step": 3564 }, { "epoch": 1.595404370353683, "grad_norm": 0.5179904797595307, "learning_rate": 8.901482979288582e-06, "loss": 0.4971, "step": 3565 }, { "epoch": 1.5958549222797926, "grad_norm": 0.5537674124894822, "learning_rate": 8.896787493478277e-06, "loss": 0.5626, "step": 3566 }, { "epoch": 1.5963054742059022, "grad_norm": 0.5551501831614328, "learning_rate": 8.892092253883602e-06, "loss": 0.5275, "step": 3567 }, { "epoch": 1.5967560261320117, "grad_norm": 0.53263068671655, "learning_rate": 8.887397261552443e-06, "loss": 0.5363, "step": 3568 }, { "epoch": 1.5972065780581213, "grad_norm": 0.5511702732584578, "learning_rate": 8.882702517532637e-06, "loss": 0.5058, "step": 3569 }, { "epoch": 1.5976571299842308, "grad_norm": 0.5261524361031163, "learning_rate": 8.878008022871959e-06, "loss": 0.5079, "step": 3570 }, { "epoch": 1.5981076819103401, "grad_norm": 0.544877749575327, "learning_rate": 8.873313778618122e-06, "loss": 0.5352, "step": 3571 }, { "epoch": 1.5985582338364497, "grad_norm": 0.5263405482123975, "learning_rate": 8.8686197858188e-06, "loss": 0.5555, "step": 3572 }, { "epoch": 1.599008785762559, "grad_norm": 0.5419470778277258, "learning_rate": 8.863926045521593e-06, "loss": 0.5966, "step": 3573 }, { "epoch": 1.5994593376886685, "grad_norm": 0.539923830212709, "learning_rate": 8.85923255877406e-06, "loss": 0.5191, "step": 3574 }, { "epoch": 1.599909889614778, "grad_norm": 0.5731090611166589, "learning_rate": 8.854539326623687e-06, "loss": 0.5454, "step": 3575 }, { "epoch": 1.6003604415408876, "grad_norm": 0.5388486213765808, "learning_rate": 8.849846350117923e-06, "loss": 0.5499, "step": 3576 }, { "epoch": 1.6008109934669972, "grad_norm": 0.5612105178063134, "learning_rate": 8.84515363030414e-06, "loss": 0.5638, "step": 3577 }, { "epoch": 1.6012615453931065, "grad_norm": 0.8853179114793734, "learning_rate": 8.84046116822967e-06, "loss": 0.5459, "step": 3578 }, { "epoch": 1.601712097319216, "grad_norm": 0.5409231261320554, "learning_rate": 8.835768964941773e-06, "loss": 0.5513, "step": 3579 }, { "epoch": 1.6021626492453254, "grad_norm": 0.5264656936781896, "learning_rate": 8.831077021487662e-06, "loss": 0.514, "step": 3580 }, { "epoch": 1.602613201171435, "grad_norm": 0.5633878245931428, "learning_rate": 8.826385338914485e-06, "loss": 0.5641, "step": 3581 }, { "epoch": 1.6030637530975445, "grad_norm": 0.5748637132791906, "learning_rate": 8.821693918269334e-06, "loss": 0.5451, "step": 3582 }, { "epoch": 1.603514305023654, "grad_norm": 0.5158396824240727, "learning_rate": 8.81700276059925e-06, "loss": 0.5052, "step": 3583 }, { "epoch": 1.6039648569497635, "grad_norm": 0.549126684295142, "learning_rate": 8.812311866951198e-06, "loss": 0.515, "step": 3584 }, { "epoch": 1.604415408875873, "grad_norm": 0.5598652226809911, "learning_rate": 8.807621238372104e-06, "loss": 0.5714, "step": 3585 }, { "epoch": 1.6048659608019824, "grad_norm": 0.5694741999541032, "learning_rate": 8.802930875908818e-06, "loss": 0.5585, "step": 3586 }, { "epoch": 1.605316512728092, "grad_norm": 0.5497171185742986, "learning_rate": 8.798240780608143e-06, "loss": 0.5379, "step": 3587 }, { "epoch": 1.6057670646542013, "grad_norm": 0.5544099554274512, "learning_rate": 8.793550953516817e-06, "loss": 0.5479, "step": 3588 }, { "epoch": 1.6062176165803108, "grad_norm": 0.5243915429308722, "learning_rate": 8.788861395681512e-06, "loss": 0.5273, "step": 3589 }, { "epoch": 1.6066681685064204, "grad_norm": 0.5558445447278343, "learning_rate": 8.784172108148855e-06, "loss": 0.5713, "step": 3590 }, { "epoch": 1.60711872043253, "grad_norm": 0.5837972370082267, "learning_rate": 8.7794830919654e-06, "loss": 0.5445, "step": 3591 }, { "epoch": 1.6075692723586394, "grad_norm": 0.508061577048167, "learning_rate": 8.774794348177641e-06, "loss": 0.546, "step": 3592 }, { "epoch": 1.6080198242847488, "grad_norm": 0.5460406669426945, "learning_rate": 8.770105877832022e-06, "loss": 0.5283, "step": 3593 }, { "epoch": 1.6084703762108583, "grad_norm": 0.5695649569588825, "learning_rate": 8.76541768197491e-06, "loss": 0.5294, "step": 3594 }, { "epoch": 1.6089209281369676, "grad_norm": 0.5350265159970025, "learning_rate": 8.76072976165263e-06, "loss": 0.5317, "step": 3595 }, { "epoch": 1.6093714800630772, "grad_norm": 0.582252992044158, "learning_rate": 8.756042117911423e-06, "loss": 0.5348, "step": 3596 }, { "epoch": 1.6098220319891867, "grad_norm": 0.5830768799475807, "learning_rate": 8.751354751797492e-06, "loss": 0.5715, "step": 3597 }, { "epoch": 1.6102725839152963, "grad_norm": 0.5466851420379147, "learning_rate": 8.746667664356957e-06, "loss": 0.5037, "step": 3598 }, { "epoch": 1.6107231358414058, "grad_norm": 0.5555473780572663, "learning_rate": 8.741980856635893e-06, "loss": 0.5472, "step": 3599 }, { "epoch": 1.6111736877675154, "grad_norm": 0.5446813305799569, "learning_rate": 8.737294329680294e-06, "loss": 0.5071, "step": 3600 }, { "epoch": 1.6116242396936247, "grad_norm": 0.5206309540610817, "learning_rate": 8.732608084536111e-06, "loss": 0.505, "step": 3601 }, { "epoch": 1.6120747916197342, "grad_norm": 0.5337350337561576, "learning_rate": 8.727922122249221e-06, "loss": 0.5781, "step": 3602 }, { "epoch": 1.6125253435458435, "grad_norm": 0.5521669784716129, "learning_rate": 8.723236443865437e-06, "loss": 0.5589, "step": 3603 }, { "epoch": 1.612975895471953, "grad_norm": 0.5049635446047621, "learning_rate": 8.718551050430515e-06, "loss": 0.5337, "step": 3604 }, { "epoch": 1.6134264473980626, "grad_norm": 0.5228026942419772, "learning_rate": 8.713865942990143e-06, "loss": 0.5395, "step": 3605 }, { "epoch": 1.6138769993241722, "grad_norm": 0.5443005185963534, "learning_rate": 8.709181122589941e-06, "loss": 0.5281, "step": 3606 }, { "epoch": 1.6143275512502817, "grad_norm": 0.5276848009577566, "learning_rate": 8.704496590275479e-06, "loss": 0.5212, "step": 3607 }, { "epoch": 1.614778103176391, "grad_norm": 0.5526201505898839, "learning_rate": 8.699812347092245e-06, "loss": 0.5555, "step": 3608 }, { "epoch": 1.6152286551025006, "grad_norm": 0.5130690203163, "learning_rate": 8.69512839408568e-06, "loss": 0.532, "step": 3609 }, { "epoch": 1.61567920702861, "grad_norm": 0.5097960916863966, "learning_rate": 8.690444732301141e-06, "loss": 0.5487, "step": 3610 }, { "epoch": 1.6161297589547194, "grad_norm": 0.5250792302551722, "learning_rate": 8.685761362783943e-06, "loss": 0.5574, "step": 3611 }, { "epoch": 1.616580310880829, "grad_norm": 0.5244036575943056, "learning_rate": 8.68107828657931e-06, "loss": 0.4945, "step": 3612 }, { "epoch": 1.6170308628069385, "grad_norm": 0.5291941287957797, "learning_rate": 8.676395504732427e-06, "loss": 0.53, "step": 3613 }, { "epoch": 1.617481414733048, "grad_norm": 0.5325580092607496, "learning_rate": 8.671713018288391e-06, "loss": 0.5451, "step": 3614 }, { "epoch": 1.6179319666591576, "grad_norm": 0.5015522256470405, "learning_rate": 8.667030828292242e-06, "loss": 0.5277, "step": 3615 }, { "epoch": 1.618382518585267, "grad_norm": 0.5252596240610298, "learning_rate": 8.662348935788959e-06, "loss": 0.517, "step": 3616 }, { "epoch": 1.6188330705113765, "grad_norm": 0.5200987842599541, "learning_rate": 8.657667341823449e-06, "loss": 0.5114, "step": 3617 }, { "epoch": 1.6192836224374858, "grad_norm": 0.5109347515629709, "learning_rate": 8.652986047440545e-06, "loss": 0.5187, "step": 3618 }, { "epoch": 1.6197341743635953, "grad_norm": 0.563682749858587, "learning_rate": 8.648305053685035e-06, "loss": 0.5217, "step": 3619 }, { "epoch": 1.620184726289705, "grad_norm": 0.5181230784364786, "learning_rate": 8.643624361601611e-06, "loss": 0.5665, "step": 3620 }, { "epoch": 1.6206352782158144, "grad_norm": 0.5150318363225898, "learning_rate": 8.638943972234926e-06, "loss": 0.5527, "step": 3621 }, { "epoch": 1.621085830141924, "grad_norm": 0.5299267629961931, "learning_rate": 8.63426388662954e-06, "loss": 0.5414, "step": 3622 }, { "epoch": 1.6215363820680333, "grad_norm": 0.5170922036716937, "learning_rate": 8.62958410582997e-06, "loss": 0.5305, "step": 3623 }, { "epoch": 1.6219869339941428, "grad_norm": 0.5488395860102543, "learning_rate": 8.624904630880638e-06, "loss": 0.5424, "step": 3624 }, { "epoch": 1.6224374859202522, "grad_norm": 0.5159486295709365, "learning_rate": 8.620225462825927e-06, "loss": 0.5381, "step": 3625 }, { "epoch": 1.6228880378463617, "grad_norm": 0.5003119386990994, "learning_rate": 8.615546602710126e-06, "loss": 0.5403, "step": 3626 }, { "epoch": 1.6233385897724713, "grad_norm": 0.5285361146952012, "learning_rate": 8.61086805157747e-06, "loss": 0.5223, "step": 3627 }, { "epoch": 1.6237891416985808, "grad_norm": 0.5482200038830135, "learning_rate": 8.606189810472117e-06, "loss": 0.5309, "step": 3628 }, { "epoch": 1.6242396936246903, "grad_norm": 0.5291539564055497, "learning_rate": 8.601511880438167e-06, "loss": 0.548, "step": 3629 }, { "epoch": 1.6246902455507999, "grad_norm": 0.5244335503631903, "learning_rate": 8.596834262519634e-06, "loss": 0.5451, "step": 3630 }, { "epoch": 1.6251407974769092, "grad_norm": 0.522458980077781, "learning_rate": 8.592156957760477e-06, "loss": 0.5403, "step": 3631 }, { "epoch": 1.6255913494030187, "grad_norm": 0.5324192499215924, "learning_rate": 8.587479967204584e-06, "loss": 0.5705, "step": 3632 }, { "epoch": 1.626041901329128, "grad_norm": 0.5198367598664808, "learning_rate": 8.582803291895758e-06, "loss": 0.546, "step": 3633 }, { "epoch": 1.6264924532552376, "grad_norm": 0.5597542841246861, "learning_rate": 8.578126932877752e-06, "loss": 0.5609, "step": 3634 }, { "epoch": 1.6269430051813472, "grad_norm": 0.5294449157001129, "learning_rate": 8.573450891194233e-06, "loss": 0.5298, "step": 3635 }, { "epoch": 1.6273935571074567, "grad_norm": 0.4935662866014521, "learning_rate": 8.568775167888806e-06, "loss": 0.4871, "step": 3636 }, { "epoch": 1.6278441090335662, "grad_norm": 0.5134655889260231, "learning_rate": 8.564099764004998e-06, "loss": 0.5387, "step": 3637 }, { "epoch": 1.6282946609596756, "grad_norm": 0.5396915787837142, "learning_rate": 8.559424680586272e-06, "loss": 0.5249, "step": 3638 }, { "epoch": 1.628745212885785, "grad_norm": 0.5456679922530188, "learning_rate": 8.554749918676014e-06, "loss": 0.5434, "step": 3639 }, { "epoch": 1.6291957648118944, "grad_norm": 0.5218073210480485, "learning_rate": 8.550075479317542e-06, "loss": 0.5556, "step": 3640 }, { "epoch": 1.629646316738004, "grad_norm": 0.5465284990974706, "learning_rate": 8.545401363554095e-06, "loss": 0.5721, "step": 3641 }, { "epoch": 1.6300968686641135, "grad_norm": 0.549653197774832, "learning_rate": 8.540727572428854e-06, "loss": 0.5365, "step": 3642 }, { "epoch": 1.630547420590223, "grad_norm": 0.5230844762822133, "learning_rate": 8.536054106984908e-06, "loss": 0.5389, "step": 3643 }, { "epoch": 1.6309979725163326, "grad_norm": 0.5321500368828839, "learning_rate": 8.531380968265295e-06, "loss": 0.5332, "step": 3644 }, { "epoch": 1.6314485244424421, "grad_norm": 0.5234774639606952, "learning_rate": 8.526708157312957e-06, "loss": 0.4872, "step": 3645 }, { "epoch": 1.6318990763685515, "grad_norm": 0.5167796178291002, "learning_rate": 8.522035675170785e-06, "loss": 0.5389, "step": 3646 }, { "epoch": 1.632349628294661, "grad_norm": 0.5601170071449166, "learning_rate": 8.51736352288158e-06, "loss": 0.5344, "step": 3647 }, { "epoch": 1.6328001802207703, "grad_norm": 0.5157826773182946, "learning_rate": 8.51269170148808e-06, "loss": 0.5026, "step": 3648 }, { "epoch": 1.6332507321468799, "grad_norm": 0.5799349108543949, "learning_rate": 8.508020212032943e-06, "loss": 0.5959, "step": 3649 }, { "epoch": 1.6337012840729894, "grad_norm": 0.5374707133485143, "learning_rate": 8.503349055558754e-06, "loss": 0.5023, "step": 3650 }, { "epoch": 1.634151835999099, "grad_norm": 0.5307775360622168, "learning_rate": 8.498678233108026e-06, "loss": 0.4915, "step": 3651 }, { "epoch": 1.6346023879252085, "grad_norm": 0.539295077451084, "learning_rate": 8.494007745723197e-06, "loss": 0.5219, "step": 3652 }, { "epoch": 1.6350529398513178, "grad_norm": 0.5040579005523643, "learning_rate": 8.489337594446621e-06, "loss": 0.508, "step": 3653 }, { "epoch": 1.6355034917774274, "grad_norm": 0.5354191677473494, "learning_rate": 8.484667780320598e-06, "loss": 0.5595, "step": 3654 }, { "epoch": 1.6359540437035367, "grad_norm": 0.5234170397181099, "learning_rate": 8.479998304387329e-06, "loss": 0.5476, "step": 3655 }, { "epoch": 1.6364045956296462, "grad_norm": 0.5359100763348136, "learning_rate": 8.47532916768896e-06, "loss": 0.5314, "step": 3656 }, { "epoch": 1.6368551475557558, "grad_norm": 0.5642398342880439, "learning_rate": 8.47066037126754e-06, "loss": 0.5773, "step": 3657 }, { "epoch": 1.6373056994818653, "grad_norm": 0.513906569937701, "learning_rate": 8.465991916165068e-06, "loss": 0.515, "step": 3658 }, { "epoch": 1.6377562514079749, "grad_norm": 0.5886904986892675, "learning_rate": 8.46132380342344e-06, "loss": 0.5267, "step": 3659 }, { "epoch": 1.6382068033340844, "grad_norm": 0.5279321042938752, "learning_rate": 8.456656034084497e-06, "loss": 0.5339, "step": 3660 }, { "epoch": 1.6386573552601937, "grad_norm": 0.5483611619849698, "learning_rate": 8.451988609189987e-06, "loss": 0.5107, "step": 3661 }, { "epoch": 1.639107907186303, "grad_norm": 0.5398994686380634, "learning_rate": 8.447321529781597e-06, "loss": 0.5254, "step": 3662 }, { "epoch": 1.6395584591124126, "grad_norm": 0.5492866577966509, "learning_rate": 8.442654796900922e-06, "loss": 0.5216, "step": 3663 }, { "epoch": 1.6400090110385221, "grad_norm": 0.5411846019433094, "learning_rate": 8.43798841158949e-06, "loss": 0.5411, "step": 3664 }, { "epoch": 1.6404595629646317, "grad_norm": 0.5332044549482209, "learning_rate": 8.433322374888744e-06, "loss": 0.5159, "step": 3665 }, { "epoch": 1.6409101148907412, "grad_norm": 0.5489322725699768, "learning_rate": 8.428656687840058e-06, "loss": 0.5653, "step": 3666 }, { "epoch": 1.6413606668168508, "grad_norm": 0.5282587101825745, "learning_rate": 8.423991351484715e-06, "loss": 0.5435, "step": 3667 }, { "epoch": 1.64181121874296, "grad_norm": 0.520313659363108, "learning_rate": 8.419326366863937e-06, "loss": 0.5112, "step": 3668 }, { "epoch": 1.6422617706690696, "grad_norm": 0.5511907734440059, "learning_rate": 8.414661735018851e-06, "loss": 0.5553, "step": 3669 }, { "epoch": 1.642712322595179, "grad_norm": 0.5230959633867248, "learning_rate": 8.409997456990519e-06, "loss": 0.5403, "step": 3670 }, { "epoch": 1.6431628745212885, "grad_norm": 0.5220295772515928, "learning_rate": 8.405333533819909e-06, "loss": 0.5374, "step": 3671 }, { "epoch": 1.643613426447398, "grad_norm": 0.536291764336703, "learning_rate": 8.400669966547925e-06, "loss": 0.5535, "step": 3672 }, { "epoch": 1.6440639783735076, "grad_norm": 0.5113163059705187, "learning_rate": 8.39600675621538e-06, "loss": 0.556, "step": 3673 }, { "epoch": 1.6445145302996171, "grad_norm": 0.5438195499731088, "learning_rate": 8.391343903863018e-06, "loss": 0.524, "step": 3674 }, { "epoch": 1.6449650822257265, "grad_norm": 0.539377295419473, "learning_rate": 8.386681410531493e-06, "loss": 0.532, "step": 3675 }, { "epoch": 1.645415634151836, "grad_norm": 0.5189091698212883, "learning_rate": 8.38201927726138e-06, "loss": 0.5586, "step": 3676 }, { "epoch": 1.6458661860779453, "grad_norm": 0.5186897815889404, "learning_rate": 8.377357505093183e-06, "loss": 0.5293, "step": 3677 }, { "epoch": 1.6463167380040549, "grad_norm": 0.5333969458798054, "learning_rate": 8.372696095067318e-06, "loss": 0.5263, "step": 3678 }, { "epoch": 1.6467672899301644, "grad_norm": 0.5422717904951737, "learning_rate": 8.368035048224116e-06, "loss": 0.5402, "step": 3679 }, { "epoch": 1.647217841856274, "grad_norm": 0.5427387793156285, "learning_rate": 8.363374365603836e-06, "loss": 0.5443, "step": 3680 }, { "epoch": 1.6476683937823835, "grad_norm": 0.5378692613004072, "learning_rate": 8.358714048246656e-06, "loss": 0.5283, "step": 3681 }, { "epoch": 1.648118945708493, "grad_norm": 0.5187882838126723, "learning_rate": 8.35405409719266e-06, "loss": 0.5422, "step": 3682 }, { "epoch": 1.6485694976346024, "grad_norm": 0.5413635393683999, "learning_rate": 8.349394513481868e-06, "loss": 0.5782, "step": 3683 }, { "epoch": 1.649020049560712, "grad_norm": 0.5257858912872552, "learning_rate": 8.3447352981542e-06, "loss": 0.5213, "step": 3684 }, { "epoch": 1.6494706014868212, "grad_norm": 0.5204425250103616, "learning_rate": 8.34007645224951e-06, "loss": 0.568, "step": 3685 }, { "epoch": 1.6499211534129308, "grad_norm": 0.5302509370101285, "learning_rate": 8.335417976807554e-06, "loss": 0.5229, "step": 3686 }, { "epoch": 1.6503717053390403, "grad_norm": 0.5352584446722333, "learning_rate": 8.330759872868022e-06, "loss": 0.5226, "step": 3687 }, { "epoch": 1.6508222572651499, "grad_norm": 0.5572516040110059, "learning_rate": 8.326102141470505e-06, "loss": 0.5632, "step": 3688 }, { "epoch": 1.6512728091912594, "grad_norm": 0.5292177125061269, "learning_rate": 8.321444783654524e-06, "loss": 0.5498, "step": 3689 }, { "epoch": 1.6517233611173687, "grad_norm": 0.5436669534326768, "learning_rate": 8.316787800459506e-06, "loss": 0.5417, "step": 3690 }, { "epoch": 1.6521739130434783, "grad_norm": 0.5225209557561512, "learning_rate": 8.312131192924804e-06, "loss": 0.5066, "step": 3691 }, { "epoch": 1.6526244649695876, "grad_norm": 0.5365109987499247, "learning_rate": 8.307474962089676e-06, "loss": 0.557, "step": 3692 }, { "epoch": 1.6530750168956971, "grad_norm": 0.5424791411123062, "learning_rate": 8.302819108993311e-06, "loss": 0.5068, "step": 3693 }, { "epoch": 1.6535255688218067, "grad_norm": 0.5043022621725447, "learning_rate": 8.298163634674798e-06, "loss": 0.5625, "step": 3694 }, { "epoch": 1.6539761207479162, "grad_norm": 0.5381657263343882, "learning_rate": 8.293508540173154e-06, "loss": 0.5183, "step": 3695 }, { "epoch": 1.6544266726740258, "grad_norm": 0.5128210594463235, "learning_rate": 8.2888538265273e-06, "loss": 0.5157, "step": 3696 }, { "epoch": 1.6548772246001353, "grad_norm": 0.5308417390031289, "learning_rate": 8.284199494776083e-06, "loss": 0.543, "step": 3697 }, { "epoch": 1.6553277765262446, "grad_norm": 0.5138159123229514, "learning_rate": 8.279545545958258e-06, "loss": 0.5223, "step": 3698 }, { "epoch": 1.6557783284523542, "grad_norm": 0.5492080049496412, "learning_rate": 8.274891981112494e-06, "loss": 0.5356, "step": 3699 }, { "epoch": 1.6562288803784635, "grad_norm": 0.5210318302640211, "learning_rate": 8.27023880127738e-06, "loss": 0.5373, "step": 3700 }, { "epoch": 1.656679432304573, "grad_norm": 0.5439751162917879, "learning_rate": 8.265586007491413e-06, "loss": 0.5494, "step": 3701 }, { "epoch": 1.6571299842306826, "grad_norm": 0.5500479482752338, "learning_rate": 8.260933600793003e-06, "loss": 0.5153, "step": 3702 }, { "epoch": 1.6575805361567921, "grad_norm": 0.5157043606949145, "learning_rate": 8.256281582220486e-06, "loss": 0.5083, "step": 3703 }, { "epoch": 1.6580310880829017, "grad_norm": 0.5474151483404355, "learning_rate": 8.251629952812092e-06, "loss": 0.5255, "step": 3704 }, { "epoch": 1.658481640009011, "grad_norm": 0.5465714607641381, "learning_rate": 8.246978713605984e-06, "loss": 0.5069, "step": 3705 }, { "epoch": 1.6589321919351205, "grad_norm": 0.5312530931094309, "learning_rate": 8.242327865640218e-06, "loss": 0.5196, "step": 3706 }, { "epoch": 1.6593827438612299, "grad_norm": 0.5540403322772621, "learning_rate": 8.237677409952784e-06, "loss": 0.5278, "step": 3707 }, { "epoch": 1.6598332957873394, "grad_norm": 0.5322362876767058, "learning_rate": 8.233027347581565e-06, "loss": 0.5506, "step": 3708 }, { "epoch": 1.660283847713449, "grad_norm": 0.5566346923116529, "learning_rate": 8.228377679564372e-06, "loss": 0.5495, "step": 3709 }, { "epoch": 1.6607343996395585, "grad_norm": 0.5534459896909643, "learning_rate": 8.223728406938914e-06, "loss": 0.5383, "step": 3710 }, { "epoch": 1.661184951565668, "grad_norm": 0.5190053155374508, "learning_rate": 8.219079530742823e-06, "loss": 0.5612, "step": 3711 }, { "epoch": 1.6616355034917776, "grad_norm": 0.5537096138161745, "learning_rate": 8.214431052013636e-06, "loss": 0.5285, "step": 3712 }, { "epoch": 1.662086055417887, "grad_norm": 0.5176847298150127, "learning_rate": 8.209782971788804e-06, "loss": 0.5343, "step": 3713 }, { "epoch": 1.6625366073439964, "grad_norm": 0.5340205386326842, "learning_rate": 8.205135291105684e-06, "loss": 0.5527, "step": 3714 }, { "epoch": 1.6629871592701058, "grad_norm": 0.5472943280392425, "learning_rate": 8.200488011001558e-06, "loss": 0.5584, "step": 3715 }, { "epoch": 1.6634377111962153, "grad_norm": 0.5052745041025668, "learning_rate": 8.195841132513596e-06, "loss": 0.5044, "step": 3716 }, { "epoch": 1.6638882631223249, "grad_norm": 0.5419182441611402, "learning_rate": 8.191194656678905e-06, "loss": 0.5337, "step": 3717 }, { "epoch": 1.6643388150484344, "grad_norm": 0.5526900703108018, "learning_rate": 8.186548584534476e-06, "loss": 0.5124, "step": 3718 }, { "epoch": 1.664789366974544, "grad_norm": 0.6210595642697169, "learning_rate": 8.18190291711723e-06, "loss": 0.5346, "step": 3719 }, { "epoch": 1.6652399189006533, "grad_norm": 0.5424628264329256, "learning_rate": 8.177257655463984e-06, "loss": 0.5419, "step": 3720 }, { "epoch": 1.6656904708267628, "grad_norm": 0.536430819469175, "learning_rate": 8.17261280061148e-06, "loss": 0.5793, "step": 3721 }, { "epoch": 1.6661410227528721, "grad_norm": 0.5078399126844471, "learning_rate": 8.16796835359635e-06, "loss": 0.5553, "step": 3722 }, { "epoch": 1.6665915746789817, "grad_norm": 0.5456747544586085, "learning_rate": 8.163324315455145e-06, "loss": 0.5515, "step": 3723 }, { "epoch": 1.6670421266050912, "grad_norm": 0.5568741890814312, "learning_rate": 8.15868068722433e-06, "loss": 0.5562, "step": 3724 }, { "epoch": 1.6674926785312008, "grad_norm": 0.5579675931700853, "learning_rate": 8.15403746994027e-06, "loss": 0.5492, "step": 3725 }, { "epoch": 1.6679432304573103, "grad_norm": 0.5402238621216163, "learning_rate": 8.149394664639238e-06, "loss": 0.5483, "step": 3726 }, { "epoch": 1.6683937823834198, "grad_norm": 0.5340504884058286, "learning_rate": 8.144752272357424e-06, "loss": 0.5136, "step": 3727 }, { "epoch": 1.6688443343095292, "grad_norm": 0.5348539201753181, "learning_rate": 8.140110294130911e-06, "loss": 0.5688, "step": 3728 }, { "epoch": 1.6692948862356387, "grad_norm": 0.5437826438336475, "learning_rate": 8.135468730995711e-06, "loss": 0.5559, "step": 3729 }, { "epoch": 1.669745438161748, "grad_norm": 0.5423624376543165, "learning_rate": 8.130827583987718e-06, "loss": 0.5598, "step": 3730 }, { "epoch": 1.6701959900878576, "grad_norm": 0.5085995427727569, "learning_rate": 8.126186854142752e-06, "loss": 0.5327, "step": 3731 }, { "epoch": 1.6706465420139671, "grad_norm": 0.5328385745429388, "learning_rate": 8.12154654249654e-06, "loss": 0.4984, "step": 3732 }, { "epoch": 1.6710970939400767, "grad_norm": 0.5121593913998705, "learning_rate": 8.116906650084699e-06, "loss": 0.5587, "step": 3733 }, { "epoch": 1.6715476458661862, "grad_norm": 0.5513560767199619, "learning_rate": 8.112267177942767e-06, "loss": 0.5271, "step": 3734 }, { "epoch": 1.6719981977922955, "grad_norm": 0.5161647514325939, "learning_rate": 8.107628127106186e-06, "loss": 0.5228, "step": 3735 }, { "epoch": 1.672448749718405, "grad_norm": 0.538394065415541, "learning_rate": 8.102989498610303e-06, "loss": 0.5407, "step": 3736 }, { "epoch": 1.6728993016445144, "grad_norm": 0.5273356883711076, "learning_rate": 8.098351293490365e-06, "loss": 0.5589, "step": 3737 }, { "epoch": 1.673349853570624, "grad_norm": 0.5116167057931824, "learning_rate": 8.093713512781534e-06, "loss": 0.5029, "step": 3738 }, { "epoch": 1.6738004054967335, "grad_norm": 0.5362918556361959, "learning_rate": 8.089076157518866e-06, "loss": 0.5354, "step": 3739 }, { "epoch": 1.674250957422843, "grad_norm": 0.5294278091401519, "learning_rate": 8.08443922873734e-06, "loss": 0.567, "step": 3740 }, { "epoch": 1.6747015093489526, "grad_norm": 0.5102257817905129, "learning_rate": 8.079802727471815e-06, "loss": 0.5566, "step": 3741 }, { "epoch": 1.675152061275062, "grad_norm": 0.5378936179995588, "learning_rate": 8.07516665475708e-06, "loss": 0.5494, "step": 3742 }, { "epoch": 1.6756026132011714, "grad_norm": 0.528720063804696, "learning_rate": 8.070531011627809e-06, "loss": 0.5433, "step": 3743 }, { "epoch": 1.6760531651272808, "grad_norm": 0.5318784091662264, "learning_rate": 8.065895799118595e-06, "loss": 0.573, "step": 3744 }, { "epoch": 1.6765037170533903, "grad_norm": 0.5317111159578845, "learning_rate": 8.06126101826392e-06, "loss": 0.5495, "step": 3745 }, { "epoch": 1.6769542689794998, "grad_norm": 0.5168181049840612, "learning_rate": 8.05662667009818e-06, "loss": 0.5213, "step": 3746 }, { "epoch": 1.6774048209056094, "grad_norm": 0.5200676317211939, "learning_rate": 8.051992755655672e-06, "loss": 0.5301, "step": 3747 }, { "epoch": 1.677855372831719, "grad_norm": 0.5082475641649135, "learning_rate": 8.047359275970596e-06, "loss": 0.5197, "step": 3748 }, { "epoch": 1.6783059247578285, "grad_norm": 0.5353453942256371, "learning_rate": 8.042726232077052e-06, "loss": 0.5472, "step": 3749 }, { "epoch": 1.6787564766839378, "grad_norm": 0.5320840500461823, "learning_rate": 8.038093625009052e-06, "loss": 0.5054, "step": 3750 }, { "epoch": 1.6792070286100473, "grad_norm": 0.5150493450465115, "learning_rate": 8.033461455800493e-06, "loss": 0.5092, "step": 3751 }, { "epoch": 1.6796575805361567, "grad_norm": 0.5360534165810031, "learning_rate": 8.0288297254852e-06, "loss": 0.5191, "step": 3752 }, { "epoch": 1.6801081324622662, "grad_norm": 0.5305456348141436, "learning_rate": 8.02419843509687e-06, "loss": 0.556, "step": 3753 }, { "epoch": 1.6805586843883757, "grad_norm": 0.5515363988831471, "learning_rate": 8.01956758566913e-06, "loss": 0.5529, "step": 3754 }, { "epoch": 1.6810092363144853, "grad_norm": 0.5421388605537416, "learning_rate": 8.014937178235488e-06, "loss": 0.5475, "step": 3755 }, { "epoch": 1.6814597882405948, "grad_norm": 0.5254270242028313, "learning_rate": 8.010307213829367e-06, "loss": 0.5076, "step": 3756 }, { "epoch": 1.6819103401667044, "grad_norm": 0.5315224251152253, "learning_rate": 8.005677693484077e-06, "loss": 0.5739, "step": 3757 }, { "epoch": 1.6823608920928137, "grad_norm": 0.5184481512937014, "learning_rate": 8.001048618232848e-06, "loss": 0.5273, "step": 3758 }, { "epoch": 1.682811444018923, "grad_norm": 0.5150830000437779, "learning_rate": 7.99641998910879e-06, "loss": 0.5195, "step": 3759 }, { "epoch": 1.6832619959450326, "grad_norm": 0.5283519038135578, "learning_rate": 7.991791807144928e-06, "loss": 0.5607, "step": 3760 }, { "epoch": 1.683712547871142, "grad_norm": 0.5116009098703471, "learning_rate": 7.98716407337418e-06, "loss": 0.513, "step": 3761 }, { "epoch": 1.6841630997972517, "grad_norm": 0.5322323279472729, "learning_rate": 7.98253678882937e-06, "loss": 0.5684, "step": 3762 }, { "epoch": 1.6846136517233612, "grad_norm": 0.541740555705489, "learning_rate": 7.977909954543212e-06, "loss": 0.5517, "step": 3763 }, { "epoch": 1.6850642036494707, "grad_norm": 0.5053497354488773, "learning_rate": 7.973283571548336e-06, "loss": 0.5233, "step": 3764 }, { "epoch": 1.68551475557558, "grad_norm": 0.5529052413639262, "learning_rate": 7.968657640877248e-06, "loss": 0.5243, "step": 3765 }, { "epoch": 1.6859653075016896, "grad_norm": 0.562363435066154, "learning_rate": 7.964032163562378e-06, "loss": 0.58, "step": 3766 }, { "epoch": 1.686415859427799, "grad_norm": 0.5229201415227843, "learning_rate": 7.959407140636034e-06, "loss": 0.504, "step": 3767 }, { "epoch": 1.6868664113539085, "grad_norm": 0.5361206476208255, "learning_rate": 7.954782573130439e-06, "loss": 0.5458, "step": 3768 }, { "epoch": 1.687316963280018, "grad_norm": 0.5387581644753022, "learning_rate": 7.950158462077697e-06, "loss": 0.5568, "step": 3769 }, { "epoch": 1.6877675152061276, "grad_norm": 0.5604185527604325, "learning_rate": 7.945534808509831e-06, "loss": 0.5263, "step": 3770 }, { "epoch": 1.688218067132237, "grad_norm": 0.5159683884205384, "learning_rate": 7.940911613458742e-06, "loss": 0.5308, "step": 3771 }, { "epoch": 1.6886686190583464, "grad_norm": 0.537100051165467, "learning_rate": 7.936288877956243e-06, "loss": 0.5289, "step": 3772 }, { "epoch": 1.689119170984456, "grad_norm": 0.5296312926145725, "learning_rate": 7.931666603034034e-06, "loss": 0.5933, "step": 3773 }, { "epoch": 1.6895697229105653, "grad_norm": 0.5128381180808799, "learning_rate": 7.927044789723724e-06, "loss": 0.513, "step": 3774 }, { "epoch": 1.6900202748366748, "grad_norm": 0.5284548755150067, "learning_rate": 7.922423439056803e-06, "loss": 0.5596, "step": 3775 }, { "epoch": 1.6904708267627844, "grad_norm": 0.5341480320943696, "learning_rate": 7.917802552064675e-06, "loss": 0.5246, "step": 3776 }, { "epoch": 1.690921378688894, "grad_norm": 0.5494339111092246, "learning_rate": 7.913182129778625e-06, "loss": 0.5066, "step": 3777 }, { "epoch": 1.6913719306150035, "grad_norm": 0.5294882655599296, "learning_rate": 7.90856217322985e-06, "loss": 0.5355, "step": 3778 }, { "epoch": 1.691822482541113, "grad_norm": 0.5170720033604516, "learning_rate": 7.903942683449429e-06, "loss": 0.4977, "step": 3779 }, { "epoch": 1.6922730344672223, "grad_norm": 0.5682128221155138, "learning_rate": 7.899323661468344e-06, "loss": 0.5491, "step": 3780 }, { "epoch": 1.6927235863933319, "grad_norm": 0.5312146888603336, "learning_rate": 7.89470510831747e-06, "loss": 0.5228, "step": 3781 }, { "epoch": 1.6931741383194412, "grad_norm": 0.5903939600471755, "learning_rate": 7.89008702502758e-06, "loss": 0.5339, "step": 3782 }, { "epoch": 1.6936246902455507, "grad_norm": 0.5327867161894974, "learning_rate": 7.885469412629342e-06, "loss": 0.5009, "step": 3783 }, { "epoch": 1.6940752421716603, "grad_norm": 0.5344096844912513, "learning_rate": 7.880852272153312e-06, "loss": 0.533, "step": 3784 }, { "epoch": 1.6945257940977698, "grad_norm": 0.5291548741715421, "learning_rate": 7.876235604629955e-06, "loss": 0.5477, "step": 3785 }, { "epoch": 1.6949763460238794, "grad_norm": 0.5356577281715871, "learning_rate": 7.87161941108961e-06, "loss": 0.5597, "step": 3786 }, { "epoch": 1.6954268979499887, "grad_norm": 0.544112044689049, "learning_rate": 7.867003692562533e-06, "loss": 0.5489, "step": 3787 }, { "epoch": 1.6958774498760982, "grad_norm": 0.5185052188808884, "learning_rate": 7.862388450078854e-06, "loss": 0.533, "step": 3788 }, { "epoch": 1.6963280018022076, "grad_norm": 0.5306908200075051, "learning_rate": 7.857773684668616e-06, "loss": 0.5095, "step": 3789 }, { "epoch": 1.696778553728317, "grad_norm": 0.5345271769375516, "learning_rate": 7.853159397361732e-06, "loss": 0.5197, "step": 3790 }, { "epoch": 1.6972291056544266, "grad_norm": 0.5228168176920763, "learning_rate": 7.848545589188035e-06, "loss": 0.5284, "step": 3791 }, { "epoch": 1.6976796575805362, "grad_norm": 0.5762002172577301, "learning_rate": 7.843932261177224e-06, "loss": 0.5366, "step": 3792 }, { "epoch": 1.6981302095066457, "grad_norm": 0.5254020506860761, "learning_rate": 7.839319414358917e-06, "loss": 0.5465, "step": 3793 }, { "epoch": 1.6985807614327553, "grad_norm": 0.5280901243279242, "learning_rate": 7.834707049762605e-06, "loss": 0.5444, "step": 3794 }, { "epoch": 1.6990313133588646, "grad_norm": 0.5799389095010231, "learning_rate": 7.83009516841768e-06, "loss": 0.5445, "step": 3795 }, { "epoch": 1.6994818652849741, "grad_norm": 0.5338336795684228, "learning_rate": 7.82548377135342e-06, "loss": 0.534, "step": 3796 }, { "epoch": 1.6999324172110835, "grad_norm": 0.5207303031739371, "learning_rate": 7.820872859599006e-06, "loss": 0.534, "step": 3797 }, { "epoch": 1.700382969137193, "grad_norm": 0.5435433122198822, "learning_rate": 7.816262434183498e-06, "loss": 0.5114, "step": 3798 }, { "epoch": 1.7008335210633025, "grad_norm": 0.5578228224190657, "learning_rate": 7.811652496135861e-06, "loss": 0.5081, "step": 3799 }, { "epoch": 1.701284072989412, "grad_norm": 0.5407676741870664, "learning_rate": 7.807043046484935e-06, "loss": 0.5381, "step": 3800 }, { "epoch": 1.7017346249155216, "grad_norm": 0.5599801452075928, "learning_rate": 7.80243408625947e-06, "loss": 0.5257, "step": 3801 }, { "epoch": 1.702185176841631, "grad_norm": 0.547211026502493, "learning_rate": 7.797825616488086e-06, "loss": 0.5397, "step": 3802 }, { "epoch": 1.7026357287677405, "grad_norm": 0.5171128042823014, "learning_rate": 7.793217638199314e-06, "loss": 0.5426, "step": 3803 }, { "epoch": 1.7030862806938498, "grad_norm": 0.5354885148826583, "learning_rate": 7.788610152421554e-06, "loss": 0.5055, "step": 3804 }, { "epoch": 1.7035368326199594, "grad_norm": 0.549143418739164, "learning_rate": 7.784003160183117e-06, "loss": 0.4886, "step": 3805 }, { "epoch": 1.703987384546069, "grad_norm": 0.5207150879219147, "learning_rate": 7.779396662512193e-06, "loss": 0.5595, "step": 3806 }, { "epoch": 1.7044379364721784, "grad_norm": 0.5313432822665539, "learning_rate": 7.774790660436857e-06, "loss": 0.532, "step": 3807 }, { "epoch": 1.704888488398288, "grad_norm": 0.5210173235680142, "learning_rate": 7.770185154985086e-06, "loss": 0.5247, "step": 3808 }, { "epoch": 1.7053390403243975, "grad_norm": 0.5313429476108361, "learning_rate": 7.765580147184737e-06, "loss": 0.5243, "step": 3809 }, { "epoch": 1.7057895922505069, "grad_norm": 0.5277950746611322, "learning_rate": 7.760975638063555e-06, "loss": 0.5195, "step": 3810 }, { "epoch": 1.7062401441766164, "grad_norm": 0.506029204265088, "learning_rate": 7.756371628649185e-06, "loss": 0.5277, "step": 3811 }, { "epoch": 1.7066906961027257, "grad_norm": 0.5375929938551036, "learning_rate": 7.751768119969142e-06, "loss": 0.5511, "step": 3812 }, { "epoch": 1.7071412480288353, "grad_norm": 0.525277544708809, "learning_rate": 7.74716511305085e-06, "loss": 0.5504, "step": 3813 }, { "epoch": 1.7075917999549448, "grad_norm": 0.5374776941588845, "learning_rate": 7.742562608921605e-06, "loss": 0.5455, "step": 3814 }, { "epoch": 1.7080423518810544, "grad_norm": 0.5219280999169128, "learning_rate": 7.7379606086086e-06, "loss": 0.5393, "step": 3815 }, { "epoch": 1.708492903807164, "grad_norm": 0.5297298273405919, "learning_rate": 7.733359113138906e-06, "loss": 0.5594, "step": 3816 }, { "epoch": 1.7089434557332732, "grad_norm": 0.5387618689985133, "learning_rate": 7.728758123539498e-06, "loss": 0.5737, "step": 3817 }, { "epoch": 1.7093940076593828, "grad_norm": 0.5065303246182363, "learning_rate": 7.724157640837219e-06, "loss": 0.524, "step": 3818 }, { "epoch": 1.709844559585492, "grad_norm": 0.5598495453843091, "learning_rate": 7.719557666058809e-06, "loss": 0.5673, "step": 3819 }, { "epoch": 1.7102951115116016, "grad_norm": 0.5155650409489052, "learning_rate": 7.714958200230896e-06, "loss": 0.5013, "step": 3820 }, { "epoch": 1.7107456634377112, "grad_norm": 0.531156870223323, "learning_rate": 7.71035924437999e-06, "loss": 0.5449, "step": 3821 }, { "epoch": 1.7111962153638207, "grad_norm": 0.5438297315498092, "learning_rate": 7.705760799532485e-06, "loss": 0.5693, "step": 3822 }, { "epoch": 1.7116467672899303, "grad_norm": 0.530093413721517, "learning_rate": 7.701162866714674e-06, "loss": 0.5198, "step": 3823 }, { "epoch": 1.7120973192160398, "grad_norm": 0.520174004045343, "learning_rate": 7.696565446952716e-06, "loss": 0.5307, "step": 3824 }, { "epoch": 1.7125478711421491, "grad_norm": 0.52858320447711, "learning_rate": 7.691968541272674e-06, "loss": 0.5413, "step": 3825 }, { "epoch": 1.7129984230682587, "grad_norm": 0.5397851594667031, "learning_rate": 7.687372150700479e-06, "loss": 0.546, "step": 3826 }, { "epoch": 1.713448974994368, "grad_norm": 0.5252891969794234, "learning_rate": 7.682776276261969e-06, "loss": 0.5252, "step": 3827 }, { "epoch": 1.7138995269204775, "grad_norm": 0.5298837081511556, "learning_rate": 7.67818091898284e-06, "loss": 0.5173, "step": 3828 }, { "epoch": 1.714350078846587, "grad_norm": 0.5324233251982919, "learning_rate": 7.673586079888699e-06, "loss": 0.5119, "step": 3829 }, { "epoch": 1.7148006307726966, "grad_norm": 0.517212958055149, "learning_rate": 7.668991760005017e-06, "loss": 0.5261, "step": 3830 }, { "epoch": 1.7152511826988062, "grad_norm": 0.5124537563067322, "learning_rate": 7.66439796035716e-06, "loss": 0.53, "step": 3831 }, { "epoch": 1.7157017346249155, "grad_norm": 0.5270120216579729, "learning_rate": 7.659804681970378e-06, "loss": 0.5545, "step": 3832 }, { "epoch": 1.716152286551025, "grad_norm": 0.5597120796012054, "learning_rate": 7.655211925869793e-06, "loss": 0.5144, "step": 3833 }, { "epoch": 1.7166028384771344, "grad_norm": 0.5075006712480337, "learning_rate": 7.65061969308043e-06, "loss": 0.5403, "step": 3834 }, { "epoch": 1.717053390403244, "grad_norm": 0.5042511902700756, "learning_rate": 7.646027984627176e-06, "loss": 0.51, "step": 3835 }, { "epoch": 1.7175039423293534, "grad_norm": 0.5440346504090179, "learning_rate": 7.641436801534818e-06, "loss": 0.5353, "step": 3836 }, { "epoch": 1.717954494255463, "grad_norm": 0.5158670038434775, "learning_rate": 7.636846144828014e-06, "loss": 0.5517, "step": 3837 }, { "epoch": 1.7184050461815725, "grad_norm": 0.5042969114108756, "learning_rate": 7.63225601553132e-06, "loss": 0.5432, "step": 3838 }, { "epoch": 1.718855598107682, "grad_norm": 0.5295291008455169, "learning_rate": 7.627666414669147e-06, "loss": 0.5192, "step": 3839 }, { "epoch": 1.7193061500337914, "grad_norm": 0.558993923866967, "learning_rate": 7.623077343265821e-06, "loss": 0.5355, "step": 3840 }, { "epoch": 1.7197567019599007, "grad_norm": 0.5387955286653154, "learning_rate": 7.618488802345524e-06, "loss": 0.5499, "step": 3841 }, { "epoch": 1.7202072538860103, "grad_norm": 0.5157603554414906, "learning_rate": 7.613900792932331e-06, "loss": 0.4924, "step": 3842 }, { "epoch": 1.7206578058121198, "grad_norm": 0.5595288268698874, "learning_rate": 7.6093133160502e-06, "loss": 0.5402, "step": 3843 }, { "epoch": 1.7211083577382293, "grad_norm": 0.5266737681854082, "learning_rate": 7.604726372722967e-06, "loss": 0.5485, "step": 3844 }, { "epoch": 1.7215589096643389, "grad_norm": 0.5497806156532392, "learning_rate": 7.600139963974341e-06, "loss": 0.5249, "step": 3845 }, { "epoch": 1.7220094615904484, "grad_norm": 0.5263626083812514, "learning_rate": 7.5955540908279305e-06, "loss": 0.5338, "step": 3846 }, { "epoch": 1.7224600135165578, "grad_norm": 0.530791856970669, "learning_rate": 7.590968754307202e-06, "loss": 0.5218, "step": 3847 }, { "epoch": 1.7229105654426673, "grad_norm": 0.5197401945461405, "learning_rate": 7.586383955435526e-06, "loss": 0.5464, "step": 3848 }, { "epoch": 1.7233611173687766, "grad_norm": 0.5462953143441464, "learning_rate": 7.5817996952361285e-06, "loss": 0.5724, "step": 3849 }, { "epoch": 1.7238116692948862, "grad_norm": 0.5520975678556065, "learning_rate": 7.577215974732139e-06, "loss": 0.5323, "step": 3850 }, { "epoch": 1.7242622212209957, "grad_norm": 0.5404715948645429, "learning_rate": 7.572632794946543e-06, "loss": 0.5282, "step": 3851 }, { "epoch": 1.7247127731471052, "grad_norm": 0.5307186286342864, "learning_rate": 7.5680501569022295e-06, "loss": 0.5513, "step": 3852 }, { "epoch": 1.7251633250732148, "grad_norm": 0.5337059372669712, "learning_rate": 7.5634680616219455e-06, "loss": 0.5496, "step": 3853 }, { "epoch": 1.7256138769993243, "grad_norm": 0.5407161442856724, "learning_rate": 7.558886510128329e-06, "loss": 0.5289, "step": 3854 }, { "epoch": 1.7260644289254337, "grad_norm": 0.5249213949225815, "learning_rate": 7.554305503443893e-06, "loss": 0.5425, "step": 3855 }, { "epoch": 1.726514980851543, "grad_norm": 0.5174199125835544, "learning_rate": 7.5497250425910315e-06, "loss": 0.5063, "step": 3856 }, { "epoch": 1.7269655327776525, "grad_norm": 0.5562864680010038, "learning_rate": 7.545145128592009e-06, "loss": 0.5761, "step": 3857 }, { "epoch": 1.727416084703762, "grad_norm": 0.5061229691562327, "learning_rate": 7.540565762468981e-06, "loss": 0.5203, "step": 3858 }, { "epoch": 1.7278666366298716, "grad_norm": 0.5355751393594133, "learning_rate": 7.535986945243966e-06, "loss": 0.5496, "step": 3859 }, { "epoch": 1.7283171885559812, "grad_norm": 0.5200623498198553, "learning_rate": 7.531408677938874e-06, "loss": 0.5331, "step": 3860 }, { "epoch": 1.7287677404820907, "grad_norm": 0.5350054347636355, "learning_rate": 7.526830961575477e-06, "loss": 0.5656, "step": 3861 }, { "epoch": 1.7292182924082, "grad_norm": 0.50934289535413, "learning_rate": 7.5222537971754425e-06, "loss": 0.5286, "step": 3862 }, { "epoch": 1.7296688443343096, "grad_norm": 0.5294541444394959, "learning_rate": 7.517677185760295e-06, "loss": 0.5581, "step": 3863 }, { "epoch": 1.7301193962604189, "grad_norm": 0.5448423359234432, "learning_rate": 7.513101128351454e-06, "loss": 0.5318, "step": 3864 }, { "epoch": 1.7305699481865284, "grad_norm": 0.5311175758007844, "learning_rate": 7.5085256259702e-06, "loss": 0.577, "step": 3865 }, { "epoch": 1.731020500112638, "grad_norm": 0.5197469596580511, "learning_rate": 7.5039506796377014e-06, "loss": 0.5306, "step": 3866 }, { "epoch": 1.7314710520387475, "grad_norm": 0.5156604073556754, "learning_rate": 7.499376290374994e-06, "loss": 0.5431, "step": 3867 }, { "epoch": 1.731921603964857, "grad_norm": 0.5182908942283537, "learning_rate": 7.494802459202994e-06, "loss": 0.5107, "step": 3868 }, { "epoch": 1.7323721558909664, "grad_norm": 0.551159741995321, "learning_rate": 7.490229187142493e-06, "loss": 0.536, "step": 3869 }, { "epoch": 1.732822707817076, "grad_norm": 0.52415136115428, "learning_rate": 7.485656475214157e-06, "loss": 0.5273, "step": 3870 }, { "epoch": 1.7332732597431852, "grad_norm": 0.5292198126303775, "learning_rate": 7.481084324438521e-06, "loss": 0.5836, "step": 3871 }, { "epoch": 1.7337238116692948, "grad_norm": 0.5303606316210282, "learning_rate": 7.476512735836008e-06, "loss": 0.549, "step": 3872 }, { "epoch": 1.7341743635954043, "grad_norm": 0.5169259953915586, "learning_rate": 7.471941710426901e-06, "loss": 0.5353, "step": 3873 }, { "epoch": 1.7346249155215139, "grad_norm": 0.5117385808255849, "learning_rate": 7.467371249231371e-06, "loss": 0.5261, "step": 3874 }, { "epoch": 1.7350754674476234, "grad_norm": 0.5178979136807943, "learning_rate": 7.462801353269449e-06, "loss": 0.5607, "step": 3875 }, { "epoch": 1.735526019373733, "grad_norm": 0.5091800180418307, "learning_rate": 7.458232023561055e-06, "loss": 0.5529, "step": 3876 }, { "epoch": 1.7359765712998423, "grad_norm": 0.5361992984346409, "learning_rate": 7.453663261125967e-06, "loss": 0.5273, "step": 3877 }, { "epoch": 1.7364271232259518, "grad_norm": 0.5084268317240404, "learning_rate": 7.449095066983849e-06, "loss": 0.533, "step": 3878 }, { "epoch": 1.7368776751520612, "grad_norm": 0.5115409127327941, "learning_rate": 7.444527442154234e-06, "loss": 0.4849, "step": 3879 }, { "epoch": 1.7373282270781707, "grad_norm": 0.5395022934658218, "learning_rate": 7.439960387656524e-06, "loss": 0.5352, "step": 3880 }, { "epoch": 1.7377787790042802, "grad_norm": 0.528029673390764, "learning_rate": 7.43539390451e-06, "loss": 0.5285, "step": 3881 }, { "epoch": 1.7382293309303898, "grad_norm": 0.5309093979626799, "learning_rate": 7.430827993733809e-06, "loss": 0.5248, "step": 3882 }, { "epoch": 1.7386798828564993, "grad_norm": 0.5167368600999004, "learning_rate": 7.426262656346979e-06, "loss": 0.5559, "step": 3883 }, { "epoch": 1.7391304347826086, "grad_norm": 0.5271783079294903, "learning_rate": 7.421697893368396e-06, "loss": 0.5664, "step": 3884 }, { "epoch": 1.7395809867087182, "grad_norm": 0.5170396137538897, "learning_rate": 7.4171337058168365e-06, "loss": 0.5188, "step": 3885 }, { "epoch": 1.7395809867087182, "eval_loss": 0.6227230429649353, "eval_runtime": 24.3811, "eval_samples_per_second": 11.443, "eval_steps_per_second": 0.492, "step": 3885 }, { "epoch": 1.7400315386348275, "grad_norm": 0.5440732481694406, "learning_rate": 7.41257009471093e-06, "loss": 0.5265, "step": 3886 }, { "epoch": 1.740482090560937, "grad_norm": 0.5452318359948183, "learning_rate": 7.408007061069194e-06, "loss": 0.5118, "step": 3887 }, { "epoch": 1.7409326424870466, "grad_norm": 0.5301786898221359, "learning_rate": 7.4034446059100014e-06, "loss": 0.5672, "step": 3888 }, { "epoch": 1.7413831944131561, "grad_norm": 0.5312935699563265, "learning_rate": 7.398882730251613e-06, "loss": 0.5367, "step": 3889 }, { "epoch": 1.7418337463392657, "grad_norm": 0.548308411909161, "learning_rate": 7.394321435112142e-06, "loss": 0.5274, "step": 3890 }, { "epoch": 1.7422842982653752, "grad_norm": 0.5313953244810578, "learning_rate": 7.389760721509587e-06, "loss": 0.5166, "step": 3891 }, { "epoch": 1.7427348501914846, "grad_norm": 0.5171580196334215, "learning_rate": 7.385200590461803e-06, "loss": 0.529, "step": 3892 }, { "epoch": 1.743185402117594, "grad_norm": 0.5039887928700829, "learning_rate": 7.380641042986533e-06, "loss": 0.4999, "step": 3893 }, { "epoch": 1.7436359540437034, "grad_norm": 0.5366206184775083, "learning_rate": 7.37608208010137e-06, "loss": 0.5472, "step": 3894 }, { "epoch": 1.744086505969813, "grad_norm": 0.5406487679012586, "learning_rate": 7.371523702823795e-06, "loss": 0.5521, "step": 3895 }, { "epoch": 1.7445370578959225, "grad_norm": 0.5062376108814746, "learning_rate": 7.36696591217114e-06, "loss": 0.4992, "step": 3896 }, { "epoch": 1.744987609822032, "grad_norm": 0.534313500972052, "learning_rate": 7.362408709160626e-06, "loss": 0.5578, "step": 3897 }, { "epoch": 1.7454381617481416, "grad_norm": 0.5020336884088875, "learning_rate": 7.3578520948093234e-06, "loss": 0.5179, "step": 3898 }, { "epoch": 1.745888713674251, "grad_norm": 0.4972747436685379, "learning_rate": 7.353296070134186e-06, "loss": 0.5404, "step": 3899 }, { "epoch": 1.7463392656003605, "grad_norm": 0.553808379836788, "learning_rate": 7.348740636152024e-06, "loss": 0.5481, "step": 3900 }, { "epoch": 1.7467898175264698, "grad_norm": 0.5245073431902474, "learning_rate": 7.344185793879531e-06, "loss": 0.5371, "step": 3901 }, { "epoch": 1.7472403694525793, "grad_norm": 0.5271910926149302, "learning_rate": 7.33963154433325e-06, "loss": 0.5461, "step": 3902 }, { "epoch": 1.7476909213786889, "grad_norm": 0.523571042198563, "learning_rate": 7.335077888529607e-06, "loss": 0.5271, "step": 3903 }, { "epoch": 1.7481414733047984, "grad_norm": 0.49572944760708787, "learning_rate": 7.3305248274848865e-06, "loss": 0.5297, "step": 3904 }, { "epoch": 1.748592025230908, "grad_norm": 0.5526787875354806, "learning_rate": 7.325972362215248e-06, "loss": 0.504, "step": 3905 }, { "epoch": 1.7490425771570175, "grad_norm": 0.5206912335180309, "learning_rate": 7.321420493736705e-06, "loss": 0.5292, "step": 3906 }, { "epoch": 1.7494931290831268, "grad_norm": 0.5344553257606215, "learning_rate": 7.316869223065156e-06, "loss": 0.5182, "step": 3907 }, { "epoch": 1.7499436810092364, "grad_norm": 0.515024669840133, "learning_rate": 7.312318551216348e-06, "loss": 0.5288, "step": 3908 }, { "epoch": 1.7503942329353457, "grad_norm": 0.5104385988140321, "learning_rate": 7.30776847920591e-06, "loss": 0.5463, "step": 3909 }, { "epoch": 1.7508447848614552, "grad_norm": 0.5244771444488222, "learning_rate": 7.303219008049323e-06, "loss": 0.5404, "step": 3910 }, { "epoch": 1.7512953367875648, "grad_norm": 0.5414470947700809, "learning_rate": 7.298670138761948e-06, "loss": 0.5455, "step": 3911 }, { "epoch": 1.7517458887136743, "grad_norm": 0.5018787192716667, "learning_rate": 7.2941218723589945e-06, "loss": 0.4885, "step": 3912 }, { "epoch": 1.7521964406397839, "grad_norm": 0.5359344855314724, "learning_rate": 7.28957420985556e-06, "loss": 0.5115, "step": 3913 }, { "epoch": 1.7526469925658932, "grad_norm": 0.5163053893816357, "learning_rate": 7.285027152266584e-06, "loss": 0.4871, "step": 3914 }, { "epoch": 1.7530975444920027, "grad_norm": 0.5154112323543767, "learning_rate": 7.280480700606887e-06, "loss": 0.5381, "step": 3915 }, { "epoch": 1.753548096418112, "grad_norm": 0.5564162684820447, "learning_rate": 7.2759348558911456e-06, "loss": 0.5474, "step": 3916 }, { "epoch": 1.7539986483442216, "grad_norm": 0.5114673618351825, "learning_rate": 7.271389619133908e-06, "loss": 0.4919, "step": 3917 }, { "epoch": 1.7544492002703311, "grad_norm": 0.525237127395142, "learning_rate": 7.266844991349577e-06, "loss": 0.5511, "step": 3918 }, { "epoch": 1.7548997521964407, "grad_norm": 0.5577980814941476, "learning_rate": 7.262300973552434e-06, "loss": 0.5548, "step": 3919 }, { "epoch": 1.7553503041225502, "grad_norm": 0.540536795411842, "learning_rate": 7.257757566756604e-06, "loss": 0.5045, "step": 3920 }, { "epoch": 1.7558008560486598, "grad_norm": 0.5240088851316068, "learning_rate": 7.2532147719761e-06, "loss": 0.5156, "step": 3921 }, { "epoch": 1.756251407974769, "grad_norm": 0.5156411943940399, "learning_rate": 7.248672590224776e-06, "loss": 0.5611, "step": 3922 }, { "epoch": 1.7567019599008786, "grad_norm": 0.5403086713530852, "learning_rate": 7.244131022516366e-06, "loss": 0.5299, "step": 3923 }, { "epoch": 1.757152511826988, "grad_norm": 0.5254473737351643, "learning_rate": 7.239590069864451e-06, "loss": 0.5527, "step": 3924 }, { "epoch": 1.7576030637530975, "grad_norm": 0.5324122598589817, "learning_rate": 7.235049733282493e-06, "loss": 0.5158, "step": 3925 }, { "epoch": 1.758053615679207, "grad_norm": 0.5546974462049911, "learning_rate": 7.2305100137838005e-06, "loss": 0.5689, "step": 3926 }, { "epoch": 1.7585041676053166, "grad_norm": 0.5356428676146094, "learning_rate": 7.225970912381557e-06, "loss": 0.5197, "step": 3927 }, { "epoch": 1.7589547195314261, "grad_norm": 0.5183266154735612, "learning_rate": 7.2214324300887954e-06, "loss": 0.5527, "step": 3928 }, { "epoch": 1.7594052714575354, "grad_norm": 0.5323729840984177, "learning_rate": 7.216894567918416e-06, "loss": 0.5442, "step": 3929 }, { "epoch": 1.759855823383645, "grad_norm": 0.5423274605259216, "learning_rate": 7.212357326883191e-06, "loss": 0.5401, "step": 3930 }, { "epoch": 1.7603063753097543, "grad_norm": 0.5085105450774289, "learning_rate": 7.207820707995735e-06, "loss": 0.5363, "step": 3931 }, { "epoch": 1.7607569272358639, "grad_norm": 0.504051379088344, "learning_rate": 7.203284712268541e-06, "loss": 0.5231, "step": 3932 }, { "epoch": 1.7612074791619734, "grad_norm": 0.5145517851390758, "learning_rate": 7.198749340713946e-06, "loss": 0.4871, "step": 3933 }, { "epoch": 1.761658031088083, "grad_norm": 0.5426318738704529, "learning_rate": 7.194214594344169e-06, "loss": 0.5525, "step": 3934 }, { "epoch": 1.7621085830141925, "grad_norm": 0.5405591326618195, "learning_rate": 7.189680474171266e-06, "loss": 0.5258, "step": 3935 }, { "epoch": 1.762559134940302, "grad_norm": 0.5022539008612757, "learning_rate": 7.185146981207172e-06, "loss": 0.4854, "step": 3936 }, { "epoch": 1.7630096868664114, "grad_norm": 0.5279238206920115, "learning_rate": 7.180614116463671e-06, "loss": 0.5356, "step": 3937 }, { "epoch": 1.7634602387925207, "grad_norm": 0.5550248472990191, "learning_rate": 7.1760818809524124e-06, "loss": 0.5556, "step": 3938 }, { "epoch": 1.7639107907186302, "grad_norm": 0.5597302150857215, "learning_rate": 7.171550275684902e-06, "loss": 0.5295, "step": 3939 }, { "epoch": 1.7643613426447398, "grad_norm": 0.5202167256767235, "learning_rate": 7.167019301672508e-06, "loss": 0.5525, "step": 3940 }, { "epoch": 1.7648118945708493, "grad_norm": 0.4985883779671033, "learning_rate": 7.16248895992645e-06, "loss": 0.5245, "step": 3941 }, { "epoch": 1.7652624464969588, "grad_norm": 0.5294541937902085, "learning_rate": 7.1579592514578234e-06, "loss": 0.504, "step": 3942 }, { "epoch": 1.7657129984230684, "grad_norm": 0.5632054791292539, "learning_rate": 7.15343017727756e-06, "loss": 0.6078, "step": 3943 }, { "epoch": 1.7661635503491777, "grad_norm": 0.5269844256040542, "learning_rate": 7.1489017383964695e-06, "loss": 0.5169, "step": 3944 }, { "epoch": 1.7666141022752873, "grad_norm": 0.5103309555182386, "learning_rate": 7.144373935825204e-06, "loss": 0.507, "step": 3945 }, { "epoch": 1.7670646542013966, "grad_norm": 0.5381161086600103, "learning_rate": 7.1398467705742916e-06, "loss": 0.4879, "step": 3946 }, { "epoch": 1.7675152061275061, "grad_norm": 0.534200172942757, "learning_rate": 7.135320243654096e-06, "loss": 0.5313, "step": 3947 }, { "epoch": 1.7679657580536157, "grad_norm": 0.5294427538115308, "learning_rate": 7.13079435607486e-06, "loss": 0.5403, "step": 3948 }, { "epoch": 1.7684163099797252, "grad_norm": 0.53340858811451, "learning_rate": 7.126269108846669e-06, "loss": 0.5408, "step": 3949 }, { "epoch": 1.7688668619058348, "grad_norm": 0.5264708825027162, "learning_rate": 7.12174450297947e-06, "loss": 0.5385, "step": 3950 }, { "epoch": 1.7693174138319443, "grad_norm": 0.531029156030685, "learning_rate": 7.117220539483068e-06, "loss": 0.5428, "step": 3951 }, { "epoch": 1.7697679657580536, "grad_norm": 0.5202938764833429, "learning_rate": 7.1126972193671285e-06, "loss": 0.5223, "step": 3952 }, { "epoch": 1.770218517684163, "grad_norm": 0.5081743260075204, "learning_rate": 7.108174543641159e-06, "loss": 0.5454, "step": 3953 }, { "epoch": 1.7706690696102725, "grad_norm": 0.5321424157336988, "learning_rate": 7.103652513314543e-06, "loss": 0.5334, "step": 3954 }, { "epoch": 1.771119621536382, "grad_norm": 0.5077221911813867, "learning_rate": 7.099131129396501e-06, "loss": 0.5315, "step": 3955 }, { "epoch": 1.7715701734624916, "grad_norm": 0.5272905940546582, "learning_rate": 7.094610392896129e-06, "loss": 0.4941, "step": 3956 }, { "epoch": 1.7720207253886011, "grad_norm": 0.550413578471038, "learning_rate": 7.090090304822356e-06, "loss": 0.5019, "step": 3957 }, { "epoch": 1.7724712773147107, "grad_norm": 0.5244284541954755, "learning_rate": 7.085570866183989e-06, "loss": 0.5382, "step": 3958 }, { "epoch": 1.77292182924082, "grad_norm": 0.5200603912147272, "learning_rate": 7.081052077989668e-06, "loss": 0.5018, "step": 3959 }, { "epoch": 1.7733723811669295, "grad_norm": 0.5327620749619, "learning_rate": 7.076533941247907e-06, "loss": 0.5318, "step": 3960 }, { "epoch": 1.7738229330930388, "grad_norm": 0.5405346562725658, "learning_rate": 7.072016456967062e-06, "loss": 0.5371, "step": 3961 }, { "epoch": 1.7742734850191484, "grad_norm": 0.527578417434621, "learning_rate": 7.067499626155354e-06, "loss": 0.5424, "step": 3962 }, { "epoch": 1.774724036945258, "grad_norm": 0.5166970285847761, "learning_rate": 7.0629834498208465e-06, "loss": 0.5392, "step": 3963 }, { "epoch": 1.7751745888713675, "grad_norm": 0.5261750410613663, "learning_rate": 7.058467928971462e-06, "loss": 0.5165, "step": 3964 }, { "epoch": 1.775625140797477, "grad_norm": 0.5560583341133409, "learning_rate": 7.05395306461498e-06, "loss": 0.5557, "step": 3965 }, { "epoch": 1.7760756927235863, "grad_norm": 0.513165910365131, "learning_rate": 7.04943885775903e-06, "loss": 0.561, "step": 3966 }, { "epoch": 1.7765262446496959, "grad_norm": 0.57315252573975, "learning_rate": 7.0449253094110925e-06, "loss": 0.5314, "step": 3967 }, { "epoch": 1.7769767965758052, "grad_norm": 0.546792923457531, "learning_rate": 7.040412420578511e-06, "loss": 0.5167, "step": 3968 }, { "epoch": 1.7774273485019148, "grad_norm": 0.5200376544047415, "learning_rate": 7.035900192268464e-06, "loss": 0.5286, "step": 3969 }, { "epoch": 1.7778779004280243, "grad_norm": 0.5460803425854782, "learning_rate": 7.031388625488006e-06, "loss": 0.5429, "step": 3970 }, { "epoch": 1.7783284523541338, "grad_norm": 0.5519673295205539, "learning_rate": 7.026877721244019e-06, "loss": 0.557, "step": 3971 }, { "epoch": 1.7787790042802434, "grad_norm": 0.5535112441061484, "learning_rate": 7.0223674805432595e-06, "loss": 0.533, "step": 3972 }, { "epoch": 1.779229556206353, "grad_norm": 0.5284287779805338, "learning_rate": 7.017857904392318e-06, "loss": 0.4965, "step": 3973 }, { "epoch": 1.7796801081324622, "grad_norm": 0.5181672291294565, "learning_rate": 7.0133489937976505e-06, "loss": 0.5205, "step": 3974 }, { "epoch": 1.7801306600585718, "grad_norm": 0.5601014514436871, "learning_rate": 7.008840749765555e-06, "loss": 0.546, "step": 3975 }, { "epoch": 1.7805812119846811, "grad_norm": 0.5361439011916986, "learning_rate": 7.004333173302185e-06, "loss": 0.5258, "step": 3976 }, { "epoch": 1.7810317639107907, "grad_norm": 0.49482252655296477, "learning_rate": 6.999826265413542e-06, "loss": 0.5231, "step": 3977 }, { "epoch": 1.7814823158369002, "grad_norm": 0.5259403023151201, "learning_rate": 6.995320027105481e-06, "loss": 0.5571, "step": 3978 }, { "epoch": 1.7819328677630097, "grad_norm": 0.5441120183979418, "learning_rate": 6.990814459383713e-06, "loss": 0.5418, "step": 3979 }, { "epoch": 1.7823834196891193, "grad_norm": 0.5416573317340129, "learning_rate": 6.986309563253783e-06, "loss": 0.5278, "step": 3980 }, { "epoch": 1.7828339716152286, "grad_norm": 0.5095514756879153, "learning_rate": 6.981805339721107e-06, "loss": 0.5512, "step": 3981 }, { "epoch": 1.7832845235413382, "grad_norm": 0.5214357520888969, "learning_rate": 6.977301789790931e-06, "loss": 0.5614, "step": 3982 }, { "epoch": 1.7837350754674475, "grad_norm": 0.5304936903134484, "learning_rate": 6.972798914468369e-06, "loss": 0.4934, "step": 3983 }, { "epoch": 1.784185627393557, "grad_norm": 0.5489373718663634, "learning_rate": 6.968296714758369e-06, "loss": 0.5583, "step": 3984 }, { "epoch": 1.7846361793196666, "grad_norm": 0.5384161348843884, "learning_rate": 6.963795191665737e-06, "loss": 0.5519, "step": 3985 }, { "epoch": 1.785086731245776, "grad_norm": 0.533148906155829, "learning_rate": 6.9592943461951226e-06, "loss": 0.5229, "step": 3986 }, { "epoch": 1.7855372831718856, "grad_norm": 0.5044495610434093, "learning_rate": 6.954794179351033e-06, "loss": 0.5087, "step": 3987 }, { "epoch": 1.7859878350979952, "grad_norm": 0.5106758033926164, "learning_rate": 6.950294692137811e-06, "loss": 0.5465, "step": 3988 }, { "epoch": 1.7864383870241045, "grad_norm": 0.5170015136695093, "learning_rate": 6.945795885559663e-06, "loss": 0.534, "step": 3989 }, { "epoch": 1.786888938950214, "grad_norm": 0.5381395397714187, "learning_rate": 6.941297760620626e-06, "loss": 0.5194, "step": 3990 }, { "epoch": 1.7873394908763234, "grad_norm": 0.5043364119681097, "learning_rate": 6.936800318324603e-06, "loss": 0.5301, "step": 3991 }, { "epoch": 1.787790042802433, "grad_norm": 0.5373470106756605, "learning_rate": 6.93230355967533e-06, "loss": 0.5536, "step": 3992 }, { "epoch": 1.7882405947285425, "grad_norm": 0.504834608068346, "learning_rate": 6.9278074856764015e-06, "loss": 0.5287, "step": 3993 }, { "epoch": 1.788691146654652, "grad_norm": 0.5209447549917043, "learning_rate": 6.923312097331247e-06, "loss": 0.5481, "step": 3994 }, { "epoch": 1.7891416985807616, "grad_norm": 0.5214053053410581, "learning_rate": 6.918817395643158e-06, "loss": 0.5143, "step": 3995 }, { "epoch": 1.7895922505068709, "grad_norm": 0.5308318619722173, "learning_rate": 6.914323381615255e-06, "loss": 0.538, "step": 3996 }, { "epoch": 1.7900428024329804, "grad_norm": 0.5452554514139346, "learning_rate": 6.909830056250527e-06, "loss": 0.5412, "step": 3997 }, { "epoch": 1.7904933543590897, "grad_norm": 0.5258450108855027, "learning_rate": 6.905337420551787e-06, "loss": 0.5595, "step": 3998 }, { "epoch": 1.7909439062851993, "grad_norm": 0.5260714751710097, "learning_rate": 6.900845475521709e-06, "loss": 0.515, "step": 3999 }, { "epoch": 1.7913944582113088, "grad_norm": 0.5199544274777259, "learning_rate": 6.8963542221628056e-06, "loss": 0.5231, "step": 4000 }, { "epoch": 1.7918450101374184, "grad_norm": 0.5140359216970236, "learning_rate": 6.8918636614774405e-06, "loss": 0.5263, "step": 4001 }, { "epoch": 1.792295562063528, "grad_norm": 0.49752885680567677, "learning_rate": 6.8873737944678145e-06, "loss": 0.5251, "step": 4002 }, { "epoch": 1.7927461139896375, "grad_norm": 0.5100493752715257, "learning_rate": 6.882884622135985e-06, "loss": 0.523, "step": 4003 }, { "epoch": 1.7931966659157468, "grad_norm": 0.5224749744445741, "learning_rate": 6.878396145483841e-06, "loss": 0.5451, "step": 4004 }, { "epoch": 1.7936472178418563, "grad_norm": 0.529232718312547, "learning_rate": 6.8739083655131335e-06, "loss": 0.5565, "step": 4005 }, { "epoch": 1.7940977697679656, "grad_norm": 0.5486250974597119, "learning_rate": 6.869421283225437e-06, "loss": 0.5232, "step": 4006 }, { "epoch": 1.7945483216940752, "grad_norm": 0.5270014069451222, "learning_rate": 6.864934899622191e-06, "loss": 0.5112, "step": 4007 }, { "epoch": 1.7949988736201847, "grad_norm": 0.5172175871432175, "learning_rate": 6.8604492157046585e-06, "loss": 0.514, "step": 4008 }, { "epoch": 1.7954494255462943, "grad_norm": 0.5096795580480574, "learning_rate": 6.855964232473969e-06, "loss": 0.5318, "step": 4009 }, { "epoch": 1.7958999774724038, "grad_norm": 0.5153073889585288, "learning_rate": 6.851479950931077e-06, "loss": 0.5385, "step": 4010 }, { "epoch": 1.7963505293985131, "grad_norm": 0.5213863225343491, "learning_rate": 6.846996372076786e-06, "loss": 0.5646, "step": 4011 }, { "epoch": 1.7968010813246227, "grad_norm": 0.5339345877090194, "learning_rate": 6.842513496911747e-06, "loss": 0.5653, "step": 4012 }, { "epoch": 1.797251633250732, "grad_norm": 0.5105677248704324, "learning_rate": 6.838031326436453e-06, "loss": 0.4989, "step": 4013 }, { "epoch": 1.7977021851768415, "grad_norm": 0.49553207249718323, "learning_rate": 6.833549861651229e-06, "loss": 0.5236, "step": 4014 }, { "epoch": 1.798152737102951, "grad_norm": 0.5169808181420726, "learning_rate": 6.829069103556261e-06, "loss": 0.5248, "step": 4015 }, { "epoch": 1.7986032890290606, "grad_norm": 0.536729733637095, "learning_rate": 6.824589053151558e-06, "loss": 0.5793, "step": 4016 }, { "epoch": 1.7990538409551702, "grad_norm": 0.521595149900934, "learning_rate": 6.820109711436989e-06, "loss": 0.5032, "step": 4017 }, { "epoch": 1.7995043928812797, "grad_norm": 0.5119788513566599, "learning_rate": 6.815631079412249e-06, "loss": 0.5213, "step": 4018 }, { "epoch": 1.799954944807389, "grad_norm": 0.5345912527566853, "learning_rate": 6.8111531580768875e-06, "loss": 0.5249, "step": 4019 }, { "epoch": 1.8004054967334986, "grad_norm": 0.5442343510470697, "learning_rate": 6.806675948430283e-06, "loss": 0.587, "step": 4020 }, { "epoch": 1.800856048659608, "grad_norm": 0.5074953199443938, "learning_rate": 6.802199451471672e-06, "loss": 0.5549, "step": 4021 }, { "epoch": 1.8013066005857175, "grad_norm": 0.5362591555182146, "learning_rate": 6.797723668200113e-06, "loss": 0.5596, "step": 4022 }, { "epoch": 1.801757152511827, "grad_norm": 0.5108831653386998, "learning_rate": 6.793248599614517e-06, "loss": 0.5439, "step": 4023 }, { "epoch": 1.8022077044379365, "grad_norm": 0.5302279533994008, "learning_rate": 6.788774246713632e-06, "loss": 0.537, "step": 4024 }, { "epoch": 1.802658256364046, "grad_norm": 0.5112508640678439, "learning_rate": 6.784300610496049e-06, "loss": 0.5265, "step": 4025 }, { "epoch": 1.8031088082901554, "grad_norm": 0.5111100841856067, "learning_rate": 6.77982769196019e-06, "loss": 0.5332, "step": 4026 }, { "epoch": 1.803559360216265, "grad_norm": 0.5087468452511041, "learning_rate": 6.775355492104329e-06, "loss": 0.5403, "step": 4027 }, { "epoch": 1.8040099121423743, "grad_norm": 0.522123938547562, "learning_rate": 6.77088401192658e-06, "loss": 0.536, "step": 4028 }, { "epoch": 1.8044604640684838, "grad_norm": 0.5255244086097868, "learning_rate": 6.766413252424878e-06, "loss": 0.5566, "step": 4029 }, { "epoch": 1.8049110159945934, "grad_norm": 0.4890809497026346, "learning_rate": 6.761943214597022e-06, "loss": 0.5511, "step": 4030 }, { "epoch": 1.805361567920703, "grad_norm": 0.5401862847716621, "learning_rate": 6.757473899440626e-06, "loss": 0.5707, "step": 4031 }, { "epoch": 1.8058121198468124, "grad_norm": 0.5241528127817149, "learning_rate": 6.7530053079531664e-06, "loss": 0.5578, "step": 4032 }, { "epoch": 1.806262671772922, "grad_norm": 0.5082009595564794, "learning_rate": 6.748537441131937e-06, "loss": 0.515, "step": 4033 }, { "epoch": 1.8067132236990313, "grad_norm": 0.486817499140271, "learning_rate": 6.744070299974082e-06, "loss": 0.5159, "step": 4034 }, { "epoch": 1.8071637756251406, "grad_norm": 0.5130827404357955, "learning_rate": 6.7396038854765825e-06, "loss": 0.5407, "step": 4035 }, { "epoch": 1.8076143275512502, "grad_norm": 0.5309394826233012, "learning_rate": 6.7351381986362555e-06, "loss": 0.5307, "step": 4036 }, { "epoch": 1.8080648794773597, "grad_norm": 0.526376033789589, "learning_rate": 6.7306732404497475e-06, "loss": 0.5458, "step": 4037 }, { "epoch": 1.8085154314034693, "grad_norm": 0.5554191725556847, "learning_rate": 6.726209011913565e-06, "loss": 0.5586, "step": 4038 }, { "epoch": 1.8089659833295788, "grad_norm": 0.5073530819955835, "learning_rate": 6.721745514024023e-06, "loss": 0.4833, "step": 4039 }, { "epoch": 1.8094165352556884, "grad_norm": 0.5183025164723851, "learning_rate": 6.717282747777299e-06, "loss": 0.4955, "step": 4040 }, { "epoch": 1.8098670871817977, "grad_norm": 0.5276540716655383, "learning_rate": 6.712820714169386e-06, "loss": 0.5047, "step": 4041 }, { "epoch": 1.8103176391079072, "grad_norm": 0.5435206713143177, "learning_rate": 6.7083594141961326e-06, "loss": 0.5299, "step": 4042 }, { "epoch": 1.8107681910340165, "grad_norm": 0.5334597166856744, "learning_rate": 6.703898848853207e-06, "loss": 0.5434, "step": 4043 }, { "epoch": 1.811218742960126, "grad_norm": 0.5209346643010742, "learning_rate": 6.699439019136127e-06, "loss": 0.542, "step": 4044 }, { "epoch": 1.8116692948862356, "grad_norm": 0.5184350244711176, "learning_rate": 6.694979926040237e-06, "loss": 0.5215, "step": 4045 }, { "epoch": 1.8121198468123452, "grad_norm": 0.5212683102359659, "learning_rate": 6.690521570560717e-06, "loss": 0.5353, "step": 4046 }, { "epoch": 1.8125703987384547, "grad_norm": 0.5158794810808665, "learning_rate": 6.686063953692591e-06, "loss": 0.531, "step": 4047 }, { "epoch": 1.8130209506645643, "grad_norm": 0.5308516986106051, "learning_rate": 6.6816070764307115e-06, "loss": 0.546, "step": 4048 }, { "epoch": 1.8134715025906736, "grad_norm": 0.5246966458295591, "learning_rate": 6.677150939769761e-06, "loss": 0.5305, "step": 4049 }, { "epoch": 1.813922054516783, "grad_norm": 0.5327828210071609, "learning_rate": 6.672695544704273e-06, "loss": 0.581, "step": 4050 }, { "epoch": 1.8143726064428924, "grad_norm": 0.5238225757780866, "learning_rate": 6.668240892228594e-06, "loss": 0.5343, "step": 4051 }, { "epoch": 1.814823158369002, "grad_norm": 0.5477590254538508, "learning_rate": 6.663786983336928e-06, "loss": 0.5518, "step": 4052 }, { "epoch": 1.8152737102951115, "grad_norm": 0.5355644942339387, "learning_rate": 6.659333819023291e-06, "loss": 0.5308, "step": 4053 }, { "epoch": 1.815724262221221, "grad_norm": 0.5443637266447061, "learning_rate": 6.654881400281548e-06, "loss": 0.5426, "step": 4054 }, { "epoch": 1.8161748141473306, "grad_norm": 0.5125298069468534, "learning_rate": 6.65042972810539e-06, "loss": 0.5481, "step": 4055 }, { "epoch": 1.81662536607344, "grad_norm": 0.5110367452095564, "learning_rate": 6.6459788034883465e-06, "loss": 0.5299, "step": 4056 }, { "epoch": 1.8170759179995495, "grad_norm": 0.5381461513600803, "learning_rate": 6.6415286274237744e-06, "loss": 0.5334, "step": 4057 }, { "epoch": 1.8175264699256588, "grad_norm": 0.508114371710363, "learning_rate": 6.637079200904872e-06, "loss": 0.522, "step": 4058 }, { "epoch": 1.8179770218517683, "grad_norm": 0.515660810818144, "learning_rate": 6.632630524924659e-06, "loss": 0.5688, "step": 4059 }, { "epoch": 1.818427573777878, "grad_norm": 0.5179817110751368, "learning_rate": 6.6281826004759985e-06, "loss": 0.5257, "step": 4060 }, { "epoch": 1.8188781257039874, "grad_norm": 0.5026701727434156, "learning_rate": 6.623735428551574e-06, "loss": 0.5393, "step": 4061 }, { "epoch": 1.819328677630097, "grad_norm": 0.5141238832142825, "learning_rate": 6.619289010143918e-06, "loss": 0.5352, "step": 4062 }, { "epoch": 1.8197792295562063, "grad_norm": 0.5234369293130695, "learning_rate": 6.6148433462453745e-06, "loss": 0.5379, "step": 4063 }, { "epoch": 1.8202297814823158, "grad_norm": 0.5211349870558837, "learning_rate": 6.610398437848138e-06, "loss": 0.5236, "step": 4064 }, { "epoch": 1.8206803334084252, "grad_norm": 0.507948319180586, "learning_rate": 6.605954285944218e-06, "loss": 0.5395, "step": 4065 }, { "epoch": 1.8211308853345347, "grad_norm": 0.5113717494274441, "learning_rate": 6.601510891525471e-06, "loss": 0.5055, "step": 4066 }, { "epoch": 1.8215814372606443, "grad_norm": 0.5368211078425392, "learning_rate": 6.59706825558357e-06, "loss": 0.52, "step": 4067 }, { "epoch": 1.8220319891867538, "grad_norm": 0.5065058009881835, "learning_rate": 6.592626379110031e-06, "loss": 0.552, "step": 4068 }, { "epoch": 1.8224825411128633, "grad_norm": 0.5288033712382583, "learning_rate": 6.588185263096188e-06, "loss": 0.5374, "step": 4069 }, { "epoch": 1.8229330930389729, "grad_norm": 0.5186197939840366, "learning_rate": 6.583744908533218e-06, "loss": 0.5424, "step": 4070 }, { "epoch": 1.8233836449650822, "grad_norm": 0.519456328958029, "learning_rate": 6.579305316412119e-06, "loss": 0.5582, "step": 4071 }, { "epoch": 1.8238341968911918, "grad_norm": 0.5140241005950188, "learning_rate": 6.5748664877237215e-06, "loss": 0.5439, "step": 4072 }, { "epoch": 1.824284748817301, "grad_norm": 0.5101001500470452, "learning_rate": 6.570428423458687e-06, "loss": 0.543, "step": 4073 }, { "epoch": 1.8247353007434106, "grad_norm": 0.5209620975476865, "learning_rate": 6.565991124607507e-06, "loss": 0.536, "step": 4074 }, { "epoch": 1.8251858526695202, "grad_norm": 0.5169225417839468, "learning_rate": 6.561554592160494e-06, "loss": 0.5298, "step": 4075 }, { "epoch": 1.8256364045956297, "grad_norm": 0.5194433397701859, "learning_rate": 6.5571188271078045e-06, "loss": 0.5451, "step": 4076 }, { "epoch": 1.8260869565217392, "grad_norm": 0.5315706073034845, "learning_rate": 6.552683830439408e-06, "loss": 0.5307, "step": 4077 }, { "epoch": 1.8265375084478486, "grad_norm": 0.5315385190704505, "learning_rate": 6.5482496031451136e-06, "loss": 0.534, "step": 4078 }, { "epoch": 1.8269880603739581, "grad_norm": 0.5168137263852799, "learning_rate": 6.5438161462145575e-06, "loss": 0.5607, "step": 4079 }, { "epoch": 1.8274386123000674, "grad_norm": 0.52946175640545, "learning_rate": 6.539383460637197e-06, "loss": 0.5536, "step": 4080 }, { "epoch": 1.827889164226177, "grad_norm": 0.5224733979705517, "learning_rate": 6.534951547402322e-06, "loss": 0.5391, "step": 4081 }, { "epoch": 1.8283397161522865, "grad_norm": 0.5449927914191133, "learning_rate": 6.530520407499049e-06, "loss": 0.5346, "step": 4082 }, { "epoch": 1.828790268078396, "grad_norm": 0.5240084708389381, "learning_rate": 6.526090041916327e-06, "loss": 0.5459, "step": 4083 }, { "epoch": 1.8292408200045056, "grad_norm": 0.5236462569776389, "learning_rate": 6.5216604516429196e-06, "loss": 0.5066, "step": 4084 }, { "epoch": 1.8296913719306152, "grad_norm": 0.5480296287858825, "learning_rate": 6.517231637667435e-06, "loss": 0.4984, "step": 4085 }, { "epoch": 1.8301419238567245, "grad_norm": 0.5270436169324527, "learning_rate": 6.512803600978289e-06, "loss": 0.5371, "step": 4086 }, { "epoch": 1.830592475782834, "grad_norm": 0.5319356003361234, "learning_rate": 6.508376342563742e-06, "loss": 0.5145, "step": 4087 }, { "epoch": 1.8310430277089433, "grad_norm": 0.5349866904900092, "learning_rate": 6.503949863411866e-06, "loss": 0.5367, "step": 4088 }, { "epoch": 1.8314935796350529, "grad_norm": 0.5354115122748369, "learning_rate": 6.499524164510571e-06, "loss": 0.4973, "step": 4089 }, { "epoch": 1.8319441315611624, "grad_norm": 0.5416294517411173, "learning_rate": 6.495099246847578e-06, "loss": 0.5318, "step": 4090 }, { "epoch": 1.832394683487272, "grad_norm": 0.5122602990159478, "learning_rate": 6.490675111410455e-06, "loss": 0.5529, "step": 4091 }, { "epoch": 1.8328452354133815, "grad_norm": 0.5570983246185117, "learning_rate": 6.486251759186573e-06, "loss": 0.5588, "step": 4092 }, { "epoch": 1.8332957873394908, "grad_norm": 0.5600482951341302, "learning_rate": 6.4818291911631445e-06, "loss": 0.5239, "step": 4093 }, { "epoch": 1.8337463392656004, "grad_norm": 0.5180837150326034, "learning_rate": 6.477407408327198e-06, "loss": 0.5343, "step": 4094 }, { "epoch": 1.8341968911917097, "grad_norm": 0.5474098536793135, "learning_rate": 6.47298641166559e-06, "loss": 0.5385, "step": 4095 }, { "epoch": 1.8346474431178192, "grad_norm": 0.5440375012096891, "learning_rate": 6.4685662021650005e-06, "loss": 0.5573, "step": 4096 }, { "epoch": 1.8350979950439288, "grad_norm": 0.5389866188433617, "learning_rate": 6.464146780811938e-06, "loss": 0.5601, "step": 4097 }, { "epoch": 1.8355485469700383, "grad_norm": 0.5468322969223023, "learning_rate": 6.459728148592727e-06, "loss": 0.5679, "step": 4098 }, { "epoch": 1.8359990988961479, "grad_norm": 0.5372449792832905, "learning_rate": 6.455310306493525e-06, "loss": 0.5159, "step": 4099 }, { "epoch": 1.8364496508222574, "grad_norm": 0.5430926306502617, "learning_rate": 6.450893255500303e-06, "loss": 0.5415, "step": 4100 }, { "epoch": 1.8369002027483667, "grad_norm": 0.560050432663696, "learning_rate": 6.4464769965988676e-06, "loss": 0.5174, "step": 4101 }, { "epoch": 1.8373507546744763, "grad_norm": 0.5281785438683625, "learning_rate": 6.442061530774835e-06, "loss": 0.5177, "step": 4102 }, { "epoch": 1.8378013066005856, "grad_norm": 0.5342580056375265, "learning_rate": 6.43764685901366e-06, "loss": 0.5442, "step": 4103 }, { "epoch": 1.8382518585266951, "grad_norm": 0.535467214012743, "learning_rate": 6.433232982300604e-06, "loss": 0.5856, "step": 4104 }, { "epoch": 1.8387024104528047, "grad_norm": 0.511398315029172, "learning_rate": 6.4288199016207645e-06, "loss": 0.5189, "step": 4105 }, { "epoch": 1.8391529623789142, "grad_norm": 0.5173482155723556, "learning_rate": 6.424407617959052e-06, "loss": 0.4975, "step": 4106 }, { "epoch": 1.8396035143050238, "grad_norm": 0.5287753837649688, "learning_rate": 6.419996132300203e-06, "loss": 0.5409, "step": 4107 }, { "epoch": 1.840054066231133, "grad_norm": 0.5076833444550497, "learning_rate": 6.415585445628776e-06, "loss": 0.5337, "step": 4108 }, { "epoch": 1.8405046181572426, "grad_norm": 0.5593151668683135, "learning_rate": 6.411175558929152e-06, "loss": 0.5774, "step": 4109 }, { "epoch": 1.840955170083352, "grad_norm": 0.5159619510646013, "learning_rate": 6.406766473185528e-06, "loss": 0.5079, "step": 4110 }, { "epoch": 1.8414057220094615, "grad_norm": 0.524898323682833, "learning_rate": 6.4023581893819345e-06, "loss": 0.5276, "step": 4111 }, { "epoch": 1.841856273935571, "grad_norm": 0.5267379821413425, "learning_rate": 6.397950708502203e-06, "loss": 0.4895, "step": 4112 }, { "epoch": 1.8423068258616806, "grad_norm": 0.5092132372869329, "learning_rate": 6.39354403153001e-06, "loss": 0.5396, "step": 4113 }, { "epoch": 1.8427573777877901, "grad_norm": 0.5177522597652795, "learning_rate": 6.389138159448831e-06, "loss": 0.5101, "step": 4114 }, { "epoch": 1.8432079297138997, "grad_norm": 0.5192701246237792, "learning_rate": 6.384733093241979e-06, "loss": 0.5198, "step": 4115 }, { "epoch": 1.843658481640009, "grad_norm": 0.5260081458858047, "learning_rate": 6.380328833892571e-06, "loss": 0.5253, "step": 4116 }, { "epoch": 1.8441090335661185, "grad_norm": 0.5483084083490406, "learning_rate": 6.375925382383561e-06, "loss": 0.5135, "step": 4117 }, { "epoch": 1.8445595854922279, "grad_norm": 0.5180649727440788, "learning_rate": 6.371522739697707e-06, "loss": 0.5249, "step": 4118 }, { "epoch": 1.8450101374183374, "grad_norm": 0.5421778901529396, "learning_rate": 6.367120906817597e-06, "loss": 0.555, "step": 4119 }, { "epoch": 1.845460689344447, "grad_norm": 0.515713091685765, "learning_rate": 6.362719884725633e-06, "loss": 0.5013, "step": 4120 }, { "epoch": 1.8459112412705565, "grad_norm": 0.5297505726172896, "learning_rate": 6.358319674404041e-06, "loss": 0.5404, "step": 4121 }, { "epoch": 1.846361793196666, "grad_norm": 0.5396094043088905, "learning_rate": 6.353920276834855e-06, "loss": 0.5521, "step": 4122 }, { "epoch": 1.8468123451227754, "grad_norm": 0.5312623810899202, "learning_rate": 6.349521692999945e-06, "loss": 0.5493, "step": 4123 }, { "epoch": 1.847262897048885, "grad_norm": 0.5323900232577472, "learning_rate": 6.345123923880981e-06, "loss": 0.5438, "step": 4124 }, { "epoch": 1.8477134489749942, "grad_norm": 0.5317837998804126, "learning_rate": 6.3407269704594674e-06, "loss": 0.5524, "step": 4125 }, { "epoch": 1.8481640009011038, "grad_norm": 0.5356931341874038, "learning_rate": 6.336330833716713e-06, "loss": 0.5484, "step": 4126 }, { "epoch": 1.8486145528272133, "grad_norm": 0.5251638527069249, "learning_rate": 6.33193551463385e-06, "loss": 0.5037, "step": 4127 }, { "epoch": 1.8490651047533229, "grad_norm": 0.5404265062526049, "learning_rate": 6.327541014191836e-06, "loss": 0.5232, "step": 4128 }, { "epoch": 1.8495156566794324, "grad_norm": 0.572789278753079, "learning_rate": 6.323147333371431e-06, "loss": 0.5163, "step": 4129 }, { "epoch": 1.849966208605542, "grad_norm": 0.5494126801260385, "learning_rate": 6.318754473153221e-06, "loss": 0.5125, "step": 4130 }, { "epoch": 1.8504167605316513, "grad_norm": 0.5093124698428391, "learning_rate": 6.3143624345176065e-06, "loss": 0.5371, "step": 4131 }, { "epoch": 1.8508673124577606, "grad_norm": 0.5229653582259071, "learning_rate": 6.30997121844481e-06, "loss": 0.5146, "step": 4132 }, { "epoch": 1.8513178643838701, "grad_norm": 0.5355920878633479, "learning_rate": 6.305580825914859e-06, "loss": 0.485, "step": 4133 }, { "epoch": 1.8517684163099797, "grad_norm": 0.5292856435793469, "learning_rate": 6.301191257907609e-06, "loss": 0.5158, "step": 4134 }, { "epoch": 1.8522189682360892, "grad_norm": 0.5235742042297765, "learning_rate": 6.296802515402722e-06, "loss": 0.5215, "step": 4135 }, { "epoch": 1.8526695201621988, "grad_norm": 1.3706468642215255, "learning_rate": 6.292414599379686e-06, "loss": 0.5524, "step": 4136 }, { "epoch": 1.8531200720883083, "grad_norm": 0.5908152620098531, "learning_rate": 6.2880275108177915e-06, "loss": 0.5246, "step": 4137 }, { "epoch": 1.8535706240144176, "grad_norm": 0.5489531286190314, "learning_rate": 6.28364125069616e-06, "loss": 0.5543, "step": 4138 }, { "epoch": 1.8540211759405272, "grad_norm": 0.5353353183553312, "learning_rate": 6.279255819993711e-06, "loss": 0.5332, "step": 4139 }, { "epoch": 1.8544717278666365, "grad_norm": 0.5679175674419515, "learning_rate": 6.274871219689196e-06, "loss": 0.502, "step": 4140 }, { "epoch": 1.854922279792746, "grad_norm": 0.5498516559712952, "learning_rate": 6.270487450761167e-06, "loss": 0.4905, "step": 4141 }, { "epoch": 1.8553728317188556, "grad_norm": 0.5919785070876754, "learning_rate": 6.266104514187997e-06, "loss": 0.5764, "step": 4142 }, { "epoch": 1.8558233836449651, "grad_norm": 0.5323458279770901, "learning_rate": 6.261722410947873e-06, "loss": 0.5072, "step": 4143 }, { "epoch": 1.8562739355710747, "grad_norm": 0.5514215800256219, "learning_rate": 6.257341142018798e-06, "loss": 0.5038, "step": 4144 }, { "epoch": 1.856724487497184, "grad_norm": 0.5601370943722098, "learning_rate": 6.25296070837858e-06, "loss": 0.5258, "step": 4145 }, { "epoch": 1.8571750394232935, "grad_norm": 0.513279655863446, "learning_rate": 6.248581111004855e-06, "loss": 0.5257, "step": 4146 }, { "epoch": 1.8576255913494029, "grad_norm": 0.5540398046946297, "learning_rate": 6.244202350875055e-06, "loss": 0.5127, "step": 4147 }, { "epoch": 1.8580761432755124, "grad_norm": 0.5440689229259474, "learning_rate": 6.2398244289664435e-06, "loss": 0.5429, "step": 4148 }, { "epoch": 1.858526695201622, "grad_norm": 0.5475591734630524, "learning_rate": 6.23544734625608e-06, "loss": 0.5461, "step": 4149 }, { "epoch": 1.8589772471277315, "grad_norm": 0.5153494053682282, "learning_rate": 6.2310711037208515e-06, "loss": 0.5367, "step": 4150 }, { "epoch": 1.859427799053841, "grad_norm": 0.5410016390188028, "learning_rate": 6.2266957023374434e-06, "loss": 0.5276, "step": 4151 }, { "epoch": 1.8598783509799506, "grad_norm": 0.5460190558623482, "learning_rate": 6.222321143082366e-06, "loss": 0.5266, "step": 4152 }, { "epoch": 1.86032890290606, "grad_norm": 0.5095408985942743, "learning_rate": 6.217947426931932e-06, "loss": 0.5705, "step": 4153 }, { "epoch": 1.8607794548321694, "grad_norm": 0.5217929012441115, "learning_rate": 6.213574554862275e-06, "loss": 0.5146, "step": 4154 }, { "epoch": 1.8612300067582788, "grad_norm": 0.5603934283603925, "learning_rate": 6.20920252784933e-06, "loss": 0.5572, "step": 4155 }, { "epoch": 1.8616805586843883, "grad_norm": 0.5305169716800924, "learning_rate": 6.204831346868854e-06, "loss": 0.538, "step": 4156 }, { "epoch": 1.8621311106104979, "grad_norm": 0.5369477215831551, "learning_rate": 6.200461012896401e-06, "loss": 0.5607, "step": 4157 }, { "epoch": 1.8625816625366074, "grad_norm": 0.5355074073205589, "learning_rate": 6.196091526907355e-06, "loss": 0.5294, "step": 4158 }, { "epoch": 1.863032214462717, "grad_norm": 0.5202892109721596, "learning_rate": 6.191722889876892e-06, "loss": 0.5124, "step": 4159 }, { "epoch": 1.8634827663888263, "grad_norm": 0.5611435985827281, "learning_rate": 6.187355102780015e-06, "loss": 0.5178, "step": 4160 }, { "epoch": 1.8639333183149358, "grad_norm": 0.5218487169358437, "learning_rate": 6.182988166591522e-06, "loss": 0.5281, "step": 4161 }, { "epoch": 1.8643838702410451, "grad_norm": 0.5392567372057404, "learning_rate": 6.178622082286034e-06, "loss": 0.5472, "step": 4162 }, { "epoch": 1.8648344221671547, "grad_norm": 0.5102750951300106, "learning_rate": 6.174256850837972e-06, "loss": 0.5265, "step": 4163 }, { "epoch": 1.8652849740932642, "grad_norm": 0.8385601630812395, "learning_rate": 6.169892473221577e-06, "loss": 0.5205, "step": 4164 }, { "epoch": 1.8657355260193738, "grad_norm": 0.5165995926830618, "learning_rate": 6.165528950410884e-06, "loss": 0.493, "step": 4165 }, { "epoch": 1.8661860779454833, "grad_norm": 0.5324442878068987, "learning_rate": 6.161166283379757e-06, "loss": 0.5258, "step": 4166 }, { "epoch": 1.8666366298715928, "grad_norm": 0.4890669663863523, "learning_rate": 6.156804473101852e-06, "loss": 0.5198, "step": 4167 }, { "epoch": 1.8670871817977022, "grad_norm": 0.5403605340419468, "learning_rate": 6.152443520550642e-06, "loss": 0.5919, "step": 4168 }, { "epoch": 1.8675377337238117, "grad_norm": 0.5133952873271778, "learning_rate": 6.148083426699407e-06, "loss": 0.543, "step": 4169 }, { "epoch": 1.867988285649921, "grad_norm": 0.5283331257610121, "learning_rate": 6.143724192521238e-06, "loss": 0.525, "step": 4170 }, { "epoch": 1.8684388375760306, "grad_norm": 0.5111015486371766, "learning_rate": 6.139365818989025e-06, "loss": 0.5445, "step": 4171 }, { "epoch": 1.8688893895021401, "grad_norm": 0.5530408333329045, "learning_rate": 6.13500830707548e-06, "loss": 0.5659, "step": 4172 }, { "epoch": 1.8693399414282497, "grad_norm": 0.542862013997174, "learning_rate": 6.130651657753109e-06, "loss": 0.5237, "step": 4173 }, { "epoch": 1.8697904933543592, "grad_norm": 0.5328584362720947, "learning_rate": 6.126295871994236e-06, "loss": 0.5046, "step": 4174 }, { "epoch": 1.8702410452804685, "grad_norm": 0.5100690576474948, "learning_rate": 6.121940950770986e-06, "loss": 0.5202, "step": 4175 }, { "epoch": 1.870691597206578, "grad_norm": 0.5269044387073407, "learning_rate": 6.117586895055292e-06, "loss": 0.5274, "step": 4176 }, { "epoch": 1.8711421491326874, "grad_norm": 0.5413366159686581, "learning_rate": 6.113233705818897e-06, "loss": 0.542, "step": 4177 }, { "epoch": 1.871592701058797, "grad_norm": 0.546870183269487, "learning_rate": 6.108881384033348e-06, "loss": 0.5349, "step": 4178 }, { "epoch": 1.8720432529849065, "grad_norm": 0.5318434937437492, "learning_rate": 6.10452993067e-06, "loss": 0.5083, "step": 4179 }, { "epoch": 1.872493804911016, "grad_norm": 0.5395301104784628, "learning_rate": 6.100179346700007e-06, "loss": 0.5102, "step": 4180 }, { "epoch": 1.8729443568371256, "grad_norm": 0.5275310320916924, "learning_rate": 6.095829633094344e-06, "loss": 0.5166, "step": 4181 }, { "epoch": 1.8733949087632351, "grad_norm": 0.5195377242375223, "learning_rate": 6.091480790823772e-06, "loss": 0.5227, "step": 4182 }, { "epoch": 1.8738454606893444, "grad_norm": 0.5280196877114898, "learning_rate": 6.0871328208588785e-06, "loss": 0.5379, "step": 4183 }, { "epoch": 1.874296012615454, "grad_norm": 0.5152933543813041, "learning_rate": 6.0827857241700375e-06, "loss": 0.5269, "step": 4184 }, { "epoch": 1.8747465645415633, "grad_norm": 0.5052760317120434, "learning_rate": 6.078439501727446e-06, "loss": 0.5017, "step": 4185 }, { "epoch": 1.8751971164676728, "grad_norm": 0.5214444712421065, "learning_rate": 6.074094154501087e-06, "loss": 0.5303, "step": 4186 }, { "epoch": 1.8756476683937824, "grad_norm": 0.5575981263887396, "learning_rate": 6.069749683460765e-06, "loss": 0.5241, "step": 4187 }, { "epoch": 1.876098220319892, "grad_norm": 0.5435508722751865, "learning_rate": 6.0654060895760755e-06, "loss": 0.5643, "step": 4188 }, { "epoch": 1.8765487722460015, "grad_norm": 0.5078202056799589, "learning_rate": 6.061063373816432e-06, "loss": 0.5274, "step": 4189 }, { "epoch": 1.8769993241721108, "grad_norm": 0.5321408822294319, "learning_rate": 6.056721537151037e-06, "loss": 0.5131, "step": 4190 }, { "epoch": 1.8774498760982203, "grad_norm": 0.5377683800335261, "learning_rate": 6.052380580548908e-06, "loss": 0.5432, "step": 4191 }, { "epoch": 1.8779004280243297, "grad_norm": 0.5269545701496154, "learning_rate": 6.048040504978861e-06, "loss": 0.5321, "step": 4192 }, { "epoch": 1.8783509799504392, "grad_norm": 0.5334051394788968, "learning_rate": 6.04370131140952e-06, "loss": 0.5199, "step": 4193 }, { "epoch": 1.8788015318765487, "grad_norm": 0.5013555768229302, "learning_rate": 6.039363000809302e-06, "loss": 0.5087, "step": 4194 }, { "epoch": 1.8792520838026583, "grad_norm": 0.5257505212011543, "learning_rate": 6.035025574146441e-06, "loss": 0.5439, "step": 4195 }, { "epoch": 1.8797026357287678, "grad_norm": 0.5058966270948753, "learning_rate": 6.030689032388959e-06, "loss": 0.5151, "step": 4196 }, { "epoch": 1.8801531876548774, "grad_norm": 0.541180463675636, "learning_rate": 6.026353376504698e-06, "loss": 0.5405, "step": 4197 }, { "epoch": 1.8806037395809867, "grad_norm": 0.5459902181951023, "learning_rate": 6.022018607461282e-06, "loss": 0.5218, "step": 4198 }, { "epoch": 1.8810542915070962, "grad_norm": 0.5239535698618825, "learning_rate": 6.017684726226156e-06, "loss": 0.5741, "step": 4199 }, { "epoch": 1.8815048434332056, "grad_norm": 0.5411003576223062, "learning_rate": 6.0133517337665504e-06, "loss": 0.5617, "step": 4200 }, { "epoch": 1.881955395359315, "grad_norm": 0.5334985223635598, "learning_rate": 6.009019631049512e-06, "loss": 0.5055, "step": 4201 }, { "epoch": 1.8824059472854247, "grad_norm": 0.5300270946221666, "learning_rate": 6.004688419041877e-06, "loss": 0.5106, "step": 4202 }, { "epoch": 1.8828564992115342, "grad_norm": 0.5402477520311906, "learning_rate": 6.000358098710292e-06, "loss": 0.5391, "step": 4203 }, { "epoch": 1.8833070511376437, "grad_norm": 0.5339865135423324, "learning_rate": 5.996028671021197e-06, "loss": 0.5523, "step": 4204 }, { "epoch": 1.883757603063753, "grad_norm": 0.5227897907541693, "learning_rate": 5.991700136940839e-06, "loss": 0.5103, "step": 4205 }, { "epoch": 1.8842081549898626, "grad_norm": 0.5294393748213391, "learning_rate": 5.987372497435259e-06, "loss": 0.5387, "step": 4206 }, { "epoch": 1.884658706915972, "grad_norm": 0.5465057893672541, "learning_rate": 5.983045753470308e-06, "loss": 0.5284, "step": 4207 }, { "epoch": 1.8851092588420815, "grad_norm": 0.5109085166086524, "learning_rate": 5.978719906011624e-06, "loss": 0.5295, "step": 4208 }, { "epoch": 1.885559810768191, "grad_norm": 0.5136621780582101, "learning_rate": 5.97439495602466e-06, "loss": 0.5291, "step": 4209 }, { "epoch": 1.8860103626943006, "grad_norm": 0.5241203922973976, "learning_rate": 5.970070904474654e-06, "loss": 0.5061, "step": 4210 }, { "epoch": 1.88646091462041, "grad_norm": 0.5436652813491321, "learning_rate": 5.965747752326658e-06, "loss": 0.5538, "step": 4211 }, { "epoch": 1.8869114665465196, "grad_norm": 0.5090479618264493, "learning_rate": 5.961425500545508e-06, "loss": 0.5525, "step": 4212 }, { "epoch": 1.887362018472629, "grad_norm": 0.518639343724184, "learning_rate": 5.957104150095853e-06, "loss": 0.5363, "step": 4213 }, { "epoch": 1.8878125703987385, "grad_norm": 0.5207231819708823, "learning_rate": 5.952783701942129e-06, "loss": 0.5309, "step": 4214 }, { "epoch": 1.8882631223248478, "grad_norm": 0.5306938903689269, "learning_rate": 5.948464157048581e-06, "loss": 0.5119, "step": 4215 }, { "epoch": 1.8887136742509574, "grad_norm": 0.5694868660024894, "learning_rate": 5.944145516379244e-06, "loss": 0.5597, "step": 4216 }, { "epoch": 1.889164226177067, "grad_norm": 0.5006273447584632, "learning_rate": 5.93982778089796e-06, "loss": 0.4822, "step": 4217 }, { "epoch": 1.8896147781031765, "grad_norm": 0.5114866145958403, "learning_rate": 5.9355109515683555e-06, "loss": 0.5234, "step": 4218 }, { "epoch": 1.890065330029286, "grad_norm": 0.5300137001413524, "learning_rate": 5.9311950293538714e-06, "loss": 0.5391, "step": 4219 }, { "epoch": 1.8905158819553953, "grad_norm": 0.5174828334166133, "learning_rate": 5.92688001521773e-06, "loss": 0.5397, "step": 4220 }, { "epoch": 1.8909664338815049, "grad_norm": 0.5186489499812413, "learning_rate": 5.922565910122967e-06, "loss": 0.5314, "step": 4221 }, { "epoch": 1.8914169858076142, "grad_norm": 0.5309048272708579, "learning_rate": 5.9182527150324e-06, "loss": 0.5611, "step": 4222 }, { "epoch": 1.8918675377337237, "grad_norm": 0.510486206104985, "learning_rate": 5.913940430908657e-06, "loss": 0.5522, "step": 4223 }, { "epoch": 1.8923180896598333, "grad_norm": 0.5060413496743049, "learning_rate": 5.909629058714148e-06, "loss": 0.5289, "step": 4224 }, { "epoch": 1.8927686415859428, "grad_norm": 0.526259739414804, "learning_rate": 5.9053185994110975e-06, "loss": 0.5191, "step": 4225 }, { "epoch": 1.8932191935120524, "grad_norm": 0.5129268500456862, "learning_rate": 5.9010090539615065e-06, "loss": 0.5399, "step": 4226 }, { "epoch": 1.893669745438162, "grad_norm": 0.5170234704691115, "learning_rate": 5.896700423327189e-06, "loss": 0.505, "step": 4227 }, { "epoch": 1.8941202973642712, "grad_norm": 0.5347218837531019, "learning_rate": 5.8923927084697475e-06, "loss": 0.5141, "step": 4228 }, { "epoch": 1.8945708492903806, "grad_norm": 0.5136613335421157, "learning_rate": 5.888085910350574e-06, "loss": 0.4723, "step": 4229 }, { "epoch": 1.89502140121649, "grad_norm": 0.5297099625511383, "learning_rate": 5.88378002993087e-06, "loss": 0.5425, "step": 4230 }, { "epoch": 1.8954719531425996, "grad_norm": 0.5162091637224145, "learning_rate": 5.879475068171618e-06, "loss": 0.5249, "step": 4231 }, { "epoch": 1.8959225050687092, "grad_norm": 0.5130369256868802, "learning_rate": 5.875171026033609e-06, "loss": 0.5513, "step": 4232 }, { "epoch": 1.8963730569948187, "grad_norm": 0.5290020159430157, "learning_rate": 5.870867904477413e-06, "loss": 0.5029, "step": 4233 }, { "epoch": 1.8968236089209283, "grad_norm": 0.52419224893173, "learning_rate": 5.866565704463414e-06, "loss": 0.5428, "step": 4234 }, { "epoch": 1.8972741608470376, "grad_norm": 0.4996111207329128, "learning_rate": 5.862264426951768e-06, "loss": 0.5254, "step": 4235 }, { "epoch": 1.8977247127731471, "grad_norm": 0.5426713693926366, "learning_rate": 5.8579640729024465e-06, "loss": 0.5182, "step": 4236 }, { "epoch": 1.8981752646992565, "grad_norm": 0.53484467619406, "learning_rate": 5.8536646432751994e-06, "loss": 0.5399, "step": 4237 }, { "epoch": 1.898625816625366, "grad_norm": 0.5330816436013405, "learning_rate": 5.849366139029578e-06, "loss": 0.5521, "step": 4238 }, { "epoch": 1.8990763685514755, "grad_norm": 0.5436482606871762, "learning_rate": 5.845068561124925e-06, "loss": 0.5313, "step": 4239 }, { "epoch": 1.899526920477585, "grad_norm": 0.5147859248861226, "learning_rate": 5.840771910520376e-06, "loss": 0.5225, "step": 4240 }, { "epoch": 1.8999774724036946, "grad_norm": 0.5242894056609675, "learning_rate": 5.836476188174855e-06, "loss": 0.563, "step": 4241 }, { "epoch": 1.900428024329804, "grad_norm": 0.5316006621039928, "learning_rate": 5.832181395047099e-06, "loss": 0.5215, "step": 4242 }, { "epoch": 1.9008785762559135, "grad_norm": 0.5223731977870786, "learning_rate": 5.827887532095606e-06, "loss": 0.5431, "step": 4243 }, { "epoch": 1.9013291281820228, "grad_norm": 0.5193793909490496, "learning_rate": 5.82359460027869e-06, "loss": 0.5686, "step": 4244 }, { "epoch": 1.9017796801081324, "grad_norm": 0.5158999022400731, "learning_rate": 5.8193026005544504e-06, "loss": 0.5465, "step": 4245 }, { "epoch": 1.902230232034242, "grad_norm": 0.5458029814256885, "learning_rate": 5.8150115338807775e-06, "loss": 0.5341, "step": 4246 }, { "epoch": 1.9026807839603515, "grad_norm": 0.5068734644790758, "learning_rate": 5.810721401215353e-06, "loss": 0.5244, "step": 4247 }, { "epoch": 1.903131335886461, "grad_norm": 0.5154702488071911, "learning_rate": 5.806432203515655e-06, "loss": 0.5447, "step": 4248 }, { "epoch": 1.9035818878125705, "grad_norm": 0.5156644270415655, "learning_rate": 5.802143941738945e-06, "loss": 0.5406, "step": 4249 }, { "epoch": 1.9040324397386799, "grad_norm": 0.5108778192348965, "learning_rate": 5.797856616842281e-06, "loss": 0.5305, "step": 4250 }, { "epoch": 1.9044829916647894, "grad_norm": 0.5088592723798506, "learning_rate": 5.793570229782512e-06, "loss": 0.5594, "step": 4251 }, { "epoch": 1.9049335435908987, "grad_norm": 0.5066888797128246, "learning_rate": 5.7892847815162754e-06, "loss": 0.5377, "step": 4252 }, { "epoch": 1.9053840955170083, "grad_norm": 0.5025488667824004, "learning_rate": 5.785000273000001e-06, "loss": 0.5119, "step": 4253 }, { "epoch": 1.9058346474431178, "grad_norm": 0.5454041959320938, "learning_rate": 5.780716705189906e-06, "loss": 0.5068, "step": 4254 }, { "epoch": 1.9062851993692274, "grad_norm": 0.5003448913960751, "learning_rate": 5.776434079042003e-06, "loss": 0.5166, "step": 4255 }, { "epoch": 1.906735751295337, "grad_norm": 0.4986924781745644, "learning_rate": 5.772152395512087e-06, "loss": 0.5204, "step": 4256 }, { "epoch": 1.9071863032214462, "grad_norm": 0.5245402824665636, "learning_rate": 5.7678716555557515e-06, "loss": 0.5455, "step": 4257 }, { "epoch": 1.9076368551475558, "grad_norm": 0.5208161887304097, "learning_rate": 5.7635918601283745e-06, "loss": 0.5386, "step": 4258 }, { "epoch": 1.908087407073665, "grad_norm": 0.5197544207692141, "learning_rate": 5.759313010185113e-06, "loss": 0.5689, "step": 4259 }, { "epoch": 1.9085379589997746, "grad_norm": 0.5254469041798833, "learning_rate": 5.75503510668094e-06, "loss": 0.5342, "step": 4260 }, { "epoch": 1.9089885109258842, "grad_norm": 0.49551825353009815, "learning_rate": 5.750758150570583e-06, "loss": 0.5181, "step": 4261 }, { "epoch": 1.9094390628519937, "grad_norm": 0.5302266294064966, "learning_rate": 5.746482142808592e-06, "loss": 0.5737, "step": 4262 }, { "epoch": 1.9098896147781033, "grad_norm": 0.5284945817359985, "learning_rate": 5.742207084349274e-06, "loss": 0.5441, "step": 4263 }, { "epoch": 1.9103401667042128, "grad_norm": 0.5091563901502785, "learning_rate": 5.737932976146754e-06, "loss": 0.5389, "step": 4264 }, { "epoch": 1.9107907186303221, "grad_norm": 0.5265145638846146, "learning_rate": 5.733659819154915e-06, "loss": 0.5618, "step": 4265 }, { "epoch": 1.9112412705564317, "grad_norm": 0.5372715351883841, "learning_rate": 5.729387614327458e-06, "loss": 0.5404, "step": 4266 }, { "epoch": 1.911691822482541, "grad_norm": 0.511947554971738, "learning_rate": 5.72511636261784e-06, "loss": 0.5233, "step": 4267 }, { "epoch": 1.9121423744086505, "grad_norm": 0.49340111099766615, "learning_rate": 5.720846064979338e-06, "loss": 0.5093, "step": 4268 }, { "epoch": 1.91259292633476, "grad_norm": 0.4847951245362489, "learning_rate": 5.716576722364988e-06, "loss": 0.5164, "step": 4269 }, { "epoch": 1.9130434782608696, "grad_norm": 0.5099302235685644, "learning_rate": 5.712308335727629e-06, "loss": 0.513, "step": 4270 }, { "epoch": 1.9134940301869792, "grad_norm": 0.5017454141269901, "learning_rate": 5.708040906019881e-06, "loss": 0.5373, "step": 4271 }, { "epoch": 1.9139445821130885, "grad_norm": 0.5163106718540507, "learning_rate": 5.7037744341941515e-06, "loss": 0.5406, "step": 4272 }, { "epoch": 1.914395134039198, "grad_norm": 0.5241821798341061, "learning_rate": 5.699508921202635e-06, "loss": 0.4969, "step": 4273 }, { "epoch": 1.9148456859653074, "grad_norm": 0.5275888337829175, "learning_rate": 5.69524436799731e-06, "loss": 0.5548, "step": 4274 }, { "epoch": 1.915296237891417, "grad_norm": 0.5078690033815492, "learning_rate": 5.690980775529943e-06, "loss": 0.5274, "step": 4275 }, { "epoch": 1.9157467898175264, "grad_norm": 0.5083148874857518, "learning_rate": 5.686718144752081e-06, "loss": 0.5378, "step": 4276 }, { "epoch": 1.916197341743636, "grad_norm": 0.550168306283703, "learning_rate": 5.6824564766150724e-06, "loss": 0.537, "step": 4277 }, { "epoch": 1.9166478936697455, "grad_norm": 0.5085554537888501, "learning_rate": 5.678195772070027e-06, "loss": 0.5173, "step": 4278 }, { "epoch": 1.917098445595855, "grad_norm": 0.5211521196874359, "learning_rate": 5.673936032067856e-06, "loss": 0.5442, "step": 4279 }, { "epoch": 1.9175489975219644, "grad_norm": 0.510684998058519, "learning_rate": 5.669677257559249e-06, "loss": 0.5484, "step": 4280 }, { "epoch": 1.917999549448074, "grad_norm": 0.5317332280484335, "learning_rate": 5.665419449494685e-06, "loss": 0.5148, "step": 4281 }, { "epoch": 1.9184501013741833, "grad_norm": 0.5102446679400593, "learning_rate": 5.66116260882442e-06, "loss": 0.5382, "step": 4282 }, { "epoch": 1.9189006533002928, "grad_norm": 0.5130015667665415, "learning_rate": 5.656906736498502e-06, "loss": 0.5431, "step": 4283 }, { "epoch": 1.9193512052264023, "grad_norm": 0.518524712087688, "learning_rate": 5.652651833466756e-06, "loss": 0.5085, "step": 4284 }, { "epoch": 1.919801757152512, "grad_norm": 0.5108154033550034, "learning_rate": 5.648397900678796e-06, "loss": 0.5079, "step": 4285 }, { "epoch": 1.9202523090786214, "grad_norm": 0.5312058423343021, "learning_rate": 5.6441449390840176e-06, "loss": 0.5583, "step": 4286 }, { "epoch": 1.9207028610047308, "grad_norm": 0.5201226831813967, "learning_rate": 5.639892949631597e-06, "loss": 0.5402, "step": 4287 }, { "epoch": 1.9211534129308403, "grad_norm": 0.5231559426224316, "learning_rate": 5.6356419332704985e-06, "loss": 0.5388, "step": 4288 }, { "epoch": 1.9216039648569496, "grad_norm": 0.5088414288264752, "learning_rate": 5.631391890949465e-06, "loss": 0.509, "step": 4289 }, { "epoch": 1.9220545167830592, "grad_norm": 0.5229627354889911, "learning_rate": 5.627142823617022e-06, "loss": 0.5041, "step": 4290 }, { "epoch": 1.9225050687091687, "grad_norm": 0.5018750402877623, "learning_rate": 5.622894732221482e-06, "loss": 0.5131, "step": 4291 }, { "epoch": 1.9229556206352783, "grad_norm": 0.5030172391083453, "learning_rate": 5.618647617710935e-06, "loss": 0.5358, "step": 4292 }, { "epoch": 1.9234061725613878, "grad_norm": 0.4963522725552246, "learning_rate": 5.614401481033255e-06, "loss": 0.5589, "step": 4293 }, { "epoch": 1.9238567244874973, "grad_norm": 0.5135867020396326, "learning_rate": 5.610156323136096e-06, "loss": 0.5431, "step": 4294 }, { "epoch": 1.9243072764136067, "grad_norm": 0.5261302971596993, "learning_rate": 5.605912144966901e-06, "loss": 0.5381, "step": 4295 }, { "epoch": 1.9247578283397162, "grad_norm": 0.5201946704449049, "learning_rate": 5.601668947472876e-06, "loss": 0.54, "step": 4296 }, { "epoch": 1.9252083802658255, "grad_norm": 0.5023817521512337, "learning_rate": 5.597426731601035e-06, "loss": 0.5089, "step": 4297 }, { "epoch": 1.925658932191935, "grad_norm": 0.5063924414099218, "learning_rate": 5.593185498298142e-06, "loss": 0.5149, "step": 4298 }, { "epoch": 1.9261094841180446, "grad_norm": 0.524764770289847, "learning_rate": 5.588945248510775e-06, "loss": 0.5376, "step": 4299 }, { "epoch": 1.9265600360441542, "grad_norm": 0.5204618523917579, "learning_rate": 5.584705983185262e-06, "loss": 0.5248, "step": 4300 }, { "epoch": 1.9270105879702637, "grad_norm": 0.5071885715822989, "learning_rate": 5.580467703267736e-06, "loss": 0.543, "step": 4301 }, { "epoch": 1.927461139896373, "grad_norm": 0.49974056110973747, "learning_rate": 5.576230409704084e-06, "loss": 0.489, "step": 4302 }, { "epoch": 1.9279116918224826, "grad_norm": 0.5186375029477643, "learning_rate": 5.571994103440007e-06, "loss": 0.5238, "step": 4303 }, { "epoch": 1.9283622437485919, "grad_norm": 0.5109644877699392, "learning_rate": 5.567758785420951e-06, "loss": 0.5443, "step": 4304 }, { "epoch": 1.9288127956747014, "grad_norm": 0.5223294790150791, "learning_rate": 5.563524456592163e-06, "loss": 0.5504, "step": 4305 }, { "epoch": 1.929263347600811, "grad_norm": 0.5236707328067287, "learning_rate": 5.559291117898662e-06, "loss": 0.5439, "step": 4306 }, { "epoch": 1.9297138995269205, "grad_norm": 0.5157286646658068, "learning_rate": 5.5550587702852465e-06, "loss": 0.5334, "step": 4307 }, { "epoch": 1.93016445145303, "grad_norm": 0.5167351634745779, "learning_rate": 5.550827414696496e-06, "loss": 0.516, "step": 4308 }, { "epoch": 1.9306150033791396, "grad_norm": 0.50951297475142, "learning_rate": 5.546597052076765e-06, "loss": 0.5341, "step": 4309 }, { "epoch": 1.931065555305249, "grad_norm": 0.5149916285673367, "learning_rate": 5.542367683370189e-06, "loss": 0.5391, "step": 4310 }, { "epoch": 1.9315161072313582, "grad_norm": 0.5038362046120101, "learning_rate": 5.538139309520683e-06, "loss": 0.5391, "step": 4311 }, { "epoch": 1.9319666591574678, "grad_norm": 0.5130358281834277, "learning_rate": 5.533911931471936e-06, "loss": 0.54, "step": 4312 }, { "epoch": 1.9324172110835773, "grad_norm": 0.5125339045415028, "learning_rate": 5.529685550167417e-06, "loss": 0.5054, "step": 4313 }, { "epoch": 1.9328677630096869, "grad_norm": 0.5491828537078403, "learning_rate": 5.525460166550374e-06, "loss": 0.54, "step": 4314 }, { "epoch": 1.9333183149357964, "grad_norm": 0.507661553524342, "learning_rate": 5.521235781563827e-06, "loss": 0.5085, "step": 4315 }, { "epoch": 1.933768866861906, "grad_norm": 0.5162408939746392, "learning_rate": 5.517012396150581e-06, "loss": 0.5443, "step": 4316 }, { "epoch": 1.9342194187880153, "grad_norm": 0.4898843832852967, "learning_rate": 5.512790011253211e-06, "loss": 0.4821, "step": 4317 }, { "epoch": 1.9346699707141248, "grad_norm": 0.5190411999628419, "learning_rate": 5.508568627814072e-06, "loss": 0.5324, "step": 4318 }, { "epoch": 1.9351205226402342, "grad_norm": 0.5042705805037649, "learning_rate": 5.504348246775299e-06, "loss": 0.5387, "step": 4319 }, { "epoch": 1.9355710745663437, "grad_norm": 0.5255418670675616, "learning_rate": 5.500128869078789e-06, "loss": 0.5843, "step": 4320 }, { "epoch": 1.9360216264924532, "grad_norm": 0.5360839828662424, "learning_rate": 5.495910495666239e-06, "loss": 0.568, "step": 4321 }, { "epoch": 1.9364721784185628, "grad_norm": 0.5200317968504824, "learning_rate": 5.491693127479092e-06, "loss": 0.5483, "step": 4322 }, { "epoch": 1.9369227303446723, "grad_norm": 0.5218032888501051, "learning_rate": 5.4874767654586e-06, "loss": 0.5505, "step": 4323 }, { "epoch": 1.9373732822707819, "grad_norm": 0.5063871136692875, "learning_rate": 5.483261410545755e-06, "loss": 0.5505, "step": 4324 }, { "epoch": 1.9378238341968912, "grad_norm": 0.5173132193809765, "learning_rate": 5.479047063681357e-06, "loss": 0.5345, "step": 4325 }, { "epoch": 1.9382743861230005, "grad_norm": 0.5175417926841713, "learning_rate": 5.474833725805963e-06, "loss": 0.5261, "step": 4326 }, { "epoch": 1.93872493804911, "grad_norm": 0.525251785617397, "learning_rate": 5.4706213978599055e-06, "loss": 0.5558, "step": 4327 }, { "epoch": 1.9391754899752196, "grad_norm": 0.5114910036376604, "learning_rate": 5.4664100807832954e-06, "loss": 0.542, "step": 4328 }, { "epoch": 1.9396260419013291, "grad_norm": 0.5132567931031556, "learning_rate": 5.462199775516018e-06, "loss": 0.5295, "step": 4329 }, { "epoch": 1.9400765938274387, "grad_norm": 0.5310699734545036, "learning_rate": 5.457990482997735e-06, "loss": 0.525, "step": 4330 }, { "epoch": 1.9405271457535482, "grad_norm": 0.5233045641532164, "learning_rate": 5.453782204167868e-06, "loss": 0.5251, "step": 4331 }, { "epoch": 1.9409776976796576, "grad_norm": 0.49784200424704034, "learning_rate": 5.449574939965637e-06, "loss": 0.5289, "step": 4332 }, { "epoch": 1.941428249605767, "grad_norm": 0.5365795964826264, "learning_rate": 5.445368691330008e-06, "loss": 0.5507, "step": 4333 }, { "epoch": 1.9418788015318764, "grad_norm": 0.5030499706211433, "learning_rate": 5.4411634591997475e-06, "loss": 0.5368, "step": 4334 }, { "epoch": 1.942329353457986, "grad_norm": 0.5323216994176151, "learning_rate": 5.436959244513369e-06, "loss": 0.4992, "step": 4335 }, { "epoch": 1.9427799053840955, "grad_norm": 0.5171486643958932, "learning_rate": 5.432756048209185e-06, "loss": 0.5436, "step": 4336 }, { "epoch": 1.943230457310205, "grad_norm": 0.5158462697609466, "learning_rate": 5.4285538712252515e-06, "loss": 0.5403, "step": 4337 }, { "epoch": 1.9436810092363146, "grad_norm": 0.5244445841143305, "learning_rate": 5.424352714499431e-06, "loss": 0.5645, "step": 4338 }, { "epoch": 1.944131561162424, "grad_norm": 0.5064663337428739, "learning_rate": 5.420152578969327e-06, "loss": 0.5593, "step": 4339 }, { "epoch": 1.9445821130885335, "grad_norm": 0.5250875913395601, "learning_rate": 5.415953465572332e-06, "loss": 0.5505, "step": 4340 }, { "epoch": 1.9450326650146428, "grad_norm": 0.5059366045414717, "learning_rate": 5.4117553752456065e-06, "loss": 0.5497, "step": 4341 }, { "epoch": 1.9454832169407523, "grad_norm": 0.5208111765984302, "learning_rate": 5.407558308926083e-06, "loss": 0.5257, "step": 4342 }, { "epoch": 1.9459337688668619, "grad_norm": 0.5170453313382575, "learning_rate": 5.403362267550466e-06, "loss": 0.5638, "step": 4343 }, { "epoch": 1.9463843207929714, "grad_norm": 0.49882681283681524, "learning_rate": 5.399167252055231e-06, "loss": 0.5033, "step": 4344 }, { "epoch": 1.946834872719081, "grad_norm": 0.5076257429821885, "learning_rate": 5.3949732633766215e-06, "loss": 0.5078, "step": 4345 }, { "epoch": 1.9472854246451905, "grad_norm": 0.5488480474407342, "learning_rate": 5.3907803024506554e-06, "loss": 0.5125, "step": 4346 }, { "epoch": 1.9477359765712998, "grad_norm": 0.504648527810718, "learning_rate": 5.386588370213124e-06, "loss": 0.4946, "step": 4347 }, { "epoch": 1.9481865284974094, "grad_norm": 0.5254721406009831, "learning_rate": 5.38239746759958e-06, "loss": 0.5576, "step": 4348 }, { "epoch": 1.9486370804235187, "grad_norm": 0.5230970267033592, "learning_rate": 5.378207595545353e-06, "loss": 0.5053, "step": 4349 }, { "epoch": 1.9490876323496282, "grad_norm": 0.5156381114647728, "learning_rate": 5.374018754985544e-06, "loss": 0.5343, "step": 4350 }, { "epoch": 1.9495381842757378, "grad_norm": 0.5173057316188453, "learning_rate": 5.369830946855017e-06, "loss": 0.495, "step": 4351 }, { "epoch": 1.9499887362018473, "grad_norm": 0.5514487468924718, "learning_rate": 5.365644172088412e-06, "loss": 0.5327, "step": 4352 }, { "epoch": 1.9504392881279569, "grad_norm": 0.507699927029942, "learning_rate": 5.3614584316201365e-06, "loss": 0.531, "step": 4353 }, { "epoch": 1.9508898400540662, "grad_norm": 0.5207455366803063, "learning_rate": 5.357273726384368e-06, "loss": 0.559, "step": 4354 }, { "epoch": 1.9513403919801757, "grad_norm": 0.5276282833793655, "learning_rate": 5.3530900573150415e-06, "loss": 0.5598, "step": 4355 }, { "epoch": 1.951790943906285, "grad_norm": 0.5076486165984241, "learning_rate": 5.348907425345886e-06, "loss": 0.5387, "step": 4356 }, { "epoch": 1.9522414958323946, "grad_norm": 0.5183087528898285, "learning_rate": 5.344725831410369e-06, "loss": 0.5189, "step": 4357 }, { "epoch": 1.9526920477585041, "grad_norm": 0.5172459208551985, "learning_rate": 5.340545276441755e-06, "loss": 0.5375, "step": 4358 }, { "epoch": 1.9531425996846137, "grad_norm": 0.5214841087124749, "learning_rate": 5.336365761373048e-06, "loss": 0.5465, "step": 4359 }, { "epoch": 1.9535931516107232, "grad_norm": 0.5332730523062336, "learning_rate": 5.332187287137051e-06, "loss": 0.5487, "step": 4360 }, { "epoch": 1.9540437035368328, "grad_norm": 0.5184032838115955, "learning_rate": 5.328009854666303e-06, "loss": 0.565, "step": 4361 }, { "epoch": 1.954494255462942, "grad_norm": 0.510957089369536, "learning_rate": 5.32383346489314e-06, "loss": 0.5162, "step": 4362 }, { "epoch": 1.9549448073890516, "grad_norm": 0.5101301587036912, "learning_rate": 5.319658118749637e-06, "loss": 0.5006, "step": 4363 }, { "epoch": 1.955395359315161, "grad_norm": 0.520208713272615, "learning_rate": 5.315483817167664e-06, "loss": 0.5073, "step": 4364 }, { "epoch": 1.9558459112412705, "grad_norm": 0.5230806475320734, "learning_rate": 5.311310561078835e-06, "loss": 0.5184, "step": 4365 }, { "epoch": 1.95629646316738, "grad_norm": 0.5397324728440207, "learning_rate": 5.307138351414542e-06, "loss": 0.5132, "step": 4366 }, { "epoch": 1.9567470150934896, "grad_norm": 0.5359599181642294, "learning_rate": 5.302967189105941e-06, "loss": 0.553, "step": 4367 }, { "epoch": 1.9571975670195991, "grad_norm": 0.5078593271075644, "learning_rate": 5.298797075083956e-06, "loss": 0.5038, "step": 4368 }, { "epoch": 1.9576481189457084, "grad_norm": 0.5205662132998015, "learning_rate": 5.294628010279274e-06, "loss": 0.5546, "step": 4369 }, { "epoch": 1.958098670871818, "grad_norm": 0.5259201710552576, "learning_rate": 5.290459995622351e-06, "loss": 0.5367, "step": 4370 }, { "epoch": 1.9585492227979273, "grad_norm": 0.5411711693769795, "learning_rate": 5.286293032043406e-06, "loss": 0.5327, "step": 4371 }, { "epoch": 1.9589997747240369, "grad_norm": 0.5394712129265457, "learning_rate": 5.282127120472424e-06, "loss": 0.5488, "step": 4372 }, { "epoch": 1.9594503266501464, "grad_norm": 0.5100306808084365, "learning_rate": 5.277962261839157e-06, "loss": 0.5129, "step": 4373 }, { "epoch": 1.959900878576256, "grad_norm": 0.5144584449853032, "learning_rate": 5.273798457073119e-06, "loss": 0.5312, "step": 4374 }, { "epoch": 1.9603514305023655, "grad_norm": 0.5365305640304865, "learning_rate": 5.269635707103593e-06, "loss": 0.5217, "step": 4375 }, { "epoch": 1.960801982428475, "grad_norm": 0.5109177914983267, "learning_rate": 5.2654740128596215e-06, "loss": 0.5201, "step": 4376 }, { "epoch": 1.9612525343545844, "grad_norm": 0.5159533335288131, "learning_rate": 5.2613133752700145e-06, "loss": 0.5616, "step": 4377 }, { "epoch": 1.961703086280694, "grad_norm": 0.5083809902489238, "learning_rate": 5.2571537952633455e-06, "loss": 0.5304, "step": 4378 }, { "epoch": 1.9621536382068032, "grad_norm": 0.5103629393051698, "learning_rate": 5.2529952737679536e-06, "loss": 0.5202, "step": 4379 }, { "epoch": 1.9626041901329128, "grad_norm": 0.5323697148112659, "learning_rate": 5.24883781171194e-06, "loss": 0.5703, "step": 4380 }, { "epoch": 1.9630547420590223, "grad_norm": 0.5188661027940602, "learning_rate": 5.244681410023167e-06, "loss": 0.5348, "step": 4381 }, { "epoch": 1.9635052939851318, "grad_norm": 0.5000183060378569, "learning_rate": 5.240526069629265e-06, "loss": 0.532, "step": 4382 }, { "epoch": 1.9639558459112414, "grad_norm": 0.5169989005716538, "learning_rate": 5.236371791457625e-06, "loss": 0.5199, "step": 4383 }, { "epoch": 1.9644063978373507, "grad_norm": 0.503016950746473, "learning_rate": 5.232218576435401e-06, "loss": 0.4987, "step": 4384 }, { "epoch": 1.9648569497634603, "grad_norm": 0.513380749548904, "learning_rate": 5.228066425489511e-06, "loss": 0.5371, "step": 4385 }, { "epoch": 1.9653075016895696, "grad_norm": 0.4917958795838744, "learning_rate": 5.223915339546633e-06, "loss": 0.5083, "step": 4386 }, { "epoch": 1.9657580536156791, "grad_norm": 0.496768125853736, "learning_rate": 5.21976531953321e-06, "loss": 0.509, "step": 4387 }, { "epoch": 1.9662086055417887, "grad_norm": 0.5300203668380171, "learning_rate": 5.215616366375445e-06, "loss": 0.5394, "step": 4388 }, { "epoch": 1.9666591574678982, "grad_norm": 0.520753237840837, "learning_rate": 5.211468480999304e-06, "loss": 0.5123, "step": 4389 }, { "epoch": 1.9671097093940078, "grad_norm": 0.5379737852873193, "learning_rate": 5.207321664330517e-06, "loss": 0.542, "step": 4390 }, { "epoch": 1.9675602613201173, "grad_norm": 0.5289252424955015, "learning_rate": 5.203175917294574e-06, "loss": 0.5451, "step": 4391 }, { "epoch": 1.9680108132462266, "grad_norm": 0.5104851313128185, "learning_rate": 5.199031240816715e-06, "loss": 0.5055, "step": 4392 }, { "epoch": 1.9684613651723362, "grad_norm": 0.5243434980244013, "learning_rate": 5.194887635821966e-06, "loss": 0.489, "step": 4393 }, { "epoch": 1.9689119170984455, "grad_norm": 0.5316286811980956, "learning_rate": 5.190745103235084e-06, "loss": 0.5607, "step": 4394 }, { "epoch": 1.969362469024555, "grad_norm": 0.5252408474416728, "learning_rate": 5.186603643980619e-06, "loss": 0.5369, "step": 4395 }, { "epoch": 1.9698130209506646, "grad_norm": 0.5289489565592974, "learning_rate": 5.1824632589828465e-06, "loss": 0.5566, "step": 4396 }, { "epoch": 1.9702635728767741, "grad_norm": 0.510530822445175, "learning_rate": 5.178323949165837e-06, "loss": 0.5185, "step": 4397 }, { "epoch": 1.9707141248028837, "grad_norm": 0.5173269365277807, "learning_rate": 5.174185715453388e-06, "loss": 0.5153, "step": 4398 }, { "epoch": 1.971164676728993, "grad_norm": 0.5275916777880326, "learning_rate": 5.17004855876909e-06, "loss": 0.514, "step": 4399 }, { "epoch": 1.9716152286551025, "grad_norm": 0.5016604612526991, "learning_rate": 5.165912480036262e-06, "loss": 0.5388, "step": 4400 }, { "epoch": 1.9720657805812118, "grad_norm": 0.5347620371688796, "learning_rate": 5.161777480178003e-06, "loss": 0.5102, "step": 4401 }, { "epoch": 1.9725163325073214, "grad_norm": 0.5140824355433802, "learning_rate": 5.1576435601171625e-06, "loss": 0.5295, "step": 4402 }, { "epoch": 1.972966884433431, "grad_norm": 0.5260933205995697, "learning_rate": 5.153510720776354e-06, "loss": 0.5081, "step": 4403 }, { "epoch": 1.9734174363595405, "grad_norm": 0.510900786522866, "learning_rate": 5.149378963077943e-06, "loss": 0.5143, "step": 4404 }, { "epoch": 1.97386798828565, "grad_norm": 0.5281349770363365, "learning_rate": 5.145248287944062e-06, "loss": 0.5088, "step": 4405 }, { "epoch": 1.9743185402117596, "grad_norm": 0.5155992972295382, "learning_rate": 5.141118696296595e-06, "loss": 0.5351, "step": 4406 }, { "epoch": 1.9747690921378689, "grad_norm": 0.5118250620490777, "learning_rate": 5.136990189057187e-06, "loss": 0.5373, "step": 4407 }, { "epoch": 1.9752196440639782, "grad_norm": 0.5305825394764244, "learning_rate": 5.132862767147242e-06, "loss": 0.5113, "step": 4408 }, { "epoch": 1.9756701959900878, "grad_norm": 0.5206697246470641, "learning_rate": 5.128736431487919e-06, "loss": 0.5153, "step": 4409 }, { "epoch": 1.9761207479161973, "grad_norm": 0.5054425418819939, "learning_rate": 5.124611183000138e-06, "loss": 0.5121, "step": 4410 }, { "epoch": 1.9765712998423068, "grad_norm": 0.5218385014598892, "learning_rate": 5.120487022604572e-06, "loss": 0.5262, "step": 4411 }, { "epoch": 1.9770218517684164, "grad_norm": 0.5355458361548022, "learning_rate": 5.116363951221654e-06, "loss": 0.5012, "step": 4412 }, { "epoch": 1.977472403694526, "grad_norm": 0.4979016538753682, "learning_rate": 5.112241969771577e-06, "loss": 0.5253, "step": 4413 }, { "epoch": 1.9779229556206352, "grad_norm": 0.5176541647604003, "learning_rate": 5.108121079174282e-06, "loss": 0.4966, "step": 4414 }, { "epoch": 1.9783735075467448, "grad_norm": 0.5301820786710328, "learning_rate": 5.10400128034948e-06, "loss": 0.5555, "step": 4415 }, { "epoch": 1.9788240594728541, "grad_norm": 0.5246747456984924, "learning_rate": 5.099882574216617e-06, "loss": 0.5555, "step": 4416 }, { "epoch": 1.9792746113989637, "grad_norm": 0.5234296172426599, "learning_rate": 5.095764961694923e-06, "loss": 0.5304, "step": 4417 }, { "epoch": 1.9797251633250732, "grad_norm": 0.5422702043153951, "learning_rate": 5.091648443703354e-06, "loss": 0.5112, "step": 4418 }, { "epoch": 1.9801757152511827, "grad_norm": 0.5152137836559063, "learning_rate": 5.087533021160654e-06, "loss": 0.5334, "step": 4419 }, { "epoch": 1.9806262671772923, "grad_norm": 0.5139829799963581, "learning_rate": 5.08341869498529e-06, "loss": 0.5119, "step": 4420 }, { "epoch": 1.9810768191034018, "grad_norm": 0.5046754735086648, "learning_rate": 5.079305466095512e-06, "loss": 0.4951, "step": 4421 }, { "epoch": 1.9815273710295112, "grad_norm": 0.5482621842950022, "learning_rate": 5.0751933354093e-06, "loss": 0.5299, "step": 4422 }, { "epoch": 1.9819779229556205, "grad_norm": 0.5358093325914228, "learning_rate": 5.071082303844417e-06, "loss": 0.4941, "step": 4423 }, { "epoch": 1.98242847488173, "grad_norm": 0.5222037376530632, "learning_rate": 5.066972372318351e-06, "loss": 0.5748, "step": 4424 }, { "epoch": 1.9828790268078396, "grad_norm": 0.567429212512839, "learning_rate": 5.062863541748368e-06, "loss": 0.5387, "step": 4425 }, { "epoch": 1.983329578733949, "grad_norm": 0.5343334126044836, "learning_rate": 5.058755813051482e-06, "loss": 0.5323, "step": 4426 }, { "epoch": 1.9837801306600586, "grad_norm": 0.5152810933588664, "learning_rate": 5.054649187144446e-06, "loss": 0.5323, "step": 4427 }, { "epoch": 1.9842306825861682, "grad_norm": 0.5287006167190318, "learning_rate": 5.050543664943795e-06, "loss": 0.5292, "step": 4428 }, { "epoch": 1.9846812345122775, "grad_norm": 0.5188418824809086, "learning_rate": 5.046439247365784e-06, "loss": 0.5177, "step": 4429 }, { "epoch": 1.985131786438387, "grad_norm": 0.5252780819154974, "learning_rate": 5.042335935326457e-06, "loss": 0.5112, "step": 4430 }, { "epoch": 1.9855823383644964, "grad_norm": 0.5149840442859547, "learning_rate": 5.0382337297415775e-06, "loss": 0.5165, "step": 4431 }, { "epoch": 1.986032890290606, "grad_norm": 0.5075665473019985, "learning_rate": 5.034132631526696e-06, "loss": 0.5248, "step": 4432 }, { "epoch": 1.9864834422167155, "grad_norm": 0.517002475875171, "learning_rate": 5.030032641597078e-06, "loss": 0.5127, "step": 4433 }, { "epoch": 1.986933994142825, "grad_norm": 0.5095387281017447, "learning_rate": 5.025933760867782e-06, "loss": 0.5432, "step": 4434 }, { "epoch": 1.9873845460689346, "grad_norm": 0.49966970825058626, "learning_rate": 5.021835990253583e-06, "loss": 0.5409, "step": 4435 }, { "epoch": 1.9878350979950439, "grad_norm": 0.50758328774611, "learning_rate": 5.017739330669031e-06, "loss": 0.5343, "step": 4436 }, { "epoch": 1.9882856499211534, "grad_norm": 0.5019166309306963, "learning_rate": 5.013643783028419e-06, "loss": 0.5129, "step": 4437 }, { "epoch": 1.9887362018472627, "grad_norm": 0.5045970858022876, "learning_rate": 5.009549348245795e-06, "loss": 0.5654, "step": 4438 }, { "epoch": 1.9891867537733723, "grad_norm": 0.5130573637710143, "learning_rate": 5.005456027234957e-06, "loss": 0.5282, "step": 4439 }, { "epoch": 1.9896373056994818, "grad_norm": 0.5280298821372701, "learning_rate": 5.001363820909455e-06, "loss": 0.521, "step": 4440 }, { "epoch": 1.9896373056994818, "eval_loss": 0.6168236136436462, "eval_runtime": 24.3982, "eval_samples_per_second": 11.435, "eval_steps_per_second": 0.492, "step": 4440 }, { "epoch": 1.9900878576255914, "grad_norm": 0.5163483691250625, "learning_rate": 4.9972727301825885e-06, "loss": 0.5399, "step": 4441 }, { "epoch": 1.990538409551701, "grad_norm": 0.5124989034335673, "learning_rate": 4.993182755967412e-06, "loss": 0.522, "step": 4442 }, { "epoch": 1.9909889614778105, "grad_norm": 0.5144572836718612, "learning_rate": 4.989093899176727e-06, "loss": 0.5034, "step": 4443 }, { "epoch": 1.9914395134039198, "grad_norm": 0.4864740466569667, "learning_rate": 4.9850061607230875e-06, "loss": 0.5108, "step": 4444 }, { "epoch": 1.9918900653300293, "grad_norm": 0.5062585523230853, "learning_rate": 4.980919541518796e-06, "loss": 0.538, "step": 4445 }, { "epoch": 1.9923406172561386, "grad_norm": 0.5391957737915767, "learning_rate": 4.976834042475909e-06, "loss": 0.5443, "step": 4446 }, { "epoch": 1.9927911691822482, "grad_norm": 0.5127761830374769, "learning_rate": 4.972749664506229e-06, "loss": 0.5292, "step": 4447 }, { "epoch": 1.9932417211083577, "grad_norm": 0.5415789636346926, "learning_rate": 4.968666408521311e-06, "loss": 0.5382, "step": 4448 }, { "epoch": 1.9936922730344673, "grad_norm": 0.5145326832163081, "learning_rate": 4.964584275432457e-06, "loss": 0.5245, "step": 4449 }, { "epoch": 1.9941428249605768, "grad_norm": 0.5189294758934386, "learning_rate": 4.960503266150726e-06, "loss": 0.5271, "step": 4450 }, { "epoch": 1.9945933768866861, "grad_norm": 0.5284895728443808, "learning_rate": 4.956423381586907e-06, "loss": 0.4924, "step": 4451 }, { "epoch": 1.9950439288127957, "grad_norm": 0.5238155114964212, "learning_rate": 4.952344622651566e-06, "loss": 0.5373, "step": 4452 }, { "epoch": 1.995494480738905, "grad_norm": 0.532473029852445, "learning_rate": 4.9482669902549896e-06, "loss": 0.5215, "step": 4453 }, { "epoch": 1.9959450326650146, "grad_norm": 0.5217459389762935, "learning_rate": 4.94419048530724e-06, "loss": 0.5531, "step": 4454 }, { "epoch": 1.996395584591124, "grad_norm": 0.5449066243370555, "learning_rate": 4.940115108718099e-06, "loss": 0.5512, "step": 4455 }, { "epoch": 1.9968461365172336, "grad_norm": 0.5088262445301693, "learning_rate": 4.936040861397125e-06, "loss": 0.5313, "step": 4456 }, { "epoch": 1.9972966884433432, "grad_norm": 0.5052678462113269, "learning_rate": 4.931967744253601e-06, "loss": 0.532, "step": 4457 }, { "epoch": 1.9977472403694527, "grad_norm": 0.5207845590611203, "learning_rate": 4.927895758196577e-06, "loss": 0.5618, "step": 4458 }, { "epoch": 1.998197792295562, "grad_norm": 0.5068569459564752, "learning_rate": 4.92382490413483e-06, "loss": 0.5339, "step": 4459 }, { "epoch": 1.9986483442216716, "grad_norm": 0.5060017791103802, "learning_rate": 4.9197551829769095e-06, "loss": 0.5448, "step": 4460 }, { "epoch": 1.999098896147781, "grad_norm": 0.5064068843986106, "learning_rate": 4.915686595631086e-06, "loss": 0.5232, "step": 4461 }, { "epoch": 1.9995494480738905, "grad_norm": 0.518120096344485, "learning_rate": 4.911619143005395e-06, "loss": 0.5444, "step": 4462 }, { "epoch": 2.0, "grad_norm": 0.5035175563202205, "learning_rate": 4.907552826007612e-06, "loss": 0.4962, "step": 4463 }, { "epoch": 2.0004505519261095, "grad_norm": 0.5124301843053308, "learning_rate": 4.903487645545261e-06, "loss": 0.5311, "step": 4464 }, { "epoch": 2.000901103852219, "grad_norm": 0.5257198403011223, "learning_rate": 4.899423602525609e-06, "loss": 0.5379, "step": 4465 }, { "epoch": 2.001351655778328, "grad_norm": 0.512135245974166, "learning_rate": 4.895360697855674e-06, "loss": 0.5304, "step": 4466 }, { "epoch": 2.001802207704438, "grad_norm": 0.5195130732731924, "learning_rate": 4.8912989324422164e-06, "loss": 0.5341, "step": 4467 }, { "epoch": 2.0022527596305473, "grad_norm": 0.5048121028954845, "learning_rate": 4.887238307191745e-06, "loss": 0.5295, "step": 4468 }, { "epoch": 2.002703311556657, "grad_norm": 0.5223375599220555, "learning_rate": 4.88317882301051e-06, "loss": 0.5405, "step": 4469 }, { "epoch": 2.0031538634827664, "grad_norm": 0.5315626472053352, "learning_rate": 4.879120480804511e-06, "loss": 0.5394, "step": 4470 }, { "epoch": 2.003604415408876, "grad_norm": 0.5191328242285952, "learning_rate": 4.875063281479492e-06, "loss": 0.5467, "step": 4471 }, { "epoch": 2.0040549673349854, "grad_norm": 0.5020270880705188, "learning_rate": 4.87100722594094e-06, "loss": 0.5266, "step": 4472 }, { "epoch": 2.0045055192610945, "grad_norm": 0.5232986709789184, "learning_rate": 4.866952315094088e-06, "loss": 0.5623, "step": 4473 }, { "epoch": 2.0049560711872045, "grad_norm": 0.5185636563953671, "learning_rate": 4.862898549843913e-06, "loss": 0.5449, "step": 4474 }, { "epoch": 2.0054066231133136, "grad_norm": 0.5097338471411587, "learning_rate": 4.8588459310951386e-06, "loss": 0.5265, "step": 4475 }, { "epoch": 2.0058571750394236, "grad_norm": 0.510510795642854, "learning_rate": 4.854794459752229e-06, "loss": 0.5498, "step": 4476 }, { "epoch": 2.0063077269655327, "grad_norm": 0.5281953039921821, "learning_rate": 4.850744136719395e-06, "loss": 0.5356, "step": 4477 }, { "epoch": 2.0067582788916423, "grad_norm": 0.5209208194261528, "learning_rate": 4.84669496290059e-06, "loss": 0.5218, "step": 4478 }, { "epoch": 2.007208830817752, "grad_norm": 0.5247781721288102, "learning_rate": 4.84264693919951e-06, "loss": 0.5913, "step": 4479 }, { "epoch": 2.007659382743861, "grad_norm": 0.5119036213332361, "learning_rate": 4.838600066519597e-06, "loss": 0.5279, "step": 4480 }, { "epoch": 2.008109934669971, "grad_norm": 0.5239859135644768, "learning_rate": 4.834554345764032e-06, "loss": 0.5539, "step": 4481 }, { "epoch": 2.00856048659608, "grad_norm": 0.5315751923059072, "learning_rate": 4.8305097778357445e-06, "loss": 0.5332, "step": 4482 }, { "epoch": 2.00901103852219, "grad_norm": 0.5078949303356991, "learning_rate": 4.826466363637402e-06, "loss": 0.5062, "step": 4483 }, { "epoch": 2.009461590448299, "grad_norm": 0.5216215386478403, "learning_rate": 4.822424104071416e-06, "loss": 0.5149, "step": 4484 }, { "epoch": 2.0099121423744086, "grad_norm": 0.5124119174063597, "learning_rate": 4.818383000039945e-06, "loss": 0.5368, "step": 4485 }, { "epoch": 2.010362694300518, "grad_norm": 0.5356638685975634, "learning_rate": 4.814343052444872e-06, "loss": 0.545, "step": 4486 }, { "epoch": 2.0108132462266277, "grad_norm": 0.5344115977238979, "learning_rate": 4.8103042621878515e-06, "loss": 0.5354, "step": 4487 }, { "epoch": 2.0004505519261095, "grad_norm": 1.2848462964969716, "learning_rate": 4.806266630170249e-06, "loss": 0.4066, "step": 4488 }, { "epoch": 2.000901103852219, "grad_norm": 1.0568407713424737, "learning_rate": 4.802230157293198e-06, "loss": 0.425, "step": 4489 }, { "epoch": 2.0013516557783286, "grad_norm": 0.8021529252318079, "learning_rate": 4.798194844457547e-06, "loss": 0.4218, "step": 4490 }, { "epoch": 2.0018022077044377, "grad_norm": 1.0980676358453099, "learning_rate": 4.794160692563917e-06, "loss": 0.4105, "step": 4491 }, { "epoch": 2.0022527596305473, "grad_norm": 1.1114321044626532, "learning_rate": 4.7901277025126345e-06, "loss": 0.3904, "step": 4492 }, { "epoch": 2.002703311556657, "grad_norm": 0.7476690839218882, "learning_rate": 4.7860958752038e-06, "loss": 0.4114, "step": 4493 }, { "epoch": 2.0031538634827664, "grad_norm": 0.8812375503066272, "learning_rate": 4.782065211537226e-06, "loss": 0.4187, "step": 4494 }, { "epoch": 2.003604415408876, "grad_norm": 1.0394994994959186, "learning_rate": 4.778035712412491e-06, "loss": 0.3857, "step": 4495 }, { "epoch": 2.0040549673349854, "grad_norm": 0.9072278290775441, "learning_rate": 4.774007378728891e-06, "loss": 0.432, "step": 4496 }, { "epoch": 2.004505519261095, "grad_norm": 0.6670611531114012, "learning_rate": 4.769980211385477e-06, "loss": 0.3972, "step": 4497 }, { "epoch": 2.0049560711872045, "grad_norm": 0.7946780553910081, "learning_rate": 4.765954211281032e-06, "loss": 0.4056, "step": 4498 }, { "epoch": 2.0054066231133136, "grad_norm": 0.8943654941708843, "learning_rate": 4.7619293793140816e-06, "loss": 0.3915, "step": 4499 }, { "epoch": 2.005857175039423, "grad_norm": 0.8656595828762265, "learning_rate": 4.7579057163828926e-06, "loss": 0.3975, "step": 4500 }, { "epoch": 2.0063077269655327, "grad_norm": 0.6847716082672989, "learning_rate": 4.753883223385467e-06, "loss": 0.4053, "step": 4501 }, { "epoch": 2.0067582788916423, "grad_norm": 0.6379088388266605, "learning_rate": 4.749861901219546e-06, "loss": 0.3803, "step": 4502 }, { "epoch": 2.007208830817752, "grad_norm": 0.6815783217600815, "learning_rate": 4.745841750782612e-06, "loss": 0.3974, "step": 4503 }, { "epoch": 2.0076593827438614, "grad_norm": 0.7132858774477419, "learning_rate": 4.741822772971886e-06, "loss": 0.3983, "step": 4504 }, { "epoch": 2.008109934669971, "grad_norm": 0.5956399161403498, "learning_rate": 4.737804968684323e-06, "loss": 0.3544, "step": 4505 }, { "epoch": 2.00856048659608, "grad_norm": 0.7008229950857285, "learning_rate": 4.73378833881662e-06, "loss": 0.4075, "step": 4506 }, { "epoch": 2.0090110385221895, "grad_norm": 0.6795962510066383, "learning_rate": 4.729772884265212e-06, "loss": 0.3763, "step": 4507 }, { "epoch": 2.009461590448299, "grad_norm": 0.6516137749857617, "learning_rate": 4.7257586059262706e-06, "loss": 0.3743, "step": 4508 }, { "epoch": 2.0099121423744086, "grad_norm": 0.6482536082131913, "learning_rate": 4.721745504695703e-06, "loss": 0.4104, "step": 4509 }, { "epoch": 2.010362694300518, "grad_norm": 0.5969362793429075, "learning_rate": 4.7177335814691575e-06, "loss": 0.3935, "step": 4510 }, { "epoch": 2.0108132462266277, "grad_norm": 0.7156817086972692, "learning_rate": 4.7137228371420195e-06, "loss": 0.4026, "step": 4511 }, { "epoch": 2.0112637981527373, "grad_norm": 0.6687587731307854, "learning_rate": 4.7097132726093986e-06, "loss": 0.4068, "step": 4512 }, { "epoch": 2.011714350078847, "grad_norm": 0.5641124499452929, "learning_rate": 4.705704888766168e-06, "loss": 0.3848, "step": 4513 }, { "epoch": 2.012164902004956, "grad_norm": 0.6093462517740866, "learning_rate": 4.701697686506906e-06, "loss": 0.391, "step": 4514 }, { "epoch": 2.0126154539310654, "grad_norm": 0.6401968783403406, "learning_rate": 4.6976916667259555e-06, "loss": 0.371, "step": 4515 }, { "epoch": 2.013066005857175, "grad_norm": 0.6861498806988882, "learning_rate": 4.69368683031737e-06, "loss": 0.4166, "step": 4516 }, { "epoch": 2.0135165577832845, "grad_norm": 0.6100633069589135, "learning_rate": 4.689683178174964e-06, "loss": 0.3907, "step": 4517 }, { "epoch": 2.013967109709394, "grad_norm": 0.6018384556296738, "learning_rate": 4.685680711192262e-06, "loss": 0.4186, "step": 4518 }, { "epoch": 2.0144176616355036, "grad_norm": 0.6184457477361431, "learning_rate": 4.68167943026255e-06, "loss": 0.4032, "step": 4519 }, { "epoch": 2.014868213561613, "grad_norm": 0.6171876945481428, "learning_rate": 4.6776793362788235e-06, "loss": 0.3931, "step": 4520 }, { "epoch": 2.0153187654877223, "grad_norm": 0.5932542010741636, "learning_rate": 4.673680430133839e-06, "loss": 0.401, "step": 4521 }, { "epoch": 2.015769317413832, "grad_norm": 0.568290507055158, "learning_rate": 4.669682712720065e-06, "loss": 0.4138, "step": 4522 }, { "epoch": 2.0162198693399414, "grad_norm": 0.5727372527643468, "learning_rate": 4.665686184929718e-06, "loss": 0.4109, "step": 4523 }, { "epoch": 2.016670421266051, "grad_norm": 0.5851867105397297, "learning_rate": 4.661690847654743e-06, "loss": 0.3943, "step": 4524 }, { "epoch": 2.0171209731921604, "grad_norm": 0.5858582559585471, "learning_rate": 4.657696701786822e-06, "loss": 0.3981, "step": 4525 }, { "epoch": 2.01757152511827, "grad_norm": 0.5682622220976644, "learning_rate": 4.653703748217379e-06, "loss": 0.4086, "step": 4526 }, { "epoch": 2.0180220770443795, "grad_norm": 0.5761598600093105, "learning_rate": 4.64971198783755e-06, "loss": 0.395, "step": 4527 }, { "epoch": 2.018472628970489, "grad_norm": 0.5653873694550637, "learning_rate": 4.645721421538234e-06, "loss": 0.3934, "step": 4528 }, { "epoch": 2.018923180896598, "grad_norm": 0.5714311261808925, "learning_rate": 4.641732050210032e-06, "loss": 0.4002, "step": 4529 }, { "epoch": 2.0193737328227077, "grad_norm": 0.5732988045603173, "learning_rate": 4.63774387474331e-06, "loss": 0.4123, "step": 4530 }, { "epoch": 2.0198242847488173, "grad_norm": 0.5817283469759105, "learning_rate": 4.63375689602814e-06, "loss": 0.4014, "step": 4531 }, { "epoch": 2.020274836674927, "grad_norm": 0.5621626635248819, "learning_rate": 4.629771114954341e-06, "loss": 0.3816, "step": 4532 }, { "epoch": 2.0207253886010363, "grad_norm": 0.5859680062768254, "learning_rate": 4.625786532411464e-06, "loss": 0.4067, "step": 4533 }, { "epoch": 2.021175940527146, "grad_norm": 0.5531620650939287, "learning_rate": 4.62180314928879e-06, "loss": 0.3941, "step": 4534 }, { "epoch": 2.0216264924532554, "grad_norm": 0.5690429559442798, "learning_rate": 4.6178209664753304e-06, "loss": 0.4006, "step": 4535 }, { "epoch": 2.0220770443793645, "grad_norm": 0.5911289218024006, "learning_rate": 4.613839984859835e-06, "loss": 0.4163, "step": 4536 }, { "epoch": 2.022527596305474, "grad_norm": 0.5660400283412714, "learning_rate": 4.609860205330778e-06, "loss": 0.3817, "step": 4537 }, { "epoch": 2.0229781482315836, "grad_norm": 0.5810592013060023, "learning_rate": 4.605881628776373e-06, "loss": 0.4161, "step": 4538 }, { "epoch": 2.023428700157693, "grad_norm": 0.5791936620418332, "learning_rate": 4.601904256084557e-06, "loss": 0.4032, "step": 4539 }, { "epoch": 2.0238792520838027, "grad_norm": 0.5490847895249685, "learning_rate": 4.597928088143005e-06, "loss": 0.3864, "step": 4540 }, { "epoch": 2.0243298040099122, "grad_norm": 0.5836993067241962, "learning_rate": 4.593953125839121e-06, "loss": 0.4205, "step": 4541 }, { "epoch": 2.024780355936022, "grad_norm": 0.5632957700473622, "learning_rate": 4.589979370060037e-06, "loss": 0.3984, "step": 4542 }, { "epoch": 2.025230907862131, "grad_norm": 0.5770887047116009, "learning_rate": 4.58600682169262e-06, "loss": 0.4088, "step": 4543 }, { "epoch": 2.0256814597882404, "grad_norm": 0.5934023971323861, "learning_rate": 4.582035481623466e-06, "loss": 0.3919, "step": 4544 }, { "epoch": 2.02613201171435, "grad_norm": 0.5788446673836358, "learning_rate": 4.578065350738899e-06, "loss": 0.3869, "step": 4545 }, { "epoch": 2.0265825636404595, "grad_norm": 0.6029435521015845, "learning_rate": 4.574096429924982e-06, "loss": 0.3742, "step": 4546 }, { "epoch": 2.027033115566569, "grad_norm": 0.5910218570159668, "learning_rate": 4.570128720067487e-06, "loss": 0.4153, "step": 4547 }, { "epoch": 2.0274836674926786, "grad_norm": 0.5956397821700133, "learning_rate": 4.566162222051946e-06, "loss": 0.3849, "step": 4548 }, { "epoch": 2.027934219418788, "grad_norm": 0.5600839924840783, "learning_rate": 4.562196936763591e-06, "loss": 0.3991, "step": 4549 }, { "epoch": 2.0283847713448977, "grad_norm": 0.5482892678422371, "learning_rate": 4.5582328650874095e-06, "loss": 0.3686, "step": 4550 }, { "epoch": 2.028835323271007, "grad_norm": 0.5735577139195354, "learning_rate": 4.55427000790809e-06, "loss": 0.3747, "step": 4551 }, { "epoch": 2.0292858751971163, "grad_norm": 0.5672098717512212, "learning_rate": 4.550308366110083e-06, "loss": 0.3838, "step": 4552 }, { "epoch": 2.029736427123226, "grad_norm": 0.5707164997670621, "learning_rate": 4.546347940577533e-06, "loss": 0.4111, "step": 4553 }, { "epoch": 2.0301869790493354, "grad_norm": 0.5624106135042881, "learning_rate": 4.542388732194348e-06, "loss": 0.3708, "step": 4554 }, { "epoch": 2.030637530975445, "grad_norm": 0.5723828439122425, "learning_rate": 4.538430741844127e-06, "loss": 0.4005, "step": 4555 }, { "epoch": 2.0310880829015545, "grad_norm": 0.5770688145664179, "learning_rate": 4.534473970410235e-06, "loss": 0.4346, "step": 4556 }, { "epoch": 2.031538634827664, "grad_norm": 0.6177525492856386, "learning_rate": 4.530518418775734e-06, "loss": 0.3893, "step": 4557 }, { "epoch": 2.031989186753773, "grad_norm": 0.5638171404311, "learning_rate": 4.526564087823429e-06, "loss": 0.397, "step": 4558 }, { "epoch": 2.0324397386798827, "grad_norm": 0.5716978254231477, "learning_rate": 4.522610978435854e-06, "loss": 0.3986, "step": 4559 }, { "epoch": 2.0328902906059922, "grad_norm": 0.5741177856513203, "learning_rate": 4.518659091495263e-06, "loss": 0.3868, "step": 4560 }, { "epoch": 2.033340842532102, "grad_norm": 0.5757285209713503, "learning_rate": 4.514708427883642e-06, "loss": 0.4057, "step": 4561 }, { "epoch": 2.0337913944582113, "grad_norm": 0.5642004168702129, "learning_rate": 4.510758988482701e-06, "loss": 0.3792, "step": 4562 }, { "epoch": 2.034241946384321, "grad_norm": 0.5868619763319799, "learning_rate": 4.50681077417388e-06, "loss": 0.38, "step": 4563 }, { "epoch": 2.0346924983104304, "grad_norm": 0.5996030686441215, "learning_rate": 4.5028637858383415e-06, "loss": 0.3869, "step": 4564 }, { "epoch": 2.03514305023654, "grad_norm": 0.562875690390852, "learning_rate": 4.49891802435698e-06, "loss": 0.4048, "step": 4565 }, { "epoch": 2.035593602162649, "grad_norm": 0.5809392776545658, "learning_rate": 4.4949734906104096e-06, "loss": 0.4061, "step": 4566 }, { "epoch": 2.0360441540887586, "grad_norm": 0.5973407443848463, "learning_rate": 4.491030185478976e-06, "loss": 0.3802, "step": 4567 }, { "epoch": 2.036494706014868, "grad_norm": 0.593076562622907, "learning_rate": 4.4870881098427475e-06, "loss": 0.3832, "step": 4568 }, { "epoch": 2.0369452579409777, "grad_norm": 0.5671705469419875, "learning_rate": 4.4831472645815184e-06, "loss": 0.3945, "step": 4569 }, { "epoch": 2.0373958098670872, "grad_norm": 0.5802242526831863, "learning_rate": 4.479207650574812e-06, "loss": 0.3934, "step": 4570 }, { "epoch": 2.037846361793197, "grad_norm": 0.5697031964254567, "learning_rate": 4.4752692687018685e-06, "loss": 0.376, "step": 4571 }, { "epoch": 2.0382969137193063, "grad_norm": 0.5797421352157516, "learning_rate": 4.471332119841667e-06, "loss": 0.4136, "step": 4572 }, { "epoch": 2.0387474656454154, "grad_norm": 0.568279297798103, "learning_rate": 4.467396204872888e-06, "loss": 0.3774, "step": 4573 }, { "epoch": 2.039198017571525, "grad_norm": 0.5867218563245078, "learning_rate": 4.463461524673966e-06, "loss": 0.4186, "step": 4574 }, { "epoch": 2.0396485694976345, "grad_norm": 0.5883722296149173, "learning_rate": 4.459528080123038e-06, "loss": 0.39, "step": 4575 }, { "epoch": 2.040099121423744, "grad_norm": 0.5904550106577043, "learning_rate": 4.455595872097974e-06, "loss": 0.4209, "step": 4576 }, { "epoch": 2.0405496733498536, "grad_norm": 0.5879255564441809, "learning_rate": 4.451664901476367e-06, "loss": 0.4366, "step": 4577 }, { "epoch": 2.041000225275963, "grad_norm": 0.5702691470443074, "learning_rate": 4.447735169135533e-06, "loss": 0.3984, "step": 4578 }, { "epoch": 2.0414507772020727, "grad_norm": 0.6053744003114393, "learning_rate": 4.443806675952513e-06, "loss": 0.4151, "step": 4579 }, { "epoch": 2.0419013291281822, "grad_norm": 0.5742856316282194, "learning_rate": 4.439879422804069e-06, "loss": 0.3888, "step": 4580 }, { "epoch": 2.0423518810542913, "grad_norm": 0.5769303676986124, "learning_rate": 4.435953410566693e-06, "loss": 0.3728, "step": 4581 }, { "epoch": 2.042802432980401, "grad_norm": 0.591058469170863, "learning_rate": 4.432028640116581e-06, "loss": 0.4069, "step": 4582 }, { "epoch": 2.0432529849065104, "grad_norm": 0.6049166485493924, "learning_rate": 4.428105112329683e-06, "loss": 0.4091, "step": 4583 }, { "epoch": 2.04370353683262, "grad_norm": 0.6001039924640544, "learning_rate": 4.424182828081639e-06, "loss": 0.4046, "step": 4584 }, { "epoch": 2.0441540887587295, "grad_norm": 0.5708665254011653, "learning_rate": 4.420261788247841e-06, "loss": 0.3799, "step": 4585 }, { "epoch": 2.044604640684839, "grad_norm": 0.5751796308539967, "learning_rate": 4.416341993703373e-06, "loss": 0.4074, "step": 4586 }, { "epoch": 2.0450551926109486, "grad_norm": 0.5940803712101227, "learning_rate": 4.412423445323075e-06, "loss": 0.4221, "step": 4587 }, { "epoch": 2.0455057445370577, "grad_norm": 0.5900240692691335, "learning_rate": 4.408506143981475e-06, "loss": 0.4034, "step": 4588 }, { "epoch": 2.0459562964631672, "grad_norm": 0.586410333521068, "learning_rate": 4.404590090552853e-06, "loss": 0.401, "step": 4589 }, { "epoch": 2.046406848389277, "grad_norm": 0.5725798191966015, "learning_rate": 4.40067528591118e-06, "loss": 0.3903, "step": 4590 }, { "epoch": 2.0468574003153863, "grad_norm": 0.5683985352304429, "learning_rate": 4.396761730930181e-06, "loss": 0.4166, "step": 4591 }, { "epoch": 2.047307952241496, "grad_norm": 0.5803746463632783, "learning_rate": 4.392849426483275e-06, "loss": 0.3998, "step": 4592 }, { "epoch": 2.0477585041676054, "grad_norm": 0.6023718332423844, "learning_rate": 4.388938373443614e-06, "loss": 0.415, "step": 4593 }, { "epoch": 2.048209056093715, "grad_norm": 0.5866725885395617, "learning_rate": 4.385028572684072e-06, "loss": 0.3864, "step": 4594 }, { "epoch": 2.0486596080198245, "grad_norm": 0.5688405754366731, "learning_rate": 4.381120025077238e-06, "loss": 0.4079, "step": 4595 }, { "epoch": 2.0491101599459336, "grad_norm": 0.6017400561495801, "learning_rate": 4.377212731495425e-06, "loss": 0.4067, "step": 4596 }, { "epoch": 2.049560711872043, "grad_norm": 0.5996031499922223, "learning_rate": 4.373306692810666e-06, "loss": 0.4089, "step": 4597 }, { "epoch": 2.0500112637981527, "grad_norm": 0.5933075583723831, "learning_rate": 4.3694019098947125e-06, "loss": 0.3768, "step": 4598 }, { "epoch": 2.0504618157242622, "grad_norm": 0.6018671902395696, "learning_rate": 4.365498383619036e-06, "loss": 0.3895, "step": 4599 }, { "epoch": 2.0509123676503718, "grad_norm": 0.5831579257942652, "learning_rate": 4.361596114854828e-06, "loss": 0.4282, "step": 4600 }, { "epoch": 2.0513629195764813, "grad_norm": 0.5847895987198106, "learning_rate": 4.357695104472999e-06, "loss": 0.4335, "step": 4601 }, { "epoch": 2.051813471502591, "grad_norm": 0.6086524706766395, "learning_rate": 4.353795353344179e-06, "loss": 0.4139, "step": 4602 }, { "epoch": 2.0522640234287, "grad_norm": 0.5795618639651681, "learning_rate": 4.349896862338717e-06, "loss": 0.3936, "step": 4603 }, { "epoch": 2.0527145753548095, "grad_norm": 0.5870982188715521, "learning_rate": 4.345999632326681e-06, "loss": 0.3993, "step": 4604 }, { "epoch": 2.053165127280919, "grad_norm": 0.556578642496439, "learning_rate": 4.342103664177856e-06, "loss": 0.3694, "step": 4605 }, { "epoch": 2.0536156792070286, "grad_norm": 0.5748644749920336, "learning_rate": 4.338208958761747e-06, "loss": 0.3762, "step": 4606 }, { "epoch": 2.054066231133138, "grad_norm": 0.5759338861698118, "learning_rate": 4.33431551694758e-06, "loss": 0.4127, "step": 4607 }, { "epoch": 2.0545167830592477, "grad_norm": 0.5939476043821671, "learning_rate": 4.330423339604286e-06, "loss": 0.422, "step": 4608 }, { "epoch": 2.054967334985357, "grad_norm": 0.5699132333894673, "learning_rate": 4.326532427600537e-06, "loss": 0.4118, "step": 4609 }, { "epoch": 2.0554178869114668, "grad_norm": 0.5963199632044953, "learning_rate": 4.322642781804696e-06, "loss": 0.4074, "step": 4610 }, { "epoch": 2.055868438837576, "grad_norm": 0.5789890963363736, "learning_rate": 4.318754403084868e-06, "loss": 0.4088, "step": 4611 }, { "epoch": 2.0563189907636854, "grad_norm": 0.5785830322589379, "learning_rate": 4.314867292308852e-06, "loss": 0.3803, "step": 4612 }, { "epoch": 2.056769542689795, "grad_norm": 0.5735151009789885, "learning_rate": 4.3109814503441894e-06, "loss": 0.4212, "step": 4613 }, { "epoch": 2.0572200946159045, "grad_norm": 0.6034165969805985, "learning_rate": 4.307096878058109e-06, "loss": 0.3704, "step": 4614 }, { "epoch": 2.057670646542014, "grad_norm": 0.5827173583490937, "learning_rate": 4.303213576317589e-06, "loss": 0.4176, "step": 4615 }, { "epoch": 2.0581211984681236, "grad_norm": 0.5695398555866008, "learning_rate": 4.2993315459892905e-06, "loss": 0.3701, "step": 4616 }, { "epoch": 2.058571750394233, "grad_norm": 0.5935024207961399, "learning_rate": 4.295450787939622e-06, "loss": 0.3962, "step": 4617 }, { "epoch": 2.0590223023203422, "grad_norm": 0.6086436608054913, "learning_rate": 4.291571303034684e-06, "loss": 0.4175, "step": 4618 }, { "epoch": 2.0594728542464518, "grad_norm": 0.6015802095660616, "learning_rate": 4.287693092140305e-06, "loss": 0.4206, "step": 4619 }, { "epoch": 2.0599234061725613, "grad_norm": 0.5924392940494901, "learning_rate": 4.2838161561220245e-06, "loss": 0.3663, "step": 4620 }, { "epoch": 2.060373958098671, "grad_norm": 0.5998108866546945, "learning_rate": 4.279940495845104e-06, "loss": 0.3545, "step": 4621 }, { "epoch": 2.0608245100247804, "grad_norm": 0.6139682097398589, "learning_rate": 4.276066112174512e-06, "loss": 0.38, "step": 4622 }, { "epoch": 2.06127506195089, "grad_norm": 0.5955149697972535, "learning_rate": 4.272193005974932e-06, "loss": 0.3983, "step": 4623 }, { "epoch": 2.0617256138769995, "grad_norm": 0.5989141039379533, "learning_rate": 4.268321178110779e-06, "loss": 0.4166, "step": 4624 }, { "epoch": 2.062176165803109, "grad_norm": 0.6113636413432081, "learning_rate": 4.264450629446155e-06, "loss": 0.4228, "step": 4625 }, { "epoch": 2.062626717729218, "grad_norm": 0.574384325178668, "learning_rate": 4.260581360844906e-06, "loss": 0.3951, "step": 4626 }, { "epoch": 2.0630772696553277, "grad_norm": 0.5830788014174236, "learning_rate": 4.256713373170565e-06, "loss": 0.4076, "step": 4627 }, { "epoch": 2.063527821581437, "grad_norm": 0.5664892824960513, "learning_rate": 4.252846667286396e-06, "loss": 0.3877, "step": 4628 }, { "epoch": 2.0639783735075468, "grad_norm": 0.6022042663160331, "learning_rate": 4.248981244055376e-06, "loss": 0.3674, "step": 4629 }, { "epoch": 2.0644289254336563, "grad_norm": 0.612528991989187, "learning_rate": 4.245117104340188e-06, "loss": 0.4095, "step": 4630 }, { "epoch": 2.064879477359766, "grad_norm": 0.5815925829932288, "learning_rate": 4.241254249003236e-06, "loss": 0.3999, "step": 4631 }, { "epoch": 2.0653300292858754, "grad_norm": 0.5808629646972634, "learning_rate": 4.237392678906633e-06, "loss": 0.4151, "step": 4632 }, { "epoch": 2.0657805812119845, "grad_norm": 0.7985436249336076, "learning_rate": 4.233532394912208e-06, "loss": 0.4068, "step": 4633 }, { "epoch": 2.066231133138094, "grad_norm": 0.5985226566859502, "learning_rate": 4.2296733978815e-06, "loss": 0.3912, "step": 4634 }, { "epoch": 2.0666816850642036, "grad_norm": 0.5807591302386614, "learning_rate": 4.225815688675762e-06, "loss": 0.4125, "step": 4635 }, { "epoch": 2.067132236990313, "grad_norm": 0.5901841942020605, "learning_rate": 4.22195926815596e-06, "loss": 0.4127, "step": 4636 }, { "epoch": 2.0675827889164227, "grad_norm": 0.5939745047630635, "learning_rate": 4.218104137182775e-06, "loss": 0.4223, "step": 4637 }, { "epoch": 2.068033340842532, "grad_norm": 0.5990252451881818, "learning_rate": 4.214250296616593e-06, "loss": 0.4152, "step": 4638 }, { "epoch": 2.0684838927686418, "grad_norm": 0.5921212447221815, "learning_rate": 4.21039774731752e-06, "loss": 0.3996, "step": 4639 }, { "epoch": 2.068934444694751, "grad_norm": 0.5646264660332186, "learning_rate": 4.2065464901453705e-06, "loss": 0.392, "step": 4640 }, { "epoch": 2.0693849966208604, "grad_norm": 0.5776904069894329, "learning_rate": 4.202696525959667e-06, "loss": 0.3756, "step": 4641 }, { "epoch": 2.06983554854697, "grad_norm": 0.5763295798663877, "learning_rate": 4.198847855619652e-06, "loss": 0.3839, "step": 4642 }, { "epoch": 2.0702861004730795, "grad_norm": 0.5755720299789334, "learning_rate": 4.195000479984264e-06, "loss": 0.4138, "step": 4643 }, { "epoch": 2.070736652399189, "grad_norm": 0.5669950182875053, "learning_rate": 4.191154399912178e-06, "loss": 0.3655, "step": 4644 }, { "epoch": 2.0711872043252986, "grad_norm": 0.6044967653857386, "learning_rate": 4.1873096162617474e-06, "loss": 0.392, "step": 4645 }, { "epoch": 2.071637756251408, "grad_norm": 0.6162783745512435, "learning_rate": 4.18346612989107e-06, "loss": 0.4031, "step": 4646 }, { "epoch": 2.0720883081775177, "grad_norm": 0.6025440795950548, "learning_rate": 4.179623941657922e-06, "loss": 0.3953, "step": 4647 }, { "epoch": 2.0725388601036268, "grad_norm": 0.5649900953267262, "learning_rate": 4.17578305241982e-06, "loss": 0.3854, "step": 4648 }, { "epoch": 2.0729894120297363, "grad_norm": 0.603142606842484, "learning_rate": 4.1719434630339606e-06, "loss": 0.4206, "step": 4649 }, { "epoch": 2.073439963955846, "grad_norm": 0.6243754766908121, "learning_rate": 4.1681051743572805e-06, "loss": 0.3971, "step": 4650 }, { "epoch": 2.0738905158819554, "grad_norm": 0.5726148660158243, "learning_rate": 4.1642681872463985e-06, "loss": 0.4174, "step": 4651 }, { "epoch": 2.074341067808065, "grad_norm": 0.5786693671090135, "learning_rate": 4.160432502557667e-06, "loss": 0.397, "step": 4652 }, { "epoch": 2.0747916197341745, "grad_norm": 0.6142285486300989, "learning_rate": 4.156598121147128e-06, "loss": 0.3994, "step": 4653 }, { "epoch": 2.075242171660284, "grad_norm": 0.5781132306590844, "learning_rate": 4.1527650438705455e-06, "loss": 0.3928, "step": 4654 }, { "epoch": 2.075692723586393, "grad_norm": 0.6007108462522683, "learning_rate": 4.148933271583385e-06, "loss": 0.4153, "step": 4655 }, { "epoch": 2.0761432755125027, "grad_norm": 0.5756745505247222, "learning_rate": 4.145102805140826e-06, "loss": 0.3918, "step": 4656 }, { "epoch": 2.076593827438612, "grad_norm": 0.5901173770318486, "learning_rate": 4.1412736453977545e-06, "loss": 0.3959, "step": 4657 }, { "epoch": 2.0770443793647217, "grad_norm": 0.6039181415617274, "learning_rate": 4.137445793208762e-06, "loss": 0.3846, "step": 4658 }, { "epoch": 2.0774949312908313, "grad_norm": 0.5570372684301453, "learning_rate": 4.1336192494281535e-06, "loss": 0.3896, "step": 4659 }, { "epoch": 2.077945483216941, "grad_norm": 0.5921542765034724, "learning_rate": 4.129794014909939e-06, "loss": 0.4009, "step": 4660 }, { "epoch": 2.0783960351430504, "grad_norm": 0.6051565079576516, "learning_rate": 4.125970090507836e-06, "loss": 0.3963, "step": 4661 }, { "epoch": 2.07884658706916, "grad_norm": 0.5816396992035149, "learning_rate": 4.12214747707527e-06, "loss": 0.4017, "step": 4662 }, { "epoch": 2.079297138995269, "grad_norm": 0.6009918880304821, "learning_rate": 4.118326175465375e-06, "loss": 0.4235, "step": 4663 }, { "epoch": 2.0797476909213786, "grad_norm": 0.5774828221832209, "learning_rate": 4.11450618653099e-06, "loss": 0.3554, "step": 4664 }, { "epoch": 2.080198242847488, "grad_norm": 0.5662971891671189, "learning_rate": 4.110687511124665e-06, "loss": 0.3956, "step": 4665 }, { "epoch": 2.0806487947735977, "grad_norm": 0.5898560262522922, "learning_rate": 4.1068701500986505e-06, "loss": 0.4086, "step": 4666 }, { "epoch": 2.081099346699707, "grad_norm": 0.5631942052436351, "learning_rate": 4.1030541043049125e-06, "loss": 0.4036, "step": 4667 }, { "epoch": 2.0815498986258167, "grad_norm": 0.6015975829336786, "learning_rate": 4.099239374595116e-06, "loss": 0.4097, "step": 4668 }, { "epoch": 2.0820004505519263, "grad_norm": 0.5839536025758051, "learning_rate": 4.0954259618206295e-06, "loss": 0.4074, "step": 4669 }, { "epoch": 2.0824510024780354, "grad_norm": 0.5868370695477143, "learning_rate": 4.091613866832544e-06, "loss": 0.4081, "step": 4670 }, { "epoch": 2.082901554404145, "grad_norm": 0.5773978223813189, "learning_rate": 4.0878030904816315e-06, "loss": 0.4053, "step": 4671 }, { "epoch": 2.0833521063302545, "grad_norm": 0.5732042206629484, "learning_rate": 4.083993633618394e-06, "loss": 0.3875, "step": 4672 }, { "epoch": 2.083802658256364, "grad_norm": 0.5859067776947257, "learning_rate": 4.080185497093024e-06, "loss": 0.4028, "step": 4673 }, { "epoch": 2.0842532101824736, "grad_norm": 0.5784452894189661, "learning_rate": 4.076378681755425e-06, "loss": 0.3803, "step": 4674 }, { "epoch": 2.084703762108583, "grad_norm": 0.5957959640209213, "learning_rate": 4.072573188455204e-06, "loss": 0.3923, "step": 4675 }, { "epoch": 2.0851543140346926, "grad_norm": 0.5697215566055502, "learning_rate": 4.068769018041674e-06, "loss": 0.3871, "step": 4676 }, { "epoch": 2.085604865960802, "grad_norm": 0.5770676919124693, "learning_rate": 4.064966171363854e-06, "loss": 0.3726, "step": 4677 }, { "epoch": 2.0860554178869113, "grad_norm": 0.5921514276157516, "learning_rate": 4.061164649270457e-06, "loss": 0.3756, "step": 4678 }, { "epoch": 2.086505969813021, "grad_norm": 0.599005389712974, "learning_rate": 4.057364452609921e-06, "loss": 0.4206, "step": 4679 }, { "epoch": 2.0869565217391304, "grad_norm": 0.6016150560091578, "learning_rate": 4.053565582230362e-06, "loss": 0.4005, "step": 4680 }, { "epoch": 2.08740707366524, "grad_norm": 0.5880343860942134, "learning_rate": 4.049768038979631e-06, "loss": 0.404, "step": 4681 }, { "epoch": 2.0878576255913495, "grad_norm": 0.5888585894028093, "learning_rate": 4.045971823705249e-06, "loss": 0.3967, "step": 4682 }, { "epoch": 2.088308177517459, "grad_norm": 0.6252973406535438, "learning_rate": 4.042176937254474e-06, "loss": 0.3851, "step": 4683 }, { "epoch": 2.0887587294435686, "grad_norm": 0.5956083080150711, "learning_rate": 4.0383833804742355e-06, "loss": 0.3764, "step": 4684 }, { "epoch": 2.0892092813696777, "grad_norm": 0.5635824691553976, "learning_rate": 4.034591154211196e-06, "loss": 0.4016, "step": 4685 }, { "epoch": 2.089659833295787, "grad_norm": 0.6068325053521619, "learning_rate": 4.030800259311693e-06, "loss": 0.3911, "step": 4686 }, { "epoch": 2.0901103852218967, "grad_norm": 0.5626680211480438, "learning_rate": 4.027010696621795e-06, "loss": 0.3637, "step": 4687 }, { "epoch": 2.0905609371480063, "grad_norm": 0.5943278004026171, "learning_rate": 4.023222466987248e-06, "loss": 0.4074, "step": 4688 }, { "epoch": 2.091011489074116, "grad_norm": 0.6253396331693865, "learning_rate": 4.019435571253514e-06, "loss": 0.4008, "step": 4689 }, { "epoch": 2.0914620410002254, "grad_norm": 0.6061421073222825, "learning_rate": 4.015650010265757e-06, "loss": 0.4018, "step": 4690 }, { "epoch": 2.091912592926335, "grad_norm": 0.6038480366732768, "learning_rate": 4.011865784868839e-06, "loss": 0.4016, "step": 4691 }, { "epoch": 2.0923631448524445, "grad_norm": 0.5908275625493723, "learning_rate": 4.008082895907326e-06, "loss": 0.3691, "step": 4692 }, { "epoch": 2.0928136967785536, "grad_norm": 0.5985748499975037, "learning_rate": 4.004301344225485e-06, "loss": 0.3982, "step": 4693 }, { "epoch": 2.093264248704663, "grad_norm": 0.5942656642019758, "learning_rate": 4.000521130667288e-06, "loss": 0.3902, "step": 4694 }, { "epoch": 2.0937148006307726, "grad_norm": 0.5649804869097176, "learning_rate": 3.9967422560764e-06, "loss": 0.4044, "step": 4695 }, { "epoch": 2.094165352556882, "grad_norm": 0.5834468525861481, "learning_rate": 3.992964721296197e-06, "loss": 0.4066, "step": 4696 }, { "epoch": 2.0946159044829917, "grad_norm": 0.586422595337242, "learning_rate": 3.989188527169749e-06, "loss": 0.4082, "step": 4697 }, { "epoch": 2.0950664564091013, "grad_norm": 0.5510417095022262, "learning_rate": 3.985413674539832e-06, "loss": 0.383, "step": 4698 }, { "epoch": 2.095517008335211, "grad_norm": 0.5888308033014509, "learning_rate": 3.9816401642489164e-06, "loss": 0.4339, "step": 4699 }, { "epoch": 2.09596756026132, "grad_norm": 0.5870990550243375, "learning_rate": 3.977867997139179e-06, "loss": 0.4047, "step": 4700 }, { "epoch": 2.0964181121874295, "grad_norm": 0.6008413266269139, "learning_rate": 3.974097174052494e-06, "loss": 0.4039, "step": 4701 }, { "epoch": 2.096868664113539, "grad_norm": 0.6136145162315056, "learning_rate": 3.970327695830434e-06, "loss": 0.4085, "step": 4702 }, { "epoch": 2.0973192160396485, "grad_norm": 0.6007215319690279, "learning_rate": 3.966559563314279e-06, "loss": 0.406, "step": 4703 }, { "epoch": 2.097769767965758, "grad_norm": 0.5884491812359275, "learning_rate": 3.962792777344992e-06, "loss": 0.4011, "step": 4704 }, { "epoch": 2.0982203198918676, "grad_norm": 0.5635939072846232, "learning_rate": 3.959027338763262e-06, "loss": 0.3761, "step": 4705 }, { "epoch": 2.098670871817977, "grad_norm": 0.5905935720067265, "learning_rate": 3.9552632484094444e-06, "loss": 0.4032, "step": 4706 }, { "epoch": 2.0991214237440863, "grad_norm": 0.6085805442792911, "learning_rate": 3.9515005071236274e-06, "loss": 0.4159, "step": 4707 }, { "epoch": 2.099571975670196, "grad_norm": 0.5917309502401232, "learning_rate": 3.9477391157455694e-06, "loss": 0.3921, "step": 4708 }, { "epoch": 2.1000225275963054, "grad_norm": 0.5691096493760323, "learning_rate": 3.943979075114751e-06, "loss": 0.384, "step": 4709 }, { "epoch": 2.100473079522415, "grad_norm": 0.5998602071439599, "learning_rate": 3.940220386070327e-06, "loss": 0.409, "step": 4710 }, { "epoch": 2.1009236314485245, "grad_norm": 0.5642670384843672, "learning_rate": 3.936463049451179e-06, "loss": 0.3658, "step": 4711 }, { "epoch": 2.101374183374634, "grad_norm": 0.5793028459424376, "learning_rate": 3.932707066095861e-06, "loss": 0.3951, "step": 4712 }, { "epoch": 2.1018247353007435, "grad_norm": 0.5689104207672365, "learning_rate": 3.928952436842639e-06, "loss": 0.4028, "step": 4713 }, { "epoch": 2.102275287226853, "grad_norm": 0.5737303484914072, "learning_rate": 3.925199162529473e-06, "loss": 0.3733, "step": 4714 }, { "epoch": 2.102725839152962, "grad_norm": 0.5647874510716956, "learning_rate": 3.92144724399402e-06, "loss": 0.4035, "step": 4715 }, { "epoch": 2.1031763910790717, "grad_norm": 0.5918132421393278, "learning_rate": 3.917696682073639e-06, "loss": 0.3969, "step": 4716 }, { "epoch": 2.1036269430051813, "grad_norm": 0.5749271324226752, "learning_rate": 3.913947477605378e-06, "loss": 0.4065, "step": 4717 }, { "epoch": 2.104077494931291, "grad_norm": 0.6032096699340073, "learning_rate": 3.910199631425989e-06, "loss": 0.4166, "step": 4718 }, { "epoch": 2.1045280468574004, "grad_norm": 0.5982643108498765, "learning_rate": 3.90645314437192e-06, "loss": 0.444, "step": 4719 }, { "epoch": 2.10497859878351, "grad_norm": 0.5705495274835681, "learning_rate": 3.902708017279312e-06, "loss": 0.3837, "step": 4720 }, { "epoch": 2.1054291507096194, "grad_norm": 0.5781834164456019, "learning_rate": 3.898964250984007e-06, "loss": 0.4, "step": 4721 }, { "epoch": 2.105879702635729, "grad_norm": 0.5966880320076078, "learning_rate": 3.895221846321537e-06, "loss": 0.3955, "step": 4722 }, { "epoch": 2.106330254561838, "grad_norm": 0.6026660055923241, "learning_rate": 3.891480804127139e-06, "loss": 0.4149, "step": 4723 }, { "epoch": 2.1067808064879476, "grad_norm": 0.5961793686708301, "learning_rate": 3.887741125235738e-06, "loss": 0.4162, "step": 4724 }, { "epoch": 2.107231358414057, "grad_norm": 0.59805587905295, "learning_rate": 3.884002810481959e-06, "loss": 0.3954, "step": 4725 }, { "epoch": 2.1076819103401667, "grad_norm": 0.5888131207488025, "learning_rate": 3.8802658607001195e-06, "loss": 0.4055, "step": 4726 }, { "epoch": 2.1081324622662763, "grad_norm": 0.5793652485605104, "learning_rate": 3.876530276724236e-06, "loss": 0.3744, "step": 4727 }, { "epoch": 2.108583014192386, "grad_norm": 0.591882340265663, "learning_rate": 3.872796059388019e-06, "loss": 0.4081, "step": 4728 }, { "epoch": 2.1090335661184954, "grad_norm": 0.6004261978215955, "learning_rate": 3.869063209524871e-06, "loss": 0.4057, "step": 4729 }, { "epoch": 2.1094841180446045, "grad_norm": 0.5979608245875495, "learning_rate": 3.865331727967895e-06, "loss": 0.3964, "step": 4730 }, { "epoch": 2.109934669970714, "grad_norm": 0.5531070347131342, "learning_rate": 3.861601615549881e-06, "loss": 0.3739, "step": 4731 }, { "epoch": 2.1103852218968235, "grad_norm": 0.5814431421456141, "learning_rate": 3.857872873103322e-06, "loss": 0.4124, "step": 4732 }, { "epoch": 2.110835773822933, "grad_norm": 0.5761320117140787, "learning_rate": 3.854145501460398e-06, "loss": 0.3963, "step": 4733 }, { "epoch": 2.1112863257490426, "grad_norm": 0.5813009681020924, "learning_rate": 3.850419501452988e-06, "loss": 0.3948, "step": 4734 }, { "epoch": 2.111736877675152, "grad_norm": 0.5897713916755523, "learning_rate": 3.846694873912662e-06, "loss": 0.418, "step": 4735 }, { "epoch": 2.1121874296012617, "grad_norm": 0.587649617065477, "learning_rate": 3.842971619670683e-06, "loss": 0.3729, "step": 4736 }, { "epoch": 2.112637981527371, "grad_norm": 0.5877666369908546, "learning_rate": 3.839249739558013e-06, "loss": 0.3891, "step": 4737 }, { "epoch": 2.1130885334534804, "grad_norm": 0.596437556776902, "learning_rate": 3.835529234405303e-06, "loss": 0.3762, "step": 4738 }, { "epoch": 2.11353908537959, "grad_norm": 0.592150889358018, "learning_rate": 3.83181010504289e-06, "loss": 0.406, "step": 4739 }, { "epoch": 2.1139896373056994, "grad_norm": 0.5953365683170925, "learning_rate": 3.828092352300824e-06, "loss": 0.4255, "step": 4740 }, { "epoch": 2.114440189231809, "grad_norm": 0.6047875295856292, "learning_rate": 3.824375977008822e-06, "loss": 0.4079, "step": 4741 }, { "epoch": 2.1148907411579185, "grad_norm": 0.5854931841024981, "learning_rate": 3.8206609799963205e-06, "loss": 0.3999, "step": 4742 }, { "epoch": 2.115341293084028, "grad_norm": 0.587951529002933, "learning_rate": 3.816947362092419e-06, "loss": 0.3952, "step": 4743 }, { "epoch": 2.1157918450101376, "grad_norm": 0.59275875000875, "learning_rate": 3.8132351241259425e-06, "loss": 0.4089, "step": 4744 }, { "epoch": 2.1162423969362467, "grad_norm": 0.5933398303764102, "learning_rate": 3.8095242669253753e-06, "loss": 0.3668, "step": 4745 }, { "epoch": 2.1166929488623563, "grad_norm": 0.5756559307394755, "learning_rate": 3.8058147913189215e-06, "loss": 0.3879, "step": 4746 }, { "epoch": 2.117143500788466, "grad_norm": 0.588888617780403, "learning_rate": 3.80210669813445e-06, "loss": 0.3976, "step": 4747 }, { "epoch": 2.1175940527145753, "grad_norm": 0.6101579559990659, "learning_rate": 3.7983999881995515e-06, "loss": 0.379, "step": 4748 }, { "epoch": 2.118044604640685, "grad_norm": 0.6180563397306424, "learning_rate": 3.7946946623414794e-06, "loss": 0.412, "step": 4749 }, { "epoch": 2.1184951565667944, "grad_norm": 0.5746875991804539, "learning_rate": 3.7909907213871943e-06, "loss": 0.3756, "step": 4750 }, { "epoch": 2.118945708492904, "grad_norm": 0.6169181067829408, "learning_rate": 3.787288166163344e-06, "loss": 0.4373, "step": 4751 }, { "epoch": 2.119396260419013, "grad_norm": 0.5853999037238407, "learning_rate": 3.783586997496268e-06, "loss": 0.3946, "step": 4752 }, { "epoch": 2.1198468123451226, "grad_norm": 0.580731303421409, "learning_rate": 3.7798872162119948e-06, "loss": 0.4012, "step": 4753 }, { "epoch": 2.120297364271232, "grad_norm": 0.571918884611718, "learning_rate": 3.7761888231362433e-06, "loss": 0.4074, "step": 4754 }, { "epoch": 2.1207479161973417, "grad_norm": 0.6006478967143654, "learning_rate": 3.7724918190944225e-06, "loss": 0.3851, "step": 4755 }, { "epoch": 2.1211984681234513, "grad_norm": 0.6017452386706903, "learning_rate": 3.7687962049116345e-06, "loss": 0.4162, "step": 4756 }, { "epoch": 2.121649020049561, "grad_norm": 0.5543746748698043, "learning_rate": 3.7651019814126656e-06, "loss": 0.383, "step": 4757 }, { "epoch": 2.1220995719756703, "grad_norm": 0.5697669164318647, "learning_rate": 3.7614091494219975e-06, "loss": 0.3948, "step": 4758 }, { "epoch": 2.12255012390178, "grad_norm": 0.587087503266252, "learning_rate": 3.757717709763796e-06, "loss": 0.3919, "step": 4759 }, { "epoch": 2.123000675827889, "grad_norm": 0.5830724337097029, "learning_rate": 3.754027663261922e-06, "loss": 0.4067, "step": 4760 }, { "epoch": 2.1234512277539985, "grad_norm": 0.5751430071123198, "learning_rate": 3.7503390107399194e-06, "loss": 0.3819, "step": 4761 }, { "epoch": 2.123901779680108, "grad_norm": 0.5693980310041025, "learning_rate": 3.7466517530210257e-06, "loss": 0.4201, "step": 4762 }, { "epoch": 2.1243523316062176, "grad_norm": 0.5931161007372588, "learning_rate": 3.742965890928164e-06, "loss": 0.3884, "step": 4763 }, { "epoch": 2.124802883532327, "grad_norm": 0.5825158461867904, "learning_rate": 3.7392814252839528e-06, "loss": 0.3793, "step": 4764 }, { "epoch": 2.1252534354584367, "grad_norm": 0.5887721865297698, "learning_rate": 3.73559835691068e-06, "loss": 0.3964, "step": 4765 }, { "epoch": 2.1257039873845462, "grad_norm": 0.6168935032325484, "learning_rate": 3.7319166866303513e-06, "loss": 0.3977, "step": 4766 }, { "epoch": 2.1261545393106553, "grad_norm": 0.6084505454339109, "learning_rate": 3.72823641526463e-06, "loss": 0.3805, "step": 4767 }, { "epoch": 2.126605091236765, "grad_norm": 0.6017993737710093, "learning_rate": 3.7245575436348937e-06, "loss": 0.3843, "step": 4768 }, { "epoch": 2.1270556431628744, "grad_norm": 0.5873471161191277, "learning_rate": 3.720880072562183e-06, "loss": 0.4071, "step": 4769 }, { "epoch": 2.127506195088984, "grad_norm": 0.5600422106264008, "learning_rate": 3.7172040028672496e-06, "loss": 0.3822, "step": 4770 }, { "epoch": 2.1279567470150935, "grad_norm": 0.5786384294861797, "learning_rate": 3.7135293353705103e-06, "loss": 0.4057, "step": 4771 }, { "epoch": 2.128407298941203, "grad_norm": 0.6109606639987577, "learning_rate": 3.7098560708920874e-06, "loss": 0.3749, "step": 4772 }, { "epoch": 2.1288578508673126, "grad_norm": 0.5924749138721396, "learning_rate": 3.706184210251783e-06, "loss": 0.3748, "step": 4773 }, { "epoch": 2.1293084027934217, "grad_norm": 0.6147274631284728, "learning_rate": 3.702513754269076e-06, "loss": 0.4059, "step": 4774 }, { "epoch": 2.1297589547195312, "grad_norm": 0.6410779012222569, "learning_rate": 3.6988447037631527e-06, "loss": 0.441, "step": 4775 }, { "epoch": 2.130209506645641, "grad_norm": 0.6046484934662005, "learning_rate": 3.6951770595528615e-06, "loss": 0.4028, "step": 4776 }, { "epoch": 2.1306600585717503, "grad_norm": 0.5781095505655726, "learning_rate": 3.691510822456764e-06, "loss": 0.4014, "step": 4777 }, { "epoch": 2.13111061049786, "grad_norm": 0.5845854245038806, "learning_rate": 3.6878459932930776e-06, "loss": 0.3899, "step": 4778 }, { "epoch": 2.1315611624239694, "grad_norm": 0.5858320269354454, "learning_rate": 3.6841825728797353e-06, "loss": 0.4297, "step": 4779 }, { "epoch": 2.132011714350079, "grad_norm": 0.5976618687607095, "learning_rate": 3.6805205620343286e-06, "loss": 0.4126, "step": 4780 }, { "epoch": 2.1324622662761885, "grad_norm": 0.6036095571243227, "learning_rate": 3.676859961574162e-06, "loss": 0.3864, "step": 4781 }, { "epoch": 2.1329128182022976, "grad_norm": 0.567604990449985, "learning_rate": 3.6732007723161933e-06, "loss": 0.3865, "step": 4782 }, { "epoch": 2.133363370128407, "grad_norm": 0.6048978229394978, "learning_rate": 3.669542995077099e-06, "loss": 0.419, "step": 4783 }, { "epoch": 2.1338139220545167, "grad_norm": 0.5871939571652953, "learning_rate": 3.6658866306732132e-06, "loss": 0.3967, "step": 4784 }, { "epoch": 2.1342644739806262, "grad_norm": 0.5928406717560357, "learning_rate": 3.6622316799205695e-06, "loss": 0.3976, "step": 4785 }, { "epoch": 2.134715025906736, "grad_norm": 0.5928058949987947, "learning_rate": 3.65857814363488e-06, "loss": 0.4091, "step": 4786 }, { "epoch": 2.1351655778328453, "grad_norm": 0.6154425692730324, "learning_rate": 3.6549260226315453e-06, "loss": 0.3939, "step": 4787 }, { "epoch": 2.135616129758955, "grad_norm": 0.6398395239420243, "learning_rate": 3.651275317725648e-06, "loss": 0.3958, "step": 4788 }, { "epoch": 2.1360666816850644, "grad_norm": 0.5792381537092273, "learning_rate": 3.647626029731952e-06, "loss": 0.3563, "step": 4789 }, { "epoch": 2.1365172336111735, "grad_norm": 0.5836229142711881, "learning_rate": 3.6439781594649104e-06, "loss": 0.406, "step": 4790 }, { "epoch": 2.136967785537283, "grad_norm": 0.5814196017137216, "learning_rate": 3.6403317077386555e-06, "loss": 0.4076, "step": 4791 }, { "epoch": 2.1374183374633926, "grad_norm": 0.6000263768582929, "learning_rate": 3.636686675367006e-06, "loss": 0.393, "step": 4792 }, { "epoch": 2.137868889389502, "grad_norm": 0.5982439365957182, "learning_rate": 3.6330430631634607e-06, "loss": 0.4049, "step": 4793 }, { "epoch": 2.1383194413156117, "grad_norm": 0.6019737009026658, "learning_rate": 3.629400871941204e-06, "loss": 0.4171, "step": 4794 }, { "epoch": 2.1387699932417212, "grad_norm": 0.6226135037443427, "learning_rate": 3.625760102513103e-06, "loss": 0.4143, "step": 4795 }, { "epoch": 2.139220545167831, "grad_norm": 0.5779272199019091, "learning_rate": 3.6221207556917058e-06, "loss": 0.3809, "step": 4796 }, { "epoch": 2.13967109709394, "grad_norm": 0.5869461195909724, "learning_rate": 3.618482832289245e-06, "loss": 0.3804, "step": 4797 }, { "epoch": 2.1401216490200494, "grad_norm": 0.593196539404585, "learning_rate": 3.614846333117633e-06, "loss": 0.3881, "step": 4798 }, { "epoch": 2.140572200946159, "grad_norm": 0.5742601752968043, "learning_rate": 3.6112112589884707e-06, "loss": 0.3915, "step": 4799 }, { "epoch": 2.1410227528722685, "grad_norm": 0.5761165285339105, "learning_rate": 3.6075776107130254e-06, "loss": 0.4182, "step": 4800 }, { "epoch": 2.141473304798378, "grad_norm": 0.5825404183444407, "learning_rate": 3.6039453891022713e-06, "loss": 0.3957, "step": 4801 }, { "epoch": 2.1419238567244876, "grad_norm": 0.5847088760485691, "learning_rate": 3.6003145949668338e-06, "loss": 0.3982, "step": 4802 }, { "epoch": 2.142374408650597, "grad_norm": 0.587060103438609, "learning_rate": 3.596685229117053e-06, "loss": 0.3898, "step": 4803 }, { "epoch": 2.1428249605767062, "grad_norm": 0.568119794683138, "learning_rate": 3.593057292362916e-06, "loss": 0.3912, "step": 4804 }, { "epoch": 2.143275512502816, "grad_norm": 0.5922499258559804, "learning_rate": 3.5894307855141232e-06, "loss": 0.4247, "step": 4805 }, { "epoch": 2.1437260644289253, "grad_norm": 0.5744259684788733, "learning_rate": 3.5858057093800267e-06, "loss": 0.3847, "step": 4806 }, { "epoch": 2.144176616355035, "grad_norm": 0.591546562352272, "learning_rate": 3.582182064769687e-06, "loss": 0.3761, "step": 4807 }, { "epoch": 2.1446271682811444, "grad_norm": 0.6166694546605131, "learning_rate": 3.5785598524918195e-06, "loss": 0.3884, "step": 4808 }, { "epoch": 2.145077720207254, "grad_norm": 0.5755112564351808, "learning_rate": 3.5749390733548382e-06, "loss": 0.3984, "step": 4809 }, { "epoch": 2.1455282721333635, "grad_norm": 0.5914858980519033, "learning_rate": 3.571319728166828e-06, "loss": 0.3876, "step": 4810 }, { "epoch": 2.145978824059473, "grad_norm": 0.602279216659344, "learning_rate": 3.567701817735558e-06, "loss": 0.4169, "step": 4811 }, { "epoch": 2.146429375985582, "grad_norm": 0.5823107658634789, "learning_rate": 3.564085342868475e-06, "loss": 0.394, "step": 4812 }, { "epoch": 2.1468799279116917, "grad_norm": 0.5780908066958956, "learning_rate": 3.5604703043727063e-06, "loss": 0.3705, "step": 4813 }, { "epoch": 2.1473304798378012, "grad_norm": 0.609959983851115, "learning_rate": 3.5568567030550584e-06, "loss": 0.4074, "step": 4814 }, { "epoch": 2.1477810317639108, "grad_norm": 0.59214799669199, "learning_rate": 3.5532445397220173e-06, "loss": 0.4146, "step": 4815 }, { "epoch": 2.1482315836900203, "grad_norm": 0.5797722192040945, "learning_rate": 3.549633815179746e-06, "loss": 0.3865, "step": 4816 }, { "epoch": 2.14868213561613, "grad_norm": 0.5861275268640599, "learning_rate": 3.5460245302340914e-06, "loss": 0.4287, "step": 4817 }, { "epoch": 2.1491326875422394, "grad_norm": 0.573009811905767, "learning_rate": 3.542416685690573e-06, "loss": 0.408, "step": 4818 }, { "epoch": 2.149583239468349, "grad_norm": 0.5927207504222496, "learning_rate": 3.5388102823543936e-06, "loss": 0.4029, "step": 4819 }, { "epoch": 2.150033791394458, "grad_norm": 0.565944696649877, "learning_rate": 3.5352053210304303e-06, "loss": 0.3898, "step": 4820 }, { "epoch": 2.1504843433205676, "grad_norm": 0.584811950672254, "learning_rate": 3.5316018025232425e-06, "loss": 0.3947, "step": 4821 }, { "epoch": 2.150934895246677, "grad_norm": 0.5672242609332168, "learning_rate": 3.527999727637066e-06, "loss": 0.4075, "step": 4822 }, { "epoch": 2.1513854471727867, "grad_norm": 0.563414107362102, "learning_rate": 3.5243990971758124e-06, "loss": 0.3894, "step": 4823 }, { "epoch": 2.1518359990988962, "grad_norm": 0.600663563398707, "learning_rate": 3.5207999119430725e-06, "loss": 0.4037, "step": 4824 }, { "epoch": 2.1522865510250058, "grad_norm": 0.585847913477911, "learning_rate": 3.5172021727421167e-06, "loss": 0.3819, "step": 4825 }, { "epoch": 2.1527371029511153, "grad_norm": 0.5884679263268787, "learning_rate": 3.513605880375889e-06, "loss": 0.4015, "step": 4826 }, { "epoch": 2.1531876548772244, "grad_norm": 0.584149640112029, "learning_rate": 3.510011035647012e-06, "loss": 0.3901, "step": 4827 }, { "epoch": 2.153638206803334, "grad_norm": 0.6064255789776759, "learning_rate": 3.5064176393577864e-06, "loss": 0.3922, "step": 4828 }, { "epoch": 2.1540887587294435, "grad_norm": 0.5765241143811065, "learning_rate": 3.5028256923101866e-06, "loss": 0.4184, "step": 4829 }, { "epoch": 2.154539310655553, "grad_norm": 0.5938179751781769, "learning_rate": 3.499235195305868e-06, "loss": 0.405, "step": 4830 }, { "epoch": 2.1549898625816626, "grad_norm": 0.5802154108192941, "learning_rate": 3.495646149146158e-06, "loss": 0.3961, "step": 4831 }, { "epoch": 2.155440414507772, "grad_norm": 0.6032922988229499, "learning_rate": 3.492058554632063e-06, "loss": 0.4167, "step": 4832 }, { "epoch": 2.1558909664338817, "grad_norm": 0.5999937009976506, "learning_rate": 3.4884724125642646e-06, "loss": 0.3953, "step": 4833 }, { "epoch": 2.1563415183599908, "grad_norm": 0.5984795620199364, "learning_rate": 3.4848877237431235e-06, "loss": 0.3737, "step": 4834 }, { "epoch": 2.1567920702861003, "grad_norm": 0.6016983262717166, "learning_rate": 3.4813044889686607e-06, "loss": 0.3858, "step": 4835 }, { "epoch": 2.15724262221221, "grad_norm": 0.5945525288428444, "learning_rate": 3.4777227090406007e-06, "loss": 0.3839, "step": 4836 }, { "epoch": 2.1576931741383194, "grad_norm": 0.5643075111325728, "learning_rate": 3.4741423847583134e-06, "loss": 0.4025, "step": 4837 }, { "epoch": 2.158143726064429, "grad_norm": 0.5810883652707599, "learning_rate": 3.4705635169208706e-06, "loss": 0.4, "step": 4838 }, { "epoch": 2.1585942779905385, "grad_norm": 0.5884577023909412, "learning_rate": 3.4669861063269918e-06, "loss": 0.3908, "step": 4839 }, { "epoch": 2.159044829916648, "grad_norm": 0.5734003359859752, "learning_rate": 3.463410153775101e-06, "loss": 0.3816, "step": 4840 }, { "epoch": 2.1594953818427576, "grad_norm": 0.5932630213812314, "learning_rate": 3.4598356600632667e-06, "loss": 0.3827, "step": 4841 }, { "epoch": 2.1599459337688667, "grad_norm": 0.5956872477300053, "learning_rate": 3.4562626259892605e-06, "loss": 0.3952, "step": 4842 }, { "epoch": 2.160396485694976, "grad_norm": 0.5987849652502901, "learning_rate": 3.4526910523505007e-06, "loss": 0.4013, "step": 4843 }, { "epoch": 2.1608470376210858, "grad_norm": 0.601138139099341, "learning_rate": 3.449120939944107e-06, "loss": 0.4146, "step": 4844 }, { "epoch": 2.1612975895471953, "grad_norm": 0.6046750155335507, "learning_rate": 3.445552289566849e-06, "loss": 0.4102, "step": 4845 }, { "epoch": 2.161748141473305, "grad_norm": 0.6009047978073128, "learning_rate": 3.441985102015184e-06, "loss": 0.3939, "step": 4846 }, { "epoch": 2.1621986933994144, "grad_norm": 0.6106382997045815, "learning_rate": 3.4384193780852384e-06, "loss": 0.4031, "step": 4847 }, { "epoch": 2.162649245325524, "grad_norm": 0.605564066802337, "learning_rate": 3.434855118572812e-06, "loss": 0.4042, "step": 4848 }, { "epoch": 2.1630997972516335, "grad_norm": 0.6067679725422997, "learning_rate": 3.4312923242733796e-06, "loss": 0.4135, "step": 4849 }, { "epoch": 2.1635503491777426, "grad_norm": 0.6114933693633006, "learning_rate": 3.4277309959820882e-06, "loss": 0.4213, "step": 4850 }, { "epoch": 2.164000901103852, "grad_norm": 0.5994283818864014, "learning_rate": 3.4241711344937557e-06, "loss": 0.4095, "step": 4851 }, { "epoch": 2.1644514530299617, "grad_norm": 0.6079184319930194, "learning_rate": 3.4206127406028744e-06, "loss": 0.4199, "step": 4852 }, { "epoch": 2.164902004956071, "grad_norm": 0.5886025800626707, "learning_rate": 3.4170558151036105e-06, "loss": 0.3973, "step": 4853 }, { "epoch": 2.1653525568821808, "grad_norm": 0.5897823772841442, "learning_rate": 3.4135003587897988e-06, "loss": 0.4085, "step": 4854 }, { "epoch": 2.1658031088082903, "grad_norm": 0.5800510579405543, "learning_rate": 3.409946372454949e-06, "loss": 0.379, "step": 4855 }, { "epoch": 2.1662536607344, "grad_norm": 0.5927279027870987, "learning_rate": 3.4063938568922406e-06, "loss": 0.4377, "step": 4856 }, { "epoch": 2.166704212660509, "grad_norm": 0.6047766834951903, "learning_rate": 3.402842812894529e-06, "loss": 0.4237, "step": 4857 }, { "epoch": 2.1671547645866185, "grad_norm": 0.5984436286680145, "learning_rate": 3.3992932412543358e-06, "loss": 0.3917, "step": 4858 }, { "epoch": 2.167605316512728, "grad_norm": 0.6324939586873821, "learning_rate": 3.3957451427638577e-06, "loss": 0.4096, "step": 4859 }, { "epoch": 2.1680558684388376, "grad_norm": 0.5763533904478099, "learning_rate": 3.3921985182149653e-06, "loss": 0.375, "step": 4860 }, { "epoch": 2.168506420364947, "grad_norm": 0.5873417255223666, "learning_rate": 3.388653368399184e-06, "loss": 0.3729, "step": 4861 }, { "epoch": 2.1689569722910567, "grad_norm": 0.5971120305229981, "learning_rate": 3.385109694107739e-06, "loss": 0.412, "step": 4862 }, { "epoch": 2.169407524217166, "grad_norm": 0.5830607768154646, "learning_rate": 3.3815674961314936e-06, "loss": 0.3812, "step": 4863 }, { "epoch": 2.1698580761432753, "grad_norm": 0.6164621080128586, "learning_rate": 3.378026775261013e-06, "loss": 0.4152, "step": 4864 }, { "epoch": 2.170308628069385, "grad_norm": 0.577757526039382, "learning_rate": 3.3744875322865035e-06, "loss": 0.3769, "step": 4865 }, { "epoch": 2.1707591799954944, "grad_norm": 0.598881023211554, "learning_rate": 3.3709497679978675e-06, "loss": 0.3901, "step": 4866 }, { "epoch": 2.171209731921604, "grad_norm": 0.576982626443047, "learning_rate": 3.367413483184654e-06, "loss": 0.4328, "step": 4867 }, { "epoch": 2.1716602838477135, "grad_norm": 0.5978458041908269, "learning_rate": 3.3638786786361057e-06, "loss": 0.4261, "step": 4868 }, { "epoch": 2.172110835773823, "grad_norm": 0.5983910111737627, "learning_rate": 3.360345355141111e-06, "loss": 0.4069, "step": 4869 }, { "epoch": 2.1725613876999326, "grad_norm": 0.6082100511849006, "learning_rate": 3.3568135134882463e-06, "loss": 0.4171, "step": 4870 }, { "epoch": 2.1730119396260417, "grad_norm": 0.5857134245813631, "learning_rate": 3.3532831544657464e-06, "loss": 0.3848, "step": 4871 }, { "epoch": 2.173462491552151, "grad_norm": 0.593774275736344, "learning_rate": 3.349754278861517e-06, "loss": 0.3945, "step": 4872 }, { "epoch": 2.1739130434782608, "grad_norm": 0.6077835015440911, "learning_rate": 3.346226887463144e-06, "loss": 0.3869, "step": 4873 }, { "epoch": 2.1743635954043703, "grad_norm": 0.5767010734997462, "learning_rate": 3.3427009810578602e-06, "loss": 0.3855, "step": 4874 }, { "epoch": 2.17481414733048, "grad_norm": 0.5709061230945055, "learning_rate": 3.339176560432593e-06, "loss": 0.3724, "step": 4875 }, { "epoch": 2.1752646992565894, "grad_norm": 0.5803828271076268, "learning_rate": 3.3356536263739115e-06, "loss": 0.4005, "step": 4876 }, { "epoch": 2.175715251182699, "grad_norm": 0.6834307838534477, "learning_rate": 3.3321321796680784e-06, "loss": 0.3951, "step": 4877 }, { "epoch": 2.1761658031088085, "grad_norm": 0.5718627608834426, "learning_rate": 3.3286122211009997e-06, "loss": 0.3852, "step": 4878 }, { "epoch": 2.1766163550349176, "grad_norm": 0.6128530196080159, "learning_rate": 3.3250937514582758e-06, "loss": 0.4067, "step": 4879 }, { "epoch": 2.177066906961027, "grad_norm": 0.5904958469416278, "learning_rate": 3.32157677152515e-06, "loss": 0.4094, "step": 4880 }, { "epoch": 2.1775174588871367, "grad_norm": 0.603492664389121, "learning_rate": 3.3180612820865477e-06, "loss": 0.403, "step": 4881 }, { "epoch": 2.177968010813246, "grad_norm": 0.5872421982237112, "learning_rate": 3.3145472839270575e-06, "loss": 0.3913, "step": 4882 }, { "epoch": 2.1784185627393557, "grad_norm": 0.5957131852865584, "learning_rate": 3.311034777830936e-06, "loss": 0.3856, "step": 4883 }, { "epoch": 2.1788691146654653, "grad_norm": 0.6045813642738423, "learning_rate": 3.3075237645821068e-06, "loss": 0.3916, "step": 4884 }, { "epoch": 2.179319666591575, "grad_norm": 0.5838851959349646, "learning_rate": 3.30401424496416e-06, "loss": 0.3908, "step": 4885 }, { "epoch": 2.1797702185176844, "grad_norm": 0.5912948087780563, "learning_rate": 3.300506219760351e-06, "loss": 0.3693, "step": 4886 }, { "epoch": 2.1802207704437935, "grad_norm": 0.5756376278720298, "learning_rate": 3.296999689753604e-06, "loss": 0.3928, "step": 4887 }, { "epoch": 2.180671322369903, "grad_norm": 0.6086073519867308, "learning_rate": 3.293494655726509e-06, "loss": 0.4192, "step": 4888 }, { "epoch": 2.1811218742960126, "grad_norm": 0.5843465012226793, "learning_rate": 3.289991118461321e-06, "loss": 0.3979, "step": 4889 }, { "epoch": 2.181572426222122, "grad_norm": 0.605310610869492, "learning_rate": 3.2864890787399606e-06, "loss": 0.3886, "step": 4890 }, { "epoch": 2.1820229781482317, "grad_norm": 0.5921673555409117, "learning_rate": 3.282988537344016e-06, "loss": 0.3926, "step": 4891 }, { "epoch": 2.182473530074341, "grad_norm": 0.578155748362001, "learning_rate": 3.279489495054742e-06, "loss": 0.3937, "step": 4892 }, { "epoch": 2.1829240820004507, "grad_norm": 0.5899406097535126, "learning_rate": 3.2759919526530536e-06, "loss": 0.4048, "step": 4893 }, { "epoch": 2.18337463392656, "grad_norm": 0.5993386999377809, "learning_rate": 3.2724959109195366e-06, "loss": 0.4067, "step": 4894 }, { "epoch": 2.1838251858526694, "grad_norm": 0.5939467305258481, "learning_rate": 3.2690013706344413e-06, "loss": 0.3853, "step": 4895 }, { "epoch": 2.184275737778779, "grad_norm": 0.5670355179126348, "learning_rate": 3.2655083325776736e-06, "loss": 0.39, "step": 4896 }, { "epoch": 2.1847262897048885, "grad_norm": 0.5928883093428321, "learning_rate": 3.262016797528824e-06, "loss": 0.4119, "step": 4897 }, { "epoch": 2.185176841630998, "grad_norm": 0.5950958489817327, "learning_rate": 3.2585267662671217e-06, "loss": 0.3874, "step": 4898 }, { "epoch": 2.1856273935571076, "grad_norm": 0.5813954150469518, "learning_rate": 3.2550382395714873e-06, "loss": 0.3758, "step": 4899 }, { "epoch": 2.186077945483217, "grad_norm": 0.6018467150704735, "learning_rate": 3.25155121822048e-06, "loss": 0.4095, "step": 4900 }, { "epoch": 2.186528497409326, "grad_norm": 0.6025958751841283, "learning_rate": 3.248065702992348e-06, "loss": 0.3642, "step": 4901 }, { "epoch": 2.1869790493354357, "grad_norm": 0.6027680900939757, "learning_rate": 3.2445816946649777e-06, "loss": 0.4094, "step": 4902 }, { "epoch": 2.1874296012615453, "grad_norm": 0.5688578677941825, "learning_rate": 3.241099194015944e-06, "loss": 0.377, "step": 4903 }, { "epoch": 2.187880153187655, "grad_norm": 0.5808685263553831, "learning_rate": 3.2376182018224656e-06, "loss": 0.3826, "step": 4904 }, { "epoch": 2.1883307051137644, "grad_norm": 0.5852676962775898, "learning_rate": 3.2341387188614348e-06, "loss": 0.3653, "step": 4905 }, { "epoch": 2.188781257039874, "grad_norm": 0.5970574136683287, "learning_rate": 3.2306607459094043e-06, "loss": 0.3833, "step": 4906 }, { "epoch": 2.1892318089659835, "grad_norm": 0.5896572395274549, "learning_rate": 3.2271842837425917e-06, "loss": 0.4282, "step": 4907 }, { "epoch": 2.189682360892093, "grad_norm": 0.5680168345911044, "learning_rate": 3.223709333136873e-06, "loss": 0.3547, "step": 4908 }, { "epoch": 2.190132912818202, "grad_norm": 0.6100199262573948, "learning_rate": 3.220235894867794e-06, "loss": 0.3999, "step": 4909 }, { "epoch": 2.1905834647443116, "grad_norm": 0.5895253910216228, "learning_rate": 3.2167639697105547e-06, "loss": 0.3915, "step": 4910 }, { "epoch": 2.191034016670421, "grad_norm": 0.5797302190299931, "learning_rate": 3.2132935584400225e-06, "loss": 0.4046, "step": 4911 }, { "epoch": 2.1914845685965307, "grad_norm": 0.5844699345104851, "learning_rate": 3.209824661830728e-06, "loss": 0.3776, "step": 4912 }, { "epoch": 2.1919351205226403, "grad_norm": 0.5849200406389, "learning_rate": 3.20635728065686e-06, "loss": 0.4238, "step": 4913 }, { "epoch": 2.19238567244875, "grad_norm": 0.5797924861114672, "learning_rate": 3.202891415692271e-06, "loss": 0.3828, "step": 4914 }, { "epoch": 2.1928362243748594, "grad_norm": 0.5752634500468914, "learning_rate": 3.1994270677104733e-06, "loss": 0.3859, "step": 4915 }, { "epoch": 2.193286776300969, "grad_norm": 0.5775524550241242, "learning_rate": 3.1959642374846445e-06, "loss": 0.379, "step": 4916 }, { "epoch": 2.193737328227078, "grad_norm": 0.586231770900898, "learning_rate": 3.1925029257876206e-06, "loss": 0.4089, "step": 4917 }, { "epoch": 2.1941878801531876, "grad_norm": 0.6227750722544217, "learning_rate": 3.1890431333918992e-06, "loss": 0.4007, "step": 4918 }, { "epoch": 2.194638432079297, "grad_norm": 0.583445873069714, "learning_rate": 3.1855848610696406e-06, "loss": 0.3764, "step": 4919 }, { "epoch": 2.1950889840054066, "grad_norm": 0.6048708922287241, "learning_rate": 3.182128109592656e-06, "loss": 0.4123, "step": 4920 }, { "epoch": 2.195539535931516, "grad_norm": 0.6123303689567149, "learning_rate": 3.178672879732435e-06, "loss": 0.4264, "step": 4921 }, { "epoch": 2.1959900878576257, "grad_norm": 0.5898286938163716, "learning_rate": 3.1752191722601134e-06, "loss": 0.3951, "step": 4922 }, { "epoch": 2.1964406397837353, "grad_norm": 0.5850900437960189, "learning_rate": 3.171766987946493e-06, "loss": 0.4102, "step": 4923 }, { "epoch": 2.1968911917098444, "grad_norm": 0.5769595080027561, "learning_rate": 3.1683163275620333e-06, "loss": 0.3741, "step": 4924 }, { "epoch": 2.197341743635954, "grad_norm": 0.583702924907961, "learning_rate": 3.1648671918768558e-06, "loss": 0.3928, "step": 4925 }, { "epoch": 2.1977922955620635, "grad_norm": 0.5983723555678504, "learning_rate": 3.161419581660741e-06, "loss": 0.4021, "step": 4926 }, { "epoch": 2.198242847488173, "grad_norm": 0.5873936968218186, "learning_rate": 3.1579734976831265e-06, "loss": 0.3719, "step": 4927 }, { "epoch": 2.1986933994142825, "grad_norm": 0.5989379455621607, "learning_rate": 3.1545289407131128e-06, "loss": 0.4081, "step": 4928 }, { "epoch": 2.199143951340392, "grad_norm": 0.6067515895205677, "learning_rate": 3.1510859115194582e-06, "loss": 0.3847, "step": 4929 }, { "epoch": 2.1995945032665016, "grad_norm": 0.5806088351900686, "learning_rate": 3.147644410870584e-06, "loss": 0.3962, "step": 4930 }, { "epoch": 2.2000450551926107, "grad_norm": 0.5908462009426345, "learning_rate": 3.144204439534555e-06, "loss": 0.3711, "step": 4931 }, { "epoch": 2.2004956071187203, "grad_norm": 0.5947076847179176, "learning_rate": 3.1407659982791204e-06, "loss": 0.3732, "step": 4932 }, { "epoch": 2.20094615904483, "grad_norm": 0.5848938737719743, "learning_rate": 3.13732908787166e-06, "loss": 0.3996, "step": 4933 }, { "epoch": 2.2013967109709394, "grad_norm": 0.6078005802256577, "learning_rate": 3.1338937090792396e-06, "loss": 0.4194, "step": 4934 }, { "epoch": 2.201847262897049, "grad_norm": 0.6054833178759107, "learning_rate": 3.1304598626685544e-06, "loss": 0.4052, "step": 4935 }, { "epoch": 2.2022978148231585, "grad_norm": 0.600272784187588, "learning_rate": 3.1270275494059856e-06, "loss": 0.4128, "step": 4936 }, { "epoch": 2.202748366749268, "grad_norm": 0.5897707082567362, "learning_rate": 3.1235967700575474e-06, "loss": 0.3836, "step": 4937 }, { "epoch": 2.2031989186753775, "grad_norm": 0.6018218851300366, "learning_rate": 3.1201675253889343e-06, "loss": 0.3738, "step": 4938 }, { "epoch": 2.2036494706014866, "grad_norm": 0.6105354922051537, "learning_rate": 3.1167398161654795e-06, "loss": 0.3789, "step": 4939 }, { "epoch": 2.204100022527596, "grad_norm": 0.5582005334546439, "learning_rate": 3.1133136431521817e-06, "loss": 0.3842, "step": 4940 }, { "epoch": 2.2045505744537057, "grad_norm": 0.5890079670154844, "learning_rate": 3.109889007113699e-06, "loss": 0.3838, "step": 4941 }, { "epoch": 2.2050011263798153, "grad_norm": 0.6056669029531292, "learning_rate": 3.1064659088143424e-06, "loss": 0.4132, "step": 4942 }, { "epoch": 2.205451678305925, "grad_norm": 0.5863664752627864, "learning_rate": 3.1030443490180816e-06, "loss": 0.4052, "step": 4943 }, { "epoch": 2.2059022302320344, "grad_norm": 0.598670107332812, "learning_rate": 3.099624328488542e-06, "loss": 0.4155, "step": 4944 }, { "epoch": 2.206352782158144, "grad_norm": 0.5845671118208793, "learning_rate": 3.0962058479890057e-06, "loss": 0.4099, "step": 4945 }, { "epoch": 2.2068033340842534, "grad_norm": 0.661652530995104, "learning_rate": 3.0927889082824112e-06, "loss": 0.4099, "step": 4946 }, { "epoch": 2.2072538860103625, "grad_norm": 0.5807854449401983, "learning_rate": 3.089373510131354e-06, "loss": 0.3642, "step": 4947 }, { "epoch": 2.207704437936472, "grad_norm": 0.5758588799367773, "learning_rate": 3.085959654298084e-06, "loss": 0.387, "step": 4948 }, { "epoch": 2.2081549898625816, "grad_norm": 0.6010143375450397, "learning_rate": 3.0825473415445073e-06, "loss": 0.3859, "step": 4949 }, { "epoch": 2.208605541788691, "grad_norm": 0.5932028866772118, "learning_rate": 3.0791365726321864e-06, "loss": 0.4061, "step": 4950 }, { "epoch": 2.2090560937148007, "grad_norm": 0.5715477382403583, "learning_rate": 3.0757273483223394e-06, "loss": 0.3643, "step": 4951 }, { "epoch": 2.2095066456409103, "grad_norm": 0.6263849392495354, "learning_rate": 3.0723196693758386e-06, "loss": 0.3984, "step": 4952 }, { "epoch": 2.20995719756702, "grad_norm": 0.6406306907522928, "learning_rate": 3.0689135365532107e-06, "loss": 0.4169, "step": 4953 }, { "epoch": 2.210407749493129, "grad_norm": 0.5919457263237194, "learning_rate": 3.0655089506146395e-06, "loss": 0.4129, "step": 4954 }, { "epoch": 2.2108583014192384, "grad_norm": 0.581561389644842, "learning_rate": 3.0621059123199626e-06, "loss": 0.3648, "step": 4955 }, { "epoch": 2.211308853345348, "grad_norm": 0.6125803047685665, "learning_rate": 3.0587044224286743e-06, "loss": 0.4098, "step": 4956 }, { "epoch": 2.2117594052714575, "grad_norm": 0.5904831861334151, "learning_rate": 3.0553044816999133e-06, "loss": 0.412, "step": 4957 }, { "epoch": 2.212209957197567, "grad_norm": 0.6169386906506972, "learning_rate": 3.0519060908924926e-06, "loss": 0.3902, "step": 4958 }, { "epoch": 2.2126605091236766, "grad_norm": 0.5839361157842108, "learning_rate": 3.048509250764854e-06, "loss": 0.4004, "step": 4959 }, { "epoch": 2.213111061049786, "grad_norm": 0.6038363741329534, "learning_rate": 3.045113962075118e-06, "loss": 0.3638, "step": 4960 }, { "epoch": 2.2135616129758953, "grad_norm": 0.6148925026533704, "learning_rate": 3.0417202255810363e-06, "loss": 0.3958, "step": 4961 }, { "epoch": 2.214012164902005, "grad_norm": 0.5612171904804506, "learning_rate": 3.038328042040037e-06, "loss": 0.3642, "step": 4962 }, { "epoch": 2.2144627168281144, "grad_norm": 0.5592042875376668, "learning_rate": 3.034937412209178e-06, "loss": 0.4269, "step": 4963 }, { "epoch": 2.214913268754224, "grad_norm": 0.5889925317405709, "learning_rate": 3.0315483368451925e-06, "loss": 0.3981, "step": 4964 }, { "epoch": 2.2153638206803334, "grad_norm": 0.567021730850227, "learning_rate": 3.0281608167044483e-06, "loss": 0.3902, "step": 4965 }, { "epoch": 2.215814372606443, "grad_norm": 0.5896601249909209, "learning_rate": 3.0247748525429787e-06, "loss": 0.3971, "step": 4966 }, { "epoch": 2.2162649245325525, "grad_norm": 0.6013523891118292, "learning_rate": 3.0213904451164623e-06, "loss": 0.4358, "step": 4967 }, { "epoch": 2.2167154764586616, "grad_norm": 0.6109953324448358, "learning_rate": 3.0180075951802358e-06, "loss": 0.3638, "step": 4968 }, { "epoch": 2.217166028384771, "grad_norm": 0.5688398358470399, "learning_rate": 3.014626303489283e-06, "loss": 0.395, "step": 4969 }, { "epoch": 2.2176165803108807, "grad_norm": 0.5956552633470913, "learning_rate": 3.011246570798242e-06, "loss": 0.4434, "step": 4970 }, { "epoch": 2.2180671322369903, "grad_norm": 0.5758316657125303, "learning_rate": 3.0078683978614122e-06, "loss": 0.3891, "step": 4971 }, { "epoch": 2.2185176841631, "grad_norm": 0.5854796708702847, "learning_rate": 3.0044917854327237e-06, "loss": 0.3807, "step": 4972 }, { "epoch": 2.2189682360892093, "grad_norm": 0.5905618230678132, "learning_rate": 3.001116734265783e-06, "loss": 0.398, "step": 4973 }, { "epoch": 2.219418788015319, "grad_norm": 0.5876110573489564, "learning_rate": 2.997743245113823e-06, "loss": 0.3875, "step": 4974 }, { "epoch": 2.2198693399414284, "grad_norm": 0.5949586296636926, "learning_rate": 2.994371318729756e-06, "loss": 0.4144, "step": 4975 }, { "epoch": 2.2203198918675375, "grad_norm": 0.6096393698965328, "learning_rate": 2.991000955866119e-06, "loss": 0.4044, "step": 4976 }, { "epoch": 2.220770443793647, "grad_norm": 0.6021762329694457, "learning_rate": 2.9876321572751143e-06, "loss": 0.4064, "step": 4977 }, { "epoch": 2.2212209957197566, "grad_norm": 0.5946105295665229, "learning_rate": 2.984264923708594e-06, "loss": 0.396, "step": 4978 }, { "epoch": 2.221671547645866, "grad_norm": 0.5932634786232833, "learning_rate": 2.9808992559180593e-06, "loss": 0.3779, "step": 4979 }, { "epoch": 2.2221220995719757, "grad_norm": 0.6137245227805077, "learning_rate": 2.9775351546546617e-06, "loss": 0.4323, "step": 4980 }, { "epoch": 2.2225726514980852, "grad_norm": 0.607334025352889, "learning_rate": 2.9741726206692022e-06, "loss": 0.4025, "step": 4981 }, { "epoch": 2.223023203424195, "grad_norm": 0.5937831663168815, "learning_rate": 2.9708116547121333e-06, "loss": 0.3994, "step": 4982 }, { "epoch": 2.2234737553503043, "grad_norm": 0.5858444409418151, "learning_rate": 2.9674522575335595e-06, "loss": 0.3938, "step": 4983 }, { "epoch": 2.2239243072764134, "grad_norm": 0.6204613262616459, "learning_rate": 2.9640944298832306e-06, "loss": 0.3937, "step": 4984 }, { "epoch": 2.224374859202523, "grad_norm": 0.6156783462450977, "learning_rate": 2.960738172510551e-06, "loss": 0.4249, "step": 4985 }, { "epoch": 2.2248254111286325, "grad_norm": 0.6042766871862458, "learning_rate": 2.95738348616457e-06, "loss": 0.417, "step": 4986 }, { "epoch": 2.225275963054742, "grad_norm": 0.5784316307726804, "learning_rate": 2.95403037159399e-06, "loss": 0.4255, "step": 4987 }, { "epoch": 2.2257265149808516, "grad_norm": 0.5676493521785354, "learning_rate": 2.95067882954716e-06, "loss": 0.3866, "step": 4988 }, { "epoch": 2.226177066906961, "grad_norm": 0.5764902512191672, "learning_rate": 2.9473288607720805e-06, "loss": 0.3952, "step": 4989 }, { "epoch": 2.2266276188330707, "grad_norm": 0.5865690611933021, "learning_rate": 2.9439804660163983e-06, "loss": 0.4005, "step": 4990 }, { "epoch": 2.22707817075918, "grad_norm": 0.6085622548378987, "learning_rate": 2.9406336460274144e-06, "loss": 0.4031, "step": 4991 }, { "epoch": 2.2275287226852893, "grad_norm": 0.6026323246216545, "learning_rate": 2.937288401552063e-06, "loss": 0.4081, "step": 4992 }, { "epoch": 2.227979274611399, "grad_norm": 0.5877681874958701, "learning_rate": 2.9339447333369518e-06, "loss": 0.3894, "step": 4993 }, { "epoch": 2.2284298265375084, "grad_norm": 0.5666321174177095, "learning_rate": 2.9306026421283107e-06, "loss": 0.3988, "step": 4994 }, { "epoch": 2.228880378463618, "grad_norm": 0.5905298567109188, "learning_rate": 2.9272621286720403e-06, "loss": 0.3968, "step": 4995 }, { "epoch": 2.228880378463618, "eval_loss": 0.6646064519882202, "eval_runtime": 24.3761, "eval_samples_per_second": 11.446, "eval_steps_per_second": 0.492, "step": 4995 }, { "epoch": 2.2293309303897275, "grad_norm": 0.6244856633877391, "learning_rate": 2.923923193713668e-06, "loss": 0.3845, "step": 4996 }, { "epoch": 2.229781482315837, "grad_norm": 0.6127020690052096, "learning_rate": 2.92058583799839e-06, "loss": 0.3892, "step": 4997 }, { "epoch": 2.230232034241946, "grad_norm": 0.5983583964159814, "learning_rate": 2.9172500622710263e-06, "loss": 0.391, "step": 4998 }, { "epoch": 2.2306825861680557, "grad_norm": 0.5986909711034674, "learning_rate": 2.9139158672760724e-06, "loss": 0.3784, "step": 4999 }, { "epoch": 2.2311331380941652, "grad_norm": 0.5964647637931112, "learning_rate": 2.910583253757645e-06, "loss": 0.4057, "step": 5000 }, { "epoch": 2.231583690020275, "grad_norm": 0.5882726202528816, "learning_rate": 2.9072522224595224e-06, "loss": 0.407, "step": 5001 }, { "epoch": 2.2320342419463843, "grad_norm": 0.6117422609684448, "learning_rate": 2.9039227741251263e-06, "loss": 0.4092, "step": 5002 }, { "epoch": 2.232484793872494, "grad_norm": 0.5623623313857687, "learning_rate": 2.9005949094975237e-06, "loss": 0.3892, "step": 5003 }, { "epoch": 2.2329353457986034, "grad_norm": 0.5762863335623856, "learning_rate": 2.8972686293194307e-06, "loss": 0.3757, "step": 5004 }, { "epoch": 2.233385897724713, "grad_norm": 0.5971895438967021, "learning_rate": 2.8939439343332086e-06, "loss": 0.403, "step": 5005 }, { "epoch": 2.233836449650822, "grad_norm": 0.5726331940037691, "learning_rate": 2.8906208252808642e-06, "loss": 0.4256, "step": 5006 }, { "epoch": 2.2342870015769316, "grad_norm": 0.5915786465744678, "learning_rate": 2.8872993029040506e-06, "loss": 0.3959, "step": 5007 }, { "epoch": 2.234737553503041, "grad_norm": 0.5776445807538774, "learning_rate": 2.8839793679440687e-06, "loss": 0.4073, "step": 5008 }, { "epoch": 2.2351881054291507, "grad_norm": 0.5917632696958721, "learning_rate": 2.8806610211418617e-06, "loss": 0.3995, "step": 5009 }, { "epoch": 2.2356386573552602, "grad_norm": 0.5888999959828333, "learning_rate": 2.877344263238021e-06, "loss": 0.3688, "step": 5010 }, { "epoch": 2.23608920928137, "grad_norm": 0.5703617116190943, "learning_rate": 2.8740290949727833e-06, "loss": 0.4065, "step": 5011 }, { "epoch": 2.2365397612074793, "grad_norm": 0.5875967226194209, "learning_rate": 2.8707155170860303e-06, "loss": 0.4218, "step": 5012 }, { "epoch": 2.236990313133589, "grad_norm": 0.6025737822264532, "learning_rate": 2.8674035303172864e-06, "loss": 0.3932, "step": 5013 }, { "epoch": 2.237440865059698, "grad_norm": 0.5638829972840541, "learning_rate": 2.8640931354057233e-06, "loss": 0.3993, "step": 5014 }, { "epoch": 2.2378914169858075, "grad_norm": 0.5953171631346628, "learning_rate": 2.860784333090162e-06, "loss": 0.4119, "step": 5015 }, { "epoch": 2.238341968911917, "grad_norm": 0.5804402528408646, "learning_rate": 2.8574771241090506e-06, "loss": 0.3702, "step": 5016 }, { "epoch": 2.2387925208380266, "grad_norm": 0.624301542987065, "learning_rate": 2.8541715092005097e-06, "loss": 0.3891, "step": 5017 }, { "epoch": 2.239243072764136, "grad_norm": 0.5780522219511106, "learning_rate": 2.8508674891022727e-06, "loss": 0.4001, "step": 5018 }, { "epoch": 2.2396936246902457, "grad_norm": 0.5972064743195484, "learning_rate": 2.847565064551747e-06, "loss": 0.4092, "step": 5019 }, { "epoch": 2.2401441766163552, "grad_norm": 0.6060046382843483, "learning_rate": 2.8442642362859586e-06, "loss": 0.38, "step": 5020 }, { "epoch": 2.2405947285424643, "grad_norm": 0.598838858518451, "learning_rate": 2.840965005041595e-06, "loss": 0.4106, "step": 5021 }, { "epoch": 2.241045280468574, "grad_norm": 0.570808233911913, "learning_rate": 2.8376673715549784e-06, "loss": 0.3923, "step": 5022 }, { "epoch": 2.2414958323946834, "grad_norm": 0.5711581135913929, "learning_rate": 2.834371336562077e-06, "loss": 0.4074, "step": 5023 }, { "epoch": 2.241946384320793, "grad_norm": 0.5877725576883329, "learning_rate": 2.831076900798503e-06, "loss": 0.3971, "step": 5024 }, { "epoch": 2.2423969362469025, "grad_norm": 0.5706156496736777, "learning_rate": 2.8277840649995083e-06, "loss": 0.3689, "step": 5025 }, { "epoch": 2.242847488173012, "grad_norm": 0.5849532529737677, "learning_rate": 2.824492829899994e-06, "loss": 0.4089, "step": 5026 }, { "epoch": 2.2432980400991216, "grad_norm": 0.5726968915735047, "learning_rate": 2.821203196234492e-06, "loss": 0.3697, "step": 5027 }, { "epoch": 2.2437485920252307, "grad_norm": 0.5948969564584097, "learning_rate": 2.817915164737195e-06, "loss": 0.3763, "step": 5028 }, { "epoch": 2.2441991439513402, "grad_norm": 0.5940457253288965, "learning_rate": 2.8146287361419167e-06, "loss": 0.3944, "step": 5029 }, { "epoch": 2.24464969587745, "grad_norm": 0.589417398319683, "learning_rate": 2.811343911182136e-06, "loss": 0.4207, "step": 5030 }, { "epoch": 2.2451002478035593, "grad_norm": 0.5746303247325103, "learning_rate": 2.8080606905909492e-06, "loss": 0.3746, "step": 5031 }, { "epoch": 2.245550799729669, "grad_norm": 0.5929999269093357, "learning_rate": 2.8047790751011216e-06, "loss": 0.4067, "step": 5032 }, { "epoch": 2.2460013516557784, "grad_norm": 0.5814786925277863, "learning_rate": 2.8014990654450325e-06, "loss": 0.3832, "step": 5033 }, { "epoch": 2.246451903581888, "grad_norm": 0.5820791176425772, "learning_rate": 2.7982206623547293e-06, "loss": 0.4068, "step": 5034 }, { "epoch": 2.2469024555079975, "grad_norm": 0.5982921420233683, "learning_rate": 2.7949438665618787e-06, "loss": 0.4061, "step": 5035 }, { "epoch": 2.2473530074341066, "grad_norm": 0.5874358455178409, "learning_rate": 2.791668678797801e-06, "loss": 0.4076, "step": 5036 }, { "epoch": 2.247803559360216, "grad_norm": 0.5834603754251619, "learning_rate": 2.788395099793453e-06, "loss": 0.3863, "step": 5037 }, { "epoch": 2.2482541112863257, "grad_norm": 0.5818429237790206, "learning_rate": 2.7851231302794367e-06, "loss": 0.3912, "step": 5038 }, { "epoch": 2.2487046632124352, "grad_norm": 0.5833623717274374, "learning_rate": 2.7818527709859912e-06, "loss": 0.3768, "step": 5039 }, { "epoch": 2.2491552151385448, "grad_norm": 0.5903378188519178, "learning_rate": 2.778584022642996e-06, "loss": 0.3851, "step": 5040 }, { "epoch": 2.2496057670646543, "grad_norm": 0.5796065334155256, "learning_rate": 2.775316885979974e-06, "loss": 0.3671, "step": 5041 }, { "epoch": 2.250056318990764, "grad_norm": 0.5882450623479566, "learning_rate": 2.7720513617260857e-06, "loss": 0.4084, "step": 5042 }, { "epoch": 2.2505068709168734, "grad_norm": 0.5652393476558362, "learning_rate": 2.768787450610133e-06, "loss": 0.3907, "step": 5043 }, { "epoch": 2.2509574228429825, "grad_norm": 0.5803342136605592, "learning_rate": 2.7655251533605587e-06, "loss": 0.4078, "step": 5044 }, { "epoch": 2.251407974769092, "grad_norm": 0.5781512770706996, "learning_rate": 2.7622644707054424e-06, "loss": 0.3863, "step": 5045 }, { "epoch": 2.2518585266952016, "grad_norm": 0.5747208999174385, "learning_rate": 2.759005403372507e-06, "loss": 0.3888, "step": 5046 }, { "epoch": 2.252309078621311, "grad_norm": 0.5809314882477071, "learning_rate": 2.7557479520891104e-06, "loss": 0.3771, "step": 5047 }, { "epoch": 2.2527596305474207, "grad_norm": 0.5632875199409156, "learning_rate": 2.752492117582256e-06, "loss": 0.3541, "step": 5048 }, { "epoch": 2.25321018247353, "grad_norm": 0.5966549854725876, "learning_rate": 2.749237900578581e-06, "loss": 0.3872, "step": 5049 }, { "epoch": 2.2536607343996398, "grad_norm": 0.6008188836217108, "learning_rate": 2.7459853018043637e-06, "loss": 0.4038, "step": 5050 }, { "epoch": 2.254111286325749, "grad_norm": 0.5676365257194913, "learning_rate": 2.7427343219855206e-06, "loss": 0.4128, "step": 5051 }, { "epoch": 2.2545618382518584, "grad_norm": 0.5611791164780058, "learning_rate": 2.7394849618476116e-06, "loss": 0.3841, "step": 5052 }, { "epoch": 2.255012390177968, "grad_norm": 0.5948606076371821, "learning_rate": 2.7362372221158206e-06, "loss": 0.4207, "step": 5053 }, { "epoch": 2.2554629421040775, "grad_norm": 0.593902011456871, "learning_rate": 2.7329911035149934e-06, "loss": 0.3879, "step": 5054 }, { "epoch": 2.255913494030187, "grad_norm": 0.5904055511760663, "learning_rate": 2.7297466067695876e-06, "loss": 0.3909, "step": 5055 }, { "epoch": 2.2563640459562966, "grad_norm": 0.5788688520465368, "learning_rate": 2.7265037326037236e-06, "loss": 0.4165, "step": 5056 }, { "epoch": 2.256814597882406, "grad_norm": 0.5989722720417545, "learning_rate": 2.723262481741138e-06, "loss": 0.4108, "step": 5057 }, { "epoch": 2.2572651498085152, "grad_norm": 0.5881988210909864, "learning_rate": 2.7200228549052244e-06, "loss": 0.4118, "step": 5058 }, { "epoch": 2.2577157017346248, "grad_norm": 0.5776186534540166, "learning_rate": 2.7167848528189945e-06, "loss": 0.3871, "step": 5059 }, { "epoch": 2.2581662536607343, "grad_norm": 0.5848463135293162, "learning_rate": 2.71354847620512e-06, "loss": 0.3896, "step": 5060 }, { "epoch": 2.258616805586844, "grad_norm": 0.5875516494894798, "learning_rate": 2.7103137257858867e-06, "loss": 0.3755, "step": 5061 }, { "epoch": 2.2590673575129534, "grad_norm": 0.5652997967264244, "learning_rate": 2.7070806022832318e-06, "loss": 0.383, "step": 5062 }, { "epoch": 2.259517909439063, "grad_norm": 0.5694563839753218, "learning_rate": 2.703849106418727e-06, "loss": 0.3858, "step": 5063 }, { "epoch": 2.2599684613651725, "grad_norm": 0.5872076260037692, "learning_rate": 2.7006192389135777e-06, "loss": 0.3864, "step": 5064 }, { "epoch": 2.2604190132912816, "grad_norm": 0.5920035183334657, "learning_rate": 2.6973910004886285e-06, "loss": 0.4076, "step": 5065 }, { "epoch": 2.260869565217391, "grad_norm": 0.5753837024105906, "learning_rate": 2.69416439186436e-06, "loss": 0.41, "step": 5066 }, { "epoch": 2.2613201171435007, "grad_norm": 0.5816038616579243, "learning_rate": 2.690939413760887e-06, "loss": 0.3796, "step": 5067 }, { "epoch": 2.26177066906961, "grad_norm": 0.5930443052990615, "learning_rate": 2.687716066897964e-06, "loss": 0.3784, "step": 5068 }, { "epoch": 2.2622212209957198, "grad_norm": 0.584452837834329, "learning_rate": 2.6844943519949785e-06, "loss": 0.3898, "step": 5069 }, { "epoch": 2.2626717729218293, "grad_norm": 0.590381982848018, "learning_rate": 2.6812742697709527e-06, "loss": 0.3955, "step": 5070 }, { "epoch": 2.263122324847939, "grad_norm": 0.5852407523976575, "learning_rate": 2.678055820944554e-06, "loss": 0.4263, "step": 5071 }, { "epoch": 2.2635728767740484, "grad_norm": 0.5704527452751306, "learning_rate": 2.67483900623407e-06, "loss": 0.4083, "step": 5072 }, { "epoch": 2.264023428700158, "grad_norm": 1.132603874695851, "learning_rate": 2.6716238263574333e-06, "loss": 0.4128, "step": 5073 }, { "epoch": 2.264473980626267, "grad_norm": 0.5777567136290404, "learning_rate": 2.668410282032211e-06, "loss": 0.3701, "step": 5074 }, { "epoch": 2.2649245325523766, "grad_norm": 0.5972532605613854, "learning_rate": 2.6651983739756026e-06, "loss": 0.4252, "step": 5075 }, { "epoch": 2.265375084478486, "grad_norm": 0.585929423394179, "learning_rate": 2.6619881029044435e-06, "loss": 0.4085, "step": 5076 }, { "epoch": 2.2658256364045957, "grad_norm": 0.5899983435726325, "learning_rate": 2.6587794695352065e-06, "loss": 0.4039, "step": 5077 }, { "epoch": 2.266276188330705, "grad_norm": 0.5613213475522862, "learning_rate": 2.6555724745839927e-06, "loss": 0.3884, "step": 5078 }, { "epoch": 2.2667267402568148, "grad_norm": 0.591486683845253, "learning_rate": 2.6523671187665446e-06, "loss": 0.4058, "step": 5079 }, { "epoch": 2.2671772921829243, "grad_norm": 0.588059624670295, "learning_rate": 2.649163402798233e-06, "loss": 0.3858, "step": 5080 }, { "epoch": 2.2676278441090334, "grad_norm": 0.5800806042585267, "learning_rate": 2.6459613273940653e-06, "loss": 0.3663, "step": 5081 }, { "epoch": 2.268078396035143, "grad_norm": 0.609422675254381, "learning_rate": 2.642760893268684e-06, "loss": 0.3772, "step": 5082 }, { "epoch": 2.2685289479612525, "grad_norm": 0.6288253707349463, "learning_rate": 2.6395621011363627e-06, "loss": 0.404, "step": 5083 }, { "epoch": 2.268979499887362, "grad_norm": 0.5841081495347527, "learning_rate": 2.6363649517110104e-06, "loss": 0.3842, "step": 5084 }, { "epoch": 2.2694300518134716, "grad_norm": 0.6226676266095276, "learning_rate": 2.6331694457061685e-06, "loss": 0.4107, "step": 5085 }, { "epoch": 2.269880603739581, "grad_norm": 0.5914528252462053, "learning_rate": 2.6299755838350126e-06, "loss": 0.4111, "step": 5086 }, { "epoch": 2.2703311556656907, "grad_norm": 0.5768928860515906, "learning_rate": 2.626783366810354e-06, "loss": 0.4167, "step": 5087 }, { "epoch": 2.2707817075917998, "grad_norm": 0.613830819273085, "learning_rate": 2.623592795344623e-06, "loss": 0.4016, "step": 5088 }, { "epoch": 2.2712322595179093, "grad_norm": 0.5904702357981296, "learning_rate": 2.6204038701499056e-06, "loss": 0.3878, "step": 5089 }, { "epoch": 2.271682811444019, "grad_norm": 0.565834950867798, "learning_rate": 2.6172165919378966e-06, "loss": 0.3997, "step": 5090 }, { "epoch": 2.2721333633701284, "grad_norm": 0.5771433362453717, "learning_rate": 2.6140309614199478e-06, "loss": 0.3803, "step": 5091 }, { "epoch": 2.272583915296238, "grad_norm": 0.5763048919024705, "learning_rate": 2.610846979307016e-06, "loss": 0.3964, "step": 5092 }, { "epoch": 2.2730344672223475, "grad_norm": 0.5906700794289652, "learning_rate": 2.607664646309718e-06, "loss": 0.3677, "step": 5093 }, { "epoch": 2.273485019148457, "grad_norm": 0.5737102455367236, "learning_rate": 2.6044839631382758e-06, "loss": 0.4133, "step": 5094 }, { "epoch": 2.273935571074566, "grad_norm": 0.6127779029827659, "learning_rate": 2.6013049305025674e-06, "loss": 0.4309, "step": 5095 }, { "epoch": 2.2743861230006757, "grad_norm": 0.6073571476396038, "learning_rate": 2.5981275491120837e-06, "loss": 0.4088, "step": 5096 }, { "epoch": 2.274836674926785, "grad_norm": 0.6028711404729804, "learning_rate": 2.594951819675958e-06, "loss": 0.403, "step": 5097 }, { "epoch": 2.2752872268528948, "grad_norm": 0.6234104556150448, "learning_rate": 2.59177774290295e-06, "loss": 0.3887, "step": 5098 }, { "epoch": 2.2757377787790043, "grad_norm": 0.5869384011051046, "learning_rate": 2.5886053195014537e-06, "loss": 0.4105, "step": 5099 }, { "epoch": 2.276188330705114, "grad_norm": 0.597364968839569, "learning_rate": 2.585434550179491e-06, "loss": 0.4009, "step": 5100 }, { "epoch": 2.2766388826312234, "grad_norm": 0.5747316765569898, "learning_rate": 2.5822654356447152e-06, "loss": 0.3939, "step": 5101 }, { "epoch": 2.277089434557333, "grad_norm": 0.6177056073109016, "learning_rate": 2.579097976604414e-06, "loss": 0.4254, "step": 5102 }, { "epoch": 2.277539986483442, "grad_norm": 0.6050235138445413, "learning_rate": 2.5759321737655017e-06, "loss": 0.3987, "step": 5103 }, { "epoch": 2.2779905384095516, "grad_norm": 0.6013779061086048, "learning_rate": 2.5727680278345226e-06, "loss": 0.4009, "step": 5104 }, { "epoch": 2.278441090335661, "grad_norm": 0.5861269180721286, "learning_rate": 2.5696055395176547e-06, "loss": 0.3943, "step": 5105 }, { "epoch": 2.2788916422617707, "grad_norm": 0.5882906186639869, "learning_rate": 2.5664447095207033e-06, "loss": 0.38, "step": 5106 }, { "epoch": 2.27934219418788, "grad_norm": 0.5979364273348614, "learning_rate": 2.563285538549104e-06, "loss": 0.3796, "step": 5107 }, { "epoch": 2.2797927461139897, "grad_norm": 0.5986294133976988, "learning_rate": 2.560128027307923e-06, "loss": 0.4101, "step": 5108 }, { "epoch": 2.2802432980400993, "grad_norm": 0.6011206807317828, "learning_rate": 2.5569721765018553e-06, "loss": 0.4184, "step": 5109 }, { "epoch": 2.280693849966209, "grad_norm": 0.5776676673195503, "learning_rate": 2.553817986835225e-06, "loss": 0.3941, "step": 5110 }, { "epoch": 2.281144401892318, "grad_norm": 0.6044605229256744, "learning_rate": 2.5506654590119908e-06, "loss": 0.3925, "step": 5111 }, { "epoch": 2.2815949538184275, "grad_norm": 0.5869664818265443, "learning_rate": 2.547514593735725e-06, "loss": 0.4152, "step": 5112 }, { "epoch": 2.282045505744537, "grad_norm": 0.5946026004290752, "learning_rate": 2.5443653917096523e-06, "loss": 0.3834, "step": 5113 }, { "epoch": 2.2824960576706466, "grad_norm": 0.5749572700687459, "learning_rate": 2.5412178536366005e-06, "loss": 0.3679, "step": 5114 }, { "epoch": 2.282946609596756, "grad_norm": 0.5857194288041045, "learning_rate": 2.5380719802190536e-06, "loss": 0.4049, "step": 5115 }, { "epoch": 2.2833971615228656, "grad_norm": 0.5822504612667294, "learning_rate": 2.534927772159095e-06, "loss": 0.391, "step": 5116 }, { "epoch": 2.283847713448975, "grad_norm": 0.5694265349765032, "learning_rate": 2.5317852301584642e-06, "loss": 0.4024, "step": 5117 }, { "epoch": 2.2842982653750843, "grad_norm": 0.5818131216509996, "learning_rate": 2.5286443549185035e-06, "loss": 0.3924, "step": 5118 }, { "epoch": 2.284748817301194, "grad_norm": 0.5902736105412502, "learning_rate": 2.525505147140204e-06, "loss": 0.4419, "step": 5119 }, { "epoch": 2.2851993692273034, "grad_norm": 0.5983271571117333, "learning_rate": 2.522367607524172e-06, "loss": 0.3909, "step": 5120 }, { "epoch": 2.285649921153413, "grad_norm": 0.5842788530311498, "learning_rate": 2.5192317367706487e-06, "loss": 0.3727, "step": 5121 }, { "epoch": 2.2861004730795225, "grad_norm": 0.5798003615026865, "learning_rate": 2.5160975355794993e-06, "loss": 0.4132, "step": 5122 }, { "epoch": 2.286551025005632, "grad_norm": 0.5757745608072461, "learning_rate": 2.5129650046502083e-06, "loss": 0.4012, "step": 5123 }, { "epoch": 2.2870015769317416, "grad_norm": 0.585964526765852, "learning_rate": 2.5098341446819097e-06, "loss": 0.3877, "step": 5124 }, { "epoch": 2.2874521288578507, "grad_norm": 0.6200039876282988, "learning_rate": 2.506704956373337e-06, "loss": 0.4025, "step": 5125 }, { "epoch": 2.28790268078396, "grad_norm": 0.5914702481773455, "learning_rate": 2.5035774404228765e-06, "loss": 0.3973, "step": 5126 }, { "epoch": 2.2883532327100697, "grad_norm": 0.5748004381674954, "learning_rate": 2.500451597528518e-06, "loss": 0.3804, "step": 5127 }, { "epoch": 2.2888037846361793, "grad_norm": 0.5820813450463201, "learning_rate": 2.4973274283879e-06, "loss": 0.3946, "step": 5128 }, { "epoch": 2.289254336562289, "grad_norm": 0.5800317710060037, "learning_rate": 2.4942049336982654e-06, "loss": 0.4031, "step": 5129 }, { "epoch": 2.2897048884883984, "grad_norm": 0.5848886897500469, "learning_rate": 2.4910841141565045e-06, "loss": 0.4161, "step": 5130 }, { "epoch": 2.290155440414508, "grad_norm": 0.5779833031775001, "learning_rate": 2.487964970459118e-06, "loss": 0.4011, "step": 5131 }, { "epoch": 2.290605992340617, "grad_norm": 0.5920337651591642, "learning_rate": 2.484847503302238e-06, "loss": 0.3892, "step": 5132 }, { "epoch": 2.2910565442667266, "grad_norm": 0.5657564579899363, "learning_rate": 2.4817317133816244e-06, "loss": 0.3695, "step": 5133 }, { "epoch": 2.291507096192836, "grad_norm": 0.5776302134738943, "learning_rate": 2.4786176013926613e-06, "loss": 0.3833, "step": 5134 }, { "epoch": 2.2919576481189456, "grad_norm": 0.5852847625620144, "learning_rate": 2.4755051680303576e-06, "loss": 0.4191, "step": 5135 }, { "epoch": 2.292408200045055, "grad_norm": 0.6000453378408713, "learning_rate": 2.4723944139893474e-06, "loss": 0.4474, "step": 5136 }, { "epoch": 2.2928587519711647, "grad_norm": 0.5971869772977373, "learning_rate": 2.469285339963892e-06, "loss": 0.4066, "step": 5137 }, { "epoch": 2.2933093038972743, "grad_norm": 0.5952697291963482, "learning_rate": 2.466177946647874e-06, "loss": 0.4081, "step": 5138 }, { "epoch": 2.293759855823384, "grad_norm": 0.5896899933814722, "learning_rate": 2.4630722347348066e-06, "loss": 0.3923, "step": 5139 }, { "epoch": 2.2942104077494934, "grad_norm": 0.5682285473803236, "learning_rate": 2.459968204917823e-06, "loss": 0.3882, "step": 5140 }, { "epoch": 2.2946609596756025, "grad_norm": 0.6050465191614709, "learning_rate": 2.456865857889681e-06, "loss": 0.4144, "step": 5141 }, { "epoch": 2.295111511601712, "grad_norm": 0.6109075888907979, "learning_rate": 2.4537651943427666e-06, "loss": 0.4128, "step": 5142 }, { "epoch": 2.2955620635278216, "grad_norm": 0.5775720985083288, "learning_rate": 2.4506662149690863e-06, "loss": 0.3914, "step": 5143 }, { "epoch": 2.296012615453931, "grad_norm": 0.5844709183901495, "learning_rate": 2.4475689204602726e-06, "loss": 0.3762, "step": 5144 }, { "epoch": 2.2964631673800406, "grad_norm": 0.5826750262191285, "learning_rate": 2.4444733115075823e-06, "loss": 0.3835, "step": 5145 }, { "epoch": 2.29691371930615, "grad_norm": 0.5986945282630355, "learning_rate": 2.4413793888018965e-06, "loss": 0.4089, "step": 5146 }, { "epoch": 2.2973642712322597, "grad_norm": 0.5929461685855215, "learning_rate": 2.4382871530337114e-06, "loss": 0.4152, "step": 5147 }, { "epoch": 2.297814823158369, "grad_norm": 0.5982068390006514, "learning_rate": 2.4351966048931653e-06, "loss": 0.3881, "step": 5148 }, { "epoch": 2.2982653750844784, "grad_norm": 0.5950869227455087, "learning_rate": 2.432107745069997e-06, "loss": 0.3623, "step": 5149 }, { "epoch": 2.298715927010588, "grad_norm": 0.5702938123081528, "learning_rate": 2.429020574253591e-06, "loss": 0.3908, "step": 5150 }, { "epoch": 2.2991664789366975, "grad_norm": 0.5585202296314914, "learning_rate": 2.425935093132934e-06, "loss": 0.3924, "step": 5151 }, { "epoch": 2.299617030862807, "grad_norm": 0.5890981834725917, "learning_rate": 2.422851302396655e-06, "loss": 0.397, "step": 5152 }, { "epoch": 2.3000675827889165, "grad_norm": 0.5969274550344023, "learning_rate": 2.4197692027329867e-06, "loss": 0.4036, "step": 5153 }, { "epoch": 2.300518134715026, "grad_norm": 0.5850572210990025, "learning_rate": 2.4166887948298046e-06, "loss": 0.4002, "step": 5154 }, { "epoch": 2.300968686641135, "grad_norm": 0.631006251703749, "learning_rate": 2.413610079374584e-06, "loss": 0.394, "step": 5155 }, { "epoch": 2.3014192385672447, "grad_norm": 0.5790722821612205, "learning_rate": 2.410533057054446e-06, "loss": 0.4084, "step": 5156 }, { "epoch": 2.3018697904933543, "grad_norm": 0.6105106542969919, "learning_rate": 2.407457728556115e-06, "loss": 0.4053, "step": 5157 }, { "epoch": 2.302320342419464, "grad_norm": 0.6145819451425762, "learning_rate": 2.404384094565947e-06, "loss": 0.4308, "step": 5158 }, { "epoch": 2.3027708943455734, "grad_norm": 0.5848012248783955, "learning_rate": 2.401312155769916e-06, "loss": 0.3714, "step": 5159 }, { "epoch": 2.303221446271683, "grad_norm": 0.5671624728881614, "learning_rate": 2.3982419128536218e-06, "loss": 0.4153, "step": 5160 }, { "epoch": 2.3036719981977924, "grad_norm": 0.5732562875951487, "learning_rate": 2.39517336650228e-06, "loss": 0.4169, "step": 5161 }, { "epoch": 2.3041225501239015, "grad_norm": 0.5975884079148038, "learning_rate": 2.392106517400733e-06, "loss": 0.3664, "step": 5162 }, { "epoch": 2.304573102050011, "grad_norm": 0.6013859176024015, "learning_rate": 2.3890413662334413e-06, "loss": 0.3882, "step": 5163 }, { "epoch": 2.3050236539761206, "grad_norm": 0.5799693770692895, "learning_rate": 2.3859779136844864e-06, "loss": 0.3769, "step": 5164 }, { "epoch": 2.30547420590223, "grad_norm": 0.5807505116486003, "learning_rate": 2.382916160437573e-06, "loss": 0.4006, "step": 5165 }, { "epoch": 2.3059247578283397, "grad_norm": 0.5733560488613761, "learning_rate": 2.379856107176024e-06, "loss": 0.4075, "step": 5166 }, { "epoch": 2.3063753097544493, "grad_norm": 0.5820018041028377, "learning_rate": 2.376797754582785e-06, "loss": 0.3933, "step": 5167 }, { "epoch": 2.306825861680559, "grad_norm": 0.6010394654511422, "learning_rate": 2.373741103340419e-06, "loss": 0.3879, "step": 5168 }, { "epoch": 2.3072764136066684, "grad_norm": 0.5634205474784597, "learning_rate": 2.370686154131112e-06, "loss": 0.3836, "step": 5169 }, { "epoch": 2.307726965532778, "grad_norm": 0.5729145581655732, "learning_rate": 2.367632907636671e-06, "loss": 0.4051, "step": 5170 }, { "epoch": 2.308177517458887, "grad_norm": 0.6067355873634634, "learning_rate": 2.3645813645385198e-06, "loss": 0.4063, "step": 5171 }, { "epoch": 2.3086280693849965, "grad_norm": 0.6014907574976228, "learning_rate": 2.361531525517704e-06, "loss": 0.3788, "step": 5172 }, { "epoch": 2.309078621311106, "grad_norm": 0.5863461686875358, "learning_rate": 2.3584833912548887e-06, "loss": 0.3833, "step": 5173 }, { "epoch": 2.3095291732372156, "grad_norm": 0.5663967689279332, "learning_rate": 2.3554369624303588e-06, "loss": 0.3811, "step": 5174 }, { "epoch": 2.309979725163325, "grad_norm": 0.5615576991789428, "learning_rate": 2.3523922397240163e-06, "loss": 0.361, "step": 5175 }, { "epoch": 2.3104302770894347, "grad_norm": 0.591054033418788, "learning_rate": 2.3493492238153857e-06, "loss": 0.3753, "step": 5176 }, { "epoch": 2.3108808290155443, "grad_norm": 0.5938303171706728, "learning_rate": 2.34630791538361e-06, "loss": 0.3931, "step": 5177 }, { "epoch": 2.3113313809416534, "grad_norm": 0.5803442567658015, "learning_rate": 2.3432683151074487e-06, "loss": 0.3646, "step": 5178 }, { "epoch": 2.311781932867763, "grad_norm": 0.577175878097552, "learning_rate": 2.3402304236652817e-06, "loss": 0.4023, "step": 5179 }, { "epoch": 2.3122324847938724, "grad_norm": 0.5867998435169861, "learning_rate": 2.3371942417351077e-06, "loss": 0.4078, "step": 5180 }, { "epoch": 2.312683036719982, "grad_norm": 0.5878216272423797, "learning_rate": 2.334159769994544e-06, "loss": 0.3778, "step": 5181 }, { "epoch": 2.3131335886460915, "grad_norm": 0.5711221339894099, "learning_rate": 2.331127009120826e-06, "loss": 0.4066, "step": 5182 }, { "epoch": 2.313584140572201, "grad_norm": 0.5822134650490167, "learning_rate": 2.328095959790809e-06, "loss": 0.3947, "step": 5183 }, { "epoch": 2.3140346924983106, "grad_norm": 0.5826934294221385, "learning_rate": 2.325066622680956e-06, "loss": 0.3926, "step": 5184 }, { "epoch": 2.3144852444244197, "grad_norm": 0.6310175655321492, "learning_rate": 2.3220389984673684e-06, "loss": 0.3936, "step": 5185 }, { "epoch": 2.3149357963505293, "grad_norm": 0.5883517948132193, "learning_rate": 2.3190130878257422e-06, "loss": 0.4023, "step": 5186 }, { "epoch": 2.315386348276639, "grad_norm": 0.5823004248028077, "learning_rate": 2.315988891431412e-06, "loss": 0.3959, "step": 5187 }, { "epoch": 2.3158369002027483, "grad_norm": 0.5816516825291684, "learning_rate": 2.3129664099593086e-06, "loss": 0.3727, "step": 5188 }, { "epoch": 2.316287452128858, "grad_norm": 0.5737893327686516, "learning_rate": 2.309945644084004e-06, "loss": 0.4094, "step": 5189 }, { "epoch": 2.3167380040549674, "grad_norm": 0.5884022387501994, "learning_rate": 2.3069265944796616e-06, "loss": 0.389, "step": 5190 }, { "epoch": 2.317188555981077, "grad_norm": 0.6158979204384091, "learning_rate": 2.3039092618200864e-06, "loss": 0.3924, "step": 5191 }, { "epoch": 2.317639107907186, "grad_norm": 0.611379565023254, "learning_rate": 2.3008936467786815e-06, "loss": 0.4251, "step": 5192 }, { "epoch": 2.3180896598332956, "grad_norm": 0.5991818553522245, "learning_rate": 2.2978797500284744e-06, "loss": 0.4068, "step": 5193 }, { "epoch": 2.318540211759405, "grad_norm": 0.616484064693829, "learning_rate": 2.2948675722421086e-06, "loss": 0.3928, "step": 5194 }, { "epoch": 2.3189907636855147, "grad_norm": 0.598443859910105, "learning_rate": 2.2918571140918456e-06, "loss": 0.3728, "step": 5195 }, { "epoch": 2.3194413156116243, "grad_norm": 0.5858855898539156, "learning_rate": 2.2888483762495594e-06, "loss": 0.3889, "step": 5196 }, { "epoch": 2.319891867537734, "grad_norm": 0.5819838754799985, "learning_rate": 2.2858413593867434e-06, "loss": 0.3839, "step": 5197 }, { "epoch": 2.3203424194638433, "grad_norm": 0.5845346076405864, "learning_rate": 2.282836064174504e-06, "loss": 0.3987, "step": 5198 }, { "epoch": 2.320792971389953, "grad_norm": 0.5944487615941348, "learning_rate": 2.2798324912835667e-06, "loss": 0.3927, "step": 5199 }, { "epoch": 2.321243523316062, "grad_norm": 0.5898261115524193, "learning_rate": 2.276830641384268e-06, "loss": 0.3895, "step": 5200 }, { "epoch": 2.3216940752421715, "grad_norm": 0.577408216194685, "learning_rate": 2.2738305151465646e-06, "loss": 0.4118, "step": 5201 }, { "epoch": 2.322144627168281, "grad_norm": 0.603022668601085, "learning_rate": 2.2708321132400257e-06, "loss": 0.349, "step": 5202 }, { "epoch": 2.3225951790943906, "grad_norm": 0.5740062546983512, "learning_rate": 2.267835436333837e-06, "loss": 0.3795, "step": 5203 }, { "epoch": 2.3230457310205, "grad_norm": 0.6227128224185019, "learning_rate": 2.2648404850967986e-06, "loss": 0.3921, "step": 5204 }, { "epoch": 2.3234962829466097, "grad_norm": 0.5905948373843533, "learning_rate": 2.2618472601973252e-06, "loss": 0.4077, "step": 5205 }, { "epoch": 2.3239468348727192, "grad_norm": 0.5843029296889014, "learning_rate": 2.258855762303447e-06, "loss": 0.3992, "step": 5206 }, { "epoch": 2.324397386798829, "grad_norm": 0.5767604049793331, "learning_rate": 2.2558659920828095e-06, "loss": 0.4119, "step": 5207 }, { "epoch": 2.324847938724938, "grad_norm": 0.5773222620397171, "learning_rate": 2.2528779502026652e-06, "loss": 0.409, "step": 5208 }, { "epoch": 2.3252984906510474, "grad_norm": 0.5894783990748798, "learning_rate": 2.249891637329897e-06, "loss": 0.4202, "step": 5209 }, { "epoch": 2.325749042577157, "grad_norm": 0.5744612506965032, "learning_rate": 2.2469070541309814e-06, "loss": 0.3906, "step": 5210 }, { "epoch": 2.3261995945032665, "grad_norm": 0.6163572089984198, "learning_rate": 2.2439242012720295e-06, "loss": 0.4529, "step": 5211 }, { "epoch": 2.326650146429376, "grad_norm": 0.5708150435758207, "learning_rate": 2.240943079418747e-06, "loss": 0.3993, "step": 5212 }, { "epoch": 2.3271006983554856, "grad_norm": 0.5785723895973116, "learning_rate": 2.237963689236472e-06, "loss": 0.3905, "step": 5213 }, { "epoch": 2.327551250281595, "grad_norm": 0.5964580999960862, "learning_rate": 2.234986031390136e-06, "loss": 0.4072, "step": 5214 }, { "epoch": 2.3280018022077043, "grad_norm": 0.596765445707849, "learning_rate": 2.2320101065443055e-06, "loss": 0.3967, "step": 5215 }, { "epoch": 2.328452354133814, "grad_norm": 0.5598325899685477, "learning_rate": 2.2290359153631392e-06, "loss": 0.3764, "step": 5216 }, { "epoch": 2.3289029060599233, "grad_norm": 0.6069143125608218, "learning_rate": 2.226063458510428e-06, "loss": 0.4034, "step": 5217 }, { "epoch": 2.329353457986033, "grad_norm": 0.6152193247178187, "learning_rate": 2.22309273664956e-06, "loss": 0.4234, "step": 5218 }, { "epoch": 2.3298040099121424, "grad_norm": 0.5829654686986409, "learning_rate": 2.2201237504435413e-06, "loss": 0.3999, "step": 5219 }, { "epoch": 2.330254561838252, "grad_norm": 0.5951467822236118, "learning_rate": 2.2171565005550013e-06, "loss": 0.3919, "step": 5220 }, { "epoch": 2.3307051137643615, "grad_norm": 0.5927537962246556, "learning_rate": 2.2141909876461607e-06, "loss": 0.4062, "step": 5221 }, { "epoch": 2.3311556656904706, "grad_norm": 0.5755125854859493, "learning_rate": 2.211227212378877e-06, "loss": 0.3846, "step": 5222 }, { "epoch": 2.33160621761658, "grad_norm": 0.5784496872987631, "learning_rate": 2.2082651754145956e-06, "loss": 0.3994, "step": 5223 }, { "epoch": 2.3320567695426897, "grad_norm": 0.5721400205996313, "learning_rate": 2.2053048774143957e-06, "loss": 0.3745, "step": 5224 }, { "epoch": 2.3325073214687992, "grad_norm": 0.5767983313509821, "learning_rate": 2.2023463190389483e-06, "loss": 0.3996, "step": 5225 }, { "epoch": 2.332957873394909, "grad_norm": 0.6128833876604325, "learning_rate": 2.1993895009485576e-06, "loss": 0.3944, "step": 5226 }, { "epoch": 2.3334084253210183, "grad_norm": 0.5725462557691441, "learning_rate": 2.196434423803119e-06, "loss": 0.3974, "step": 5227 }, { "epoch": 2.333858977247128, "grad_norm": 0.6230895780764247, "learning_rate": 2.1934810882621515e-06, "loss": 0.3935, "step": 5228 }, { "epoch": 2.334309529173237, "grad_norm": 0.6159312959447146, "learning_rate": 2.190529494984782e-06, "loss": 0.4406, "step": 5229 }, { "epoch": 2.3347600810993465, "grad_norm": 0.5950528029550327, "learning_rate": 2.1875796446297494e-06, "loss": 0.398, "step": 5230 }, { "epoch": 2.335210633025456, "grad_norm": 0.595508544382792, "learning_rate": 2.1846315378554027e-06, "loss": 0.4013, "step": 5231 }, { "epoch": 2.3356611849515656, "grad_norm": 0.5936698855206185, "learning_rate": 2.1816851753197023e-06, "loss": 0.3898, "step": 5232 }, { "epoch": 2.336111736877675, "grad_norm": 0.5751156049285213, "learning_rate": 2.17874055768022e-06, "loss": 0.3845, "step": 5233 }, { "epoch": 2.3365622888037847, "grad_norm": 0.574792926325374, "learning_rate": 2.1757976855941355e-06, "loss": 0.4011, "step": 5234 }, { "epoch": 2.3370128407298942, "grad_norm": 0.6047617181417918, "learning_rate": 2.172856559718243e-06, "loss": 0.3885, "step": 5235 }, { "epoch": 2.337463392656004, "grad_norm": 0.5742357761444595, "learning_rate": 2.1699171807089414e-06, "loss": 0.3601, "step": 5236 }, { "epoch": 2.3379139445821133, "grad_norm": 0.5991304596684898, "learning_rate": 2.1669795492222466e-06, "loss": 0.411, "step": 5237 }, { "epoch": 2.3383644965082224, "grad_norm": 0.6002550747276356, "learning_rate": 2.16404366591378e-06, "loss": 0.4411, "step": 5238 }, { "epoch": 2.338815048434332, "grad_norm": 0.6034426681828804, "learning_rate": 2.161109531438772e-06, "loss": 0.3883, "step": 5239 }, { "epoch": 2.3392656003604415, "grad_norm": 0.589447299468251, "learning_rate": 2.1581771464520672e-06, "loss": 0.3977, "step": 5240 }, { "epoch": 2.339716152286551, "grad_norm": 0.6003549740504405, "learning_rate": 2.1552465116081146e-06, "loss": 0.3996, "step": 5241 }, { "epoch": 2.3401667042126606, "grad_norm": 0.580159093151861, "learning_rate": 2.152317627560979e-06, "loss": 0.3792, "step": 5242 }, { "epoch": 2.34061725613877, "grad_norm": 0.6040537804146219, "learning_rate": 2.149390494964323e-06, "loss": 0.4029, "step": 5243 }, { "epoch": 2.3410678080648797, "grad_norm": 0.6120185038299594, "learning_rate": 2.1464651144714357e-06, "loss": 0.3854, "step": 5244 }, { "epoch": 2.341518359990989, "grad_norm": 0.6116097566188518, "learning_rate": 2.1435414867351943e-06, "loss": 0.412, "step": 5245 }, { "epoch": 2.3419689119170983, "grad_norm": 0.5866764399518631, "learning_rate": 2.140619612408108e-06, "loss": 0.3908, "step": 5246 }, { "epoch": 2.342419463843208, "grad_norm": 0.620494174539203, "learning_rate": 2.137699492142269e-06, "loss": 0.4115, "step": 5247 }, { "epoch": 2.3428700157693174, "grad_norm": 0.6056267334267387, "learning_rate": 2.1347811265894047e-06, "loss": 0.3793, "step": 5248 }, { "epoch": 2.343320567695427, "grad_norm": 0.604154493370662, "learning_rate": 2.131864516400827e-06, "loss": 0.3994, "step": 5249 }, { "epoch": 2.3437711196215365, "grad_norm": 0.6051836041085208, "learning_rate": 2.1289496622274754e-06, "loss": 0.3814, "step": 5250 }, { "epoch": 2.344221671547646, "grad_norm": 0.6036603644234741, "learning_rate": 2.12603656471988e-06, "loss": 0.4073, "step": 5251 }, { "epoch": 2.344672223473755, "grad_norm": 0.5905620239695072, "learning_rate": 2.1231252245281975e-06, "loss": 0.4079, "step": 5252 }, { "epoch": 2.3451227753998647, "grad_norm": 0.5833662599917084, "learning_rate": 2.1202156423021746e-06, "loss": 0.3967, "step": 5253 }, { "epoch": 2.3455733273259742, "grad_norm": 0.5798504035721863, "learning_rate": 2.1173078186911766e-06, "loss": 0.4111, "step": 5254 }, { "epoch": 2.3460238792520838, "grad_norm": 0.5841829528450938, "learning_rate": 2.114401754344173e-06, "loss": 0.3941, "step": 5255 }, { "epoch": 2.3464744311781933, "grad_norm": 0.6914194219255644, "learning_rate": 2.111497449909741e-06, "loss": 0.4113, "step": 5256 }, { "epoch": 2.346924983104303, "grad_norm": 0.5778106548899357, "learning_rate": 2.1085949060360654e-06, "loss": 0.3772, "step": 5257 }, { "epoch": 2.3473755350304124, "grad_norm": 0.6175845124335833, "learning_rate": 2.1056941233709373e-06, "loss": 0.4388, "step": 5258 }, { "epoch": 2.3478260869565215, "grad_norm": 0.6064751575135016, "learning_rate": 2.1027951025617555e-06, "loss": 0.3785, "step": 5259 }, { "epoch": 2.348276638882631, "grad_norm": 0.5711567301313331, "learning_rate": 2.099897844255524e-06, "loss": 0.3809, "step": 5260 }, { "epoch": 2.3487271908087406, "grad_norm": 0.5806536813482311, "learning_rate": 2.0970023490988567e-06, "loss": 0.3672, "step": 5261 }, { "epoch": 2.34917774273485, "grad_norm": 0.5909724435432403, "learning_rate": 2.094108617737971e-06, "loss": 0.3897, "step": 5262 }, { "epoch": 2.3496282946609597, "grad_norm": 0.5963451273857822, "learning_rate": 2.0912166508186904e-06, "loss": 0.3606, "step": 5263 }, { "epoch": 2.3500788465870692, "grad_norm": 0.5865527152526128, "learning_rate": 2.0883264489864476e-06, "loss": 0.3993, "step": 5264 }, { "epoch": 2.3505293985131788, "grad_norm": 0.5751291149382308, "learning_rate": 2.0854380128862796e-06, "loss": 0.4064, "step": 5265 }, { "epoch": 2.3509799504392883, "grad_norm": 0.5868574561011539, "learning_rate": 2.0825513431628277e-06, "loss": 0.4009, "step": 5266 }, { "epoch": 2.351430502365398, "grad_norm": 0.5862668751072037, "learning_rate": 2.0796664404603416e-06, "loss": 0.3982, "step": 5267 }, { "epoch": 2.351881054291507, "grad_norm": 0.5773070142752513, "learning_rate": 2.076783305422675e-06, "loss": 0.3889, "step": 5268 }, { "epoch": 2.3523316062176165, "grad_norm": 0.5984599483593631, "learning_rate": 2.073901938693289e-06, "loss": 0.4, "step": 5269 }, { "epoch": 2.352782158143726, "grad_norm": 0.595041761394708, "learning_rate": 2.0710223409152474e-06, "loss": 0.4143, "step": 5270 }, { "epoch": 2.3532327100698356, "grad_norm": 0.6001285181064244, "learning_rate": 2.0681445127312218e-06, "loss": 0.3952, "step": 5271 }, { "epoch": 2.353683261995945, "grad_norm": 0.5660447537268545, "learning_rate": 2.0652684547834865e-06, "loss": 0.3884, "step": 5272 }, { "epoch": 2.3541338139220547, "grad_norm": 0.6040877743007103, "learning_rate": 2.0623941677139235e-06, "loss": 0.3966, "step": 5273 }, { "epoch": 2.354584365848164, "grad_norm": 0.5971334683412917, "learning_rate": 2.059521652164016e-06, "loss": 0.4067, "step": 5274 }, { "epoch": 2.3550349177742733, "grad_norm": 0.5825190645506324, "learning_rate": 2.0566509087748542e-06, "loss": 0.4242, "step": 5275 }, { "epoch": 2.355485469700383, "grad_norm": 0.5866024897821357, "learning_rate": 2.0537819381871325e-06, "loss": 0.3881, "step": 5276 }, { "epoch": 2.3559360216264924, "grad_norm": 0.5690361414654246, "learning_rate": 2.0509147410411503e-06, "loss": 0.3814, "step": 5277 }, { "epoch": 2.356386573552602, "grad_norm": 0.5889959847234523, "learning_rate": 2.048049317976809e-06, "loss": 0.402, "step": 5278 }, { "epoch": 2.3568371254787115, "grad_norm": 0.6364726089491719, "learning_rate": 2.0451856696336204e-06, "loss": 0.3901, "step": 5279 }, { "epoch": 2.357287677404821, "grad_norm": 0.6094189681931584, "learning_rate": 2.0423237966506838e-06, "loss": 0.3986, "step": 5280 }, { "epoch": 2.3577382293309306, "grad_norm": 0.5884312672753123, "learning_rate": 2.039463699666727e-06, "loss": 0.3818, "step": 5281 }, { "epoch": 2.3581887812570397, "grad_norm": 0.5676560764858046, "learning_rate": 2.0366053793200567e-06, "loss": 0.3962, "step": 5282 }, { "epoch": 2.3586393331831492, "grad_norm": 0.5912148131661461, "learning_rate": 2.033748836248607e-06, "loss": 0.3992, "step": 5283 }, { "epoch": 2.3590898851092588, "grad_norm": 0.5781329025121114, "learning_rate": 2.0308940710898885e-06, "loss": 0.3793, "step": 5284 }, { "epoch": 2.3595404370353683, "grad_norm": 0.5877504474942852, "learning_rate": 2.0280410844810426e-06, "loss": 0.3753, "step": 5285 }, { "epoch": 2.359990988961478, "grad_norm": 0.6007254334917741, "learning_rate": 2.0251898770587897e-06, "loss": 0.3867, "step": 5286 }, { "epoch": 2.3604415408875874, "grad_norm": 0.5929176807075444, "learning_rate": 2.0223404494594745e-06, "loss": 0.3946, "step": 5287 }, { "epoch": 2.360892092813697, "grad_norm": 0.5895070951827643, "learning_rate": 2.0194928023190263e-06, "loss": 0.398, "step": 5288 }, { "epoch": 2.361342644739806, "grad_norm": 0.6134200131297066, "learning_rate": 2.0166469362729868e-06, "loss": 0.4041, "step": 5289 }, { "epoch": 2.3617931966659156, "grad_norm": 0.5659720986252204, "learning_rate": 2.013802851956498e-06, "loss": 0.3776, "step": 5290 }, { "epoch": 2.362243748592025, "grad_norm": 0.5776922093729988, "learning_rate": 2.010960550004305e-06, "loss": 0.4071, "step": 5291 }, { "epoch": 2.3626943005181347, "grad_norm": 0.5671680965282457, "learning_rate": 2.008120031050753e-06, "loss": 0.3607, "step": 5292 }, { "epoch": 2.363144852444244, "grad_norm": 0.598416475756684, "learning_rate": 2.005281295729793e-06, "loss": 0.3944, "step": 5293 }, { "epoch": 2.3635954043703538, "grad_norm": 0.5702599942709169, "learning_rate": 2.0024443446749743e-06, "loss": 0.3943, "step": 5294 }, { "epoch": 2.3640459562964633, "grad_norm": 0.5760258127623584, "learning_rate": 1.9996091785194492e-06, "loss": 0.3973, "step": 5295 }, { "epoch": 2.3644965082225724, "grad_norm": 0.5931155830544502, "learning_rate": 1.9967757978959723e-06, "loss": 0.3809, "step": 5296 }, { "epoch": 2.364947060148682, "grad_norm": 0.5862205170122843, "learning_rate": 1.9939442034368983e-06, "loss": 0.371, "step": 5297 }, { "epoch": 2.3653976120747915, "grad_norm": 0.585760695178453, "learning_rate": 1.9911143957741853e-06, "loss": 0.4015, "step": 5298 }, { "epoch": 2.365848164000901, "grad_norm": 0.5969355995065035, "learning_rate": 1.988286375539391e-06, "loss": 0.3869, "step": 5299 }, { "epoch": 2.3662987159270106, "grad_norm": 0.595609661755352, "learning_rate": 1.9854601433636755e-06, "loss": 0.4177, "step": 5300 }, { "epoch": 2.36674926785312, "grad_norm": 0.5802002311372934, "learning_rate": 1.9826356998777975e-06, "loss": 0.4193, "step": 5301 }, { "epoch": 2.3671998197792297, "grad_norm": 0.6093840268595989, "learning_rate": 1.979813045712119e-06, "loss": 0.4144, "step": 5302 }, { "epoch": 2.367650371705339, "grad_norm": 0.6219864838657613, "learning_rate": 1.976992181496604e-06, "loss": 0.3915, "step": 5303 }, { "epoch": 2.3681009236314488, "grad_norm": 0.5842618478119553, "learning_rate": 1.974173107860806e-06, "loss": 0.3652, "step": 5304 }, { "epoch": 2.368551475557558, "grad_norm": 0.5863269357485388, "learning_rate": 1.9713558254339003e-06, "loss": 0.3988, "step": 5305 }, { "epoch": 2.3690020274836674, "grad_norm": 0.5922385893191536, "learning_rate": 1.9685403348446374e-06, "loss": 0.4074, "step": 5306 }, { "epoch": 2.369452579409777, "grad_norm": 0.6064670478612857, "learning_rate": 1.96572663672139e-06, "loss": 0.3959, "step": 5307 }, { "epoch": 2.3699031313358865, "grad_norm": 0.5817466985689858, "learning_rate": 1.962914731692113e-06, "loss": 0.3916, "step": 5308 }, { "epoch": 2.370353683261996, "grad_norm": 0.5597486028903724, "learning_rate": 1.9601046203843767e-06, "loss": 0.3706, "step": 5309 }, { "epoch": 2.3708042351881056, "grad_norm": 0.628521591002918, "learning_rate": 1.9572963034253343e-06, "loss": 0.4237, "step": 5310 }, { "epoch": 2.371254787114215, "grad_norm": 0.596883654756742, "learning_rate": 1.9544897814417584e-06, "loss": 0.3802, "step": 5311 }, { "epoch": 2.371705339040324, "grad_norm": 0.5990691122391303, "learning_rate": 1.9516850550599996e-06, "loss": 0.3778, "step": 5312 }, { "epoch": 2.3721558909664338, "grad_norm": 0.5828798975752933, "learning_rate": 1.9488821249060297e-06, "loss": 0.3815, "step": 5313 }, { "epoch": 2.3726064428925433, "grad_norm": 0.5921720318889564, "learning_rate": 1.946080991605399e-06, "loss": 0.4279, "step": 5314 }, { "epoch": 2.373056994818653, "grad_norm": 0.600416783696914, "learning_rate": 1.943281655783269e-06, "loss": 0.3881, "step": 5315 }, { "epoch": 2.3735075467447624, "grad_norm": 0.5952339879918491, "learning_rate": 1.9404841180643975e-06, "loss": 0.3865, "step": 5316 }, { "epoch": 2.373958098670872, "grad_norm": 0.5918047107668051, "learning_rate": 1.9376883790731417e-06, "loss": 0.4138, "step": 5317 }, { "epoch": 2.3744086505969815, "grad_norm": 0.5753252021507699, "learning_rate": 1.9348944394334536e-06, "loss": 0.4077, "step": 5318 }, { "epoch": 2.3748592025230906, "grad_norm": 0.5997713865766421, "learning_rate": 1.9321022997688866e-06, "loss": 0.3833, "step": 5319 }, { "epoch": 2.3753097544492, "grad_norm": 0.5836784023026752, "learning_rate": 1.929311960702599e-06, "loss": 0.4034, "step": 5320 }, { "epoch": 2.3757603063753097, "grad_norm": 0.5945624954452409, "learning_rate": 1.9265234228573284e-06, "loss": 0.3856, "step": 5321 }, { "epoch": 2.376210858301419, "grad_norm": 0.6016760609807972, "learning_rate": 1.9237366868554353e-06, "loss": 0.4093, "step": 5322 }, { "epoch": 2.3766614102275287, "grad_norm": 0.5819874532034995, "learning_rate": 1.9209517533188547e-06, "loss": 0.4018, "step": 5323 }, { "epoch": 2.3771119621536383, "grad_norm": 0.6025673136351423, "learning_rate": 1.9181686228691354e-06, "loss": 0.3909, "step": 5324 }, { "epoch": 2.377562514079748, "grad_norm": 0.6075232946130118, "learning_rate": 1.9153872961274158e-06, "loss": 0.4056, "step": 5325 }, { "epoch": 2.378013066005857, "grad_norm": 0.5826718199563401, "learning_rate": 1.912607773714433e-06, "loss": 0.3965, "step": 5326 }, { "epoch": 2.3784636179319665, "grad_norm": 0.5804087935133815, "learning_rate": 1.9098300562505266e-06, "loss": 0.3924, "step": 5327 }, { "epoch": 2.378914169858076, "grad_norm": 0.5659630477288573, "learning_rate": 1.9070541443556257e-06, "loss": 0.3828, "step": 5328 }, { "epoch": 2.3793647217841856, "grad_norm": 0.58964232982469, "learning_rate": 1.904280038649261e-06, "loss": 0.4025, "step": 5329 }, { "epoch": 2.379815273710295, "grad_norm": 0.6088708646505878, "learning_rate": 1.9015077397505599e-06, "loss": 0.3864, "step": 5330 }, { "epoch": 2.3802658256364047, "grad_norm": 0.5890103585951117, "learning_rate": 1.8987372482782452e-06, "loss": 0.3838, "step": 5331 }, { "epoch": 2.380716377562514, "grad_norm": 0.5892958042062469, "learning_rate": 1.8959685648506365e-06, "loss": 0.426, "step": 5332 }, { "epoch": 2.3811669294886237, "grad_norm": 0.5842974642797449, "learning_rate": 1.8932016900856498e-06, "loss": 0.3764, "step": 5333 }, { "epoch": 2.3816174814147333, "grad_norm": 0.6013469110121266, "learning_rate": 1.8904366246007998e-06, "loss": 0.396, "step": 5334 }, { "epoch": 2.3820680333408424, "grad_norm": 0.5831463821687616, "learning_rate": 1.8876733690131932e-06, "loss": 0.3974, "step": 5335 }, { "epoch": 2.382518585266952, "grad_norm": 0.5961214884156689, "learning_rate": 1.8849119239395365e-06, "loss": 0.3994, "step": 5336 }, { "epoch": 2.3829691371930615, "grad_norm": 0.5971889003254263, "learning_rate": 1.8821522899961309e-06, "loss": 0.3915, "step": 5337 }, { "epoch": 2.383419689119171, "grad_norm": 0.5820778449981169, "learning_rate": 1.879394467798874e-06, "loss": 0.3896, "step": 5338 }, { "epoch": 2.3838702410452806, "grad_norm": 0.5884509562915253, "learning_rate": 1.8766384579632524e-06, "loss": 0.417, "step": 5339 }, { "epoch": 2.38432079297139, "grad_norm": 0.58013566442807, "learning_rate": 1.8738842611043629e-06, "loss": 0.3904, "step": 5340 }, { "epoch": 2.3847713448974996, "grad_norm": 0.5822000836675103, "learning_rate": 1.8711318778368792e-06, "loss": 0.3872, "step": 5341 }, { "epoch": 2.3852218968236087, "grad_norm": 0.5898536884720752, "learning_rate": 1.8683813087750901e-06, "loss": 0.4083, "step": 5342 }, { "epoch": 2.3856724487497183, "grad_norm": 0.585995776571561, "learning_rate": 1.865632554532859e-06, "loss": 0.382, "step": 5343 }, { "epoch": 2.386123000675828, "grad_norm": 0.5903107772626288, "learning_rate": 1.8628856157236651e-06, "loss": 0.4051, "step": 5344 }, { "epoch": 2.3865735526019374, "grad_norm": 0.6010139442229631, "learning_rate": 1.8601404929605615e-06, "loss": 0.3963, "step": 5345 }, { "epoch": 2.387024104528047, "grad_norm": 0.6002702926387345, "learning_rate": 1.8573971868562158e-06, "loss": 0.403, "step": 5346 }, { "epoch": 2.3874746564541565, "grad_norm": 0.6015957805270038, "learning_rate": 1.8546556980228714e-06, "loss": 0.3876, "step": 5347 }, { "epoch": 2.387925208380266, "grad_norm": 0.609016300585024, "learning_rate": 1.8519160270723858e-06, "loss": 0.3922, "step": 5348 }, { "epoch": 2.388375760306375, "grad_norm": 0.6193765602078881, "learning_rate": 1.849178174616192e-06, "loss": 0.3662, "step": 5349 }, { "epoch": 2.3888263122324846, "grad_norm": 0.6010811413219204, "learning_rate": 1.8464421412653277e-06, "loss": 0.4207, "step": 5350 }, { "epoch": 2.389276864158594, "grad_norm": 0.5888884679989034, "learning_rate": 1.8437079276304236e-06, "loss": 0.3831, "step": 5351 }, { "epoch": 2.3897274160847037, "grad_norm": 0.581927390975639, "learning_rate": 1.840975534321703e-06, "loss": 0.3738, "step": 5352 }, { "epoch": 2.3901779680108133, "grad_norm": 0.5959419933994218, "learning_rate": 1.8382449619489818e-06, "loss": 0.4104, "step": 5353 }, { "epoch": 2.390628519936923, "grad_norm": 0.5866041407647163, "learning_rate": 1.835516211121673e-06, "loss": 0.3789, "step": 5354 }, { "epoch": 2.3910790718630324, "grad_norm": 0.5739904310655003, "learning_rate": 1.8327892824487792e-06, "loss": 0.4337, "step": 5355 }, { "epoch": 2.3915296237891415, "grad_norm": 0.585999546722017, "learning_rate": 1.8300641765388982e-06, "loss": 0.3712, "step": 5356 }, { "epoch": 2.391980175715251, "grad_norm": 0.605309015154977, "learning_rate": 1.8273408940002202e-06, "loss": 0.3789, "step": 5357 }, { "epoch": 2.3924307276413606, "grad_norm": 0.621982510790134, "learning_rate": 1.8246194354405312e-06, "loss": 0.4213, "step": 5358 }, { "epoch": 2.39288127956747, "grad_norm": 0.5949532850112123, "learning_rate": 1.8218998014672063e-06, "loss": 0.3994, "step": 5359 }, { "epoch": 2.3933318314935796, "grad_norm": 0.6025618215954214, "learning_rate": 1.8191819926872156e-06, "loss": 0.4089, "step": 5360 }, { "epoch": 2.393782383419689, "grad_norm": 0.5905188556006887, "learning_rate": 1.8164660097071208e-06, "loss": 0.4098, "step": 5361 }, { "epoch": 2.3942329353457987, "grad_norm": 0.5689796673824677, "learning_rate": 1.8137518531330768e-06, "loss": 0.3955, "step": 5362 }, { "epoch": 2.3946834872719083, "grad_norm": 0.5898916311756848, "learning_rate": 1.81103952357083e-06, "loss": 0.4195, "step": 5363 }, { "epoch": 2.395134039198018, "grad_norm": 0.5763235928443603, "learning_rate": 1.8083290216257242e-06, "loss": 0.3613, "step": 5364 }, { "epoch": 2.395584591124127, "grad_norm": 0.5839962945427954, "learning_rate": 1.8056203479026812e-06, "loss": 0.4159, "step": 5365 }, { "epoch": 2.3960351430502365, "grad_norm": 0.5802575796459958, "learning_rate": 1.8029135030062362e-06, "loss": 0.3858, "step": 5366 }, { "epoch": 2.396485694976346, "grad_norm": 0.5851639007850135, "learning_rate": 1.8002084875404935e-06, "loss": 0.3946, "step": 5367 }, { "epoch": 2.3969362469024555, "grad_norm": 0.5852839408679916, "learning_rate": 1.7975053021091683e-06, "loss": 0.4167, "step": 5368 }, { "epoch": 2.397386798828565, "grad_norm": 0.6148782426972973, "learning_rate": 1.7948039473155553e-06, "loss": 0.3985, "step": 5369 }, { "epoch": 2.3978373507546746, "grad_norm": 0.5893262344753329, "learning_rate": 1.792104423762545e-06, "loss": 0.4026, "step": 5370 }, { "epoch": 2.398287902680784, "grad_norm": 0.5793219745962814, "learning_rate": 1.7894067320526198e-06, "loss": 0.3633, "step": 5371 }, { "epoch": 2.3987384546068933, "grad_norm": 0.5910330943727898, "learning_rate": 1.7867108727878512e-06, "loss": 0.3834, "step": 5372 }, { "epoch": 2.399189006533003, "grad_norm": 0.5896980497065452, "learning_rate": 1.784016846569906e-06, "loss": 0.3884, "step": 5373 }, { "epoch": 2.3996395584591124, "grad_norm": 0.6139436523284415, "learning_rate": 1.7813246540000295e-06, "loss": 0.3908, "step": 5374 }, { "epoch": 2.400090110385222, "grad_norm": 0.6093178643568314, "learning_rate": 1.7786342956790782e-06, "loss": 0.3648, "step": 5375 }, { "epoch": 2.4005406623113315, "grad_norm": 0.6339894712606634, "learning_rate": 1.775945772207477e-06, "loss": 0.4005, "step": 5376 }, { "epoch": 2.400991214237441, "grad_norm": 0.5869513844908595, "learning_rate": 1.7732590841852637e-06, "loss": 0.3661, "step": 5377 }, { "epoch": 2.4014417661635505, "grad_norm": 0.5780198138704964, "learning_rate": 1.7705742322120434e-06, "loss": 0.4059, "step": 5378 }, { "epoch": 2.4018923180896596, "grad_norm": 0.6002433673989044, "learning_rate": 1.7678912168870332e-06, "loss": 0.3696, "step": 5379 }, { "epoch": 2.402342870015769, "grad_norm": 0.6120156158249941, "learning_rate": 1.765210038809021e-06, "loss": 0.401, "step": 5380 }, { "epoch": 2.4027934219418787, "grad_norm": 0.5886522230312213, "learning_rate": 1.7625306985764034e-06, "loss": 0.3799, "step": 5381 }, { "epoch": 2.4032439738679883, "grad_norm": 0.5834107532939076, "learning_rate": 1.7598531967871468e-06, "loss": 0.3913, "step": 5382 }, { "epoch": 2.403694525794098, "grad_norm": 0.5848669401646563, "learning_rate": 1.7571775340388275e-06, "loss": 0.3746, "step": 5383 }, { "epoch": 2.4041450777202074, "grad_norm": 0.5944542605465758, "learning_rate": 1.7545037109285946e-06, "loss": 0.4019, "step": 5384 }, { "epoch": 2.404595629646317, "grad_norm": 0.5897094054531226, "learning_rate": 1.751831728053195e-06, "loss": 0.3684, "step": 5385 }, { "epoch": 2.405046181572426, "grad_norm": 0.5761482118577906, "learning_rate": 1.7491615860089651e-06, "loss": 0.3814, "step": 5386 }, { "epoch": 2.4054967334985355, "grad_norm": 0.5973122749321289, "learning_rate": 1.746493285391827e-06, "loss": 0.3936, "step": 5387 }, { "epoch": 2.405947285424645, "grad_norm": 0.5967835907386434, "learning_rate": 1.7438268267972959e-06, "loss": 0.3743, "step": 5388 }, { "epoch": 2.4063978373507546, "grad_norm": 0.5941171246693888, "learning_rate": 1.7411622108204718e-06, "loss": 0.3985, "step": 5389 }, { "epoch": 2.406848389276864, "grad_norm": 0.6347874010168039, "learning_rate": 1.738499438056045e-06, "loss": 0.3943, "step": 5390 }, { "epoch": 2.4072989412029737, "grad_norm": 0.598992691543407, "learning_rate": 1.7358385090982965e-06, "loss": 0.3816, "step": 5391 }, { "epoch": 2.4077494931290833, "grad_norm": 0.5834649357358022, "learning_rate": 1.7331794245410926e-06, "loss": 0.4081, "step": 5392 }, { "epoch": 2.4082000450551924, "grad_norm": 0.5822109771494909, "learning_rate": 1.7305221849778909e-06, "loss": 0.3912, "step": 5393 }, { "epoch": 2.408650596981302, "grad_norm": 0.5689347776690252, "learning_rate": 1.7278667910017332e-06, "loss": 0.3861, "step": 5394 }, { "epoch": 2.4091011489074114, "grad_norm": 0.5900165733409486, "learning_rate": 1.7252132432052548e-06, "loss": 0.4255, "step": 5395 }, { "epoch": 2.409551700833521, "grad_norm": 0.5727807425156952, "learning_rate": 1.7225615421806741e-06, "loss": 0.3988, "step": 5396 }, { "epoch": 2.4100022527596305, "grad_norm": 0.5867361598166836, "learning_rate": 1.7199116885197996e-06, "loss": 0.3806, "step": 5397 }, { "epoch": 2.41045280468574, "grad_norm": 0.5769220637101588, "learning_rate": 1.7172636828140277e-06, "loss": 0.3896, "step": 5398 }, { "epoch": 2.4109033566118496, "grad_norm": 0.6162002020879719, "learning_rate": 1.7146175256543452e-06, "loss": 0.3886, "step": 5399 }, { "epoch": 2.411353908537959, "grad_norm": 0.5875997407519711, "learning_rate": 1.7119732176313142e-06, "loss": 0.3924, "step": 5400 }, { "epoch": 2.4118044604640687, "grad_norm": 0.5785852768396255, "learning_rate": 1.7093307593351027e-06, "loss": 0.3853, "step": 5401 }, { "epoch": 2.412255012390178, "grad_norm": 0.575767049860028, "learning_rate": 1.706690151355448e-06, "loss": 0.3623, "step": 5402 }, { "epoch": 2.4127055643162874, "grad_norm": 0.6039733745226157, "learning_rate": 1.7040513942816905e-06, "loss": 0.4238, "step": 5403 }, { "epoch": 2.413156116242397, "grad_norm": 0.6029261537378166, "learning_rate": 1.7014144887027406e-06, "loss": 0.3845, "step": 5404 }, { "epoch": 2.4136066681685064, "grad_norm": 0.5855956810149896, "learning_rate": 1.6987794352071152e-06, "loss": 0.3826, "step": 5405 }, { "epoch": 2.414057220094616, "grad_norm": 0.5769325328678332, "learning_rate": 1.6961462343828961e-06, "loss": 0.3992, "step": 5406 }, { "epoch": 2.4145077720207255, "grad_norm": 0.5809863625010036, "learning_rate": 1.693514886817772e-06, "loss": 0.4021, "step": 5407 }, { "epoch": 2.414958323946835, "grad_norm": 0.617014722685577, "learning_rate": 1.690885393099001e-06, "loss": 0.4123, "step": 5408 }, { "epoch": 2.415408875872944, "grad_norm": 0.6106688412085253, "learning_rate": 1.6882577538134425e-06, "loss": 0.4076, "step": 5409 }, { "epoch": 2.4158594277990537, "grad_norm": 0.5890144732825409, "learning_rate": 1.6856319695475287e-06, "loss": 0.406, "step": 5410 }, { "epoch": 2.4163099797251633, "grad_norm": 0.5996267025002974, "learning_rate": 1.6830080408872852e-06, "loss": 0.41, "step": 5411 }, { "epoch": 2.416760531651273, "grad_norm": 0.6011285359813879, "learning_rate": 1.6803859684183233e-06, "loss": 0.4061, "step": 5412 }, { "epoch": 2.4172110835773823, "grad_norm": 0.5735477168286985, "learning_rate": 1.6777657527258373e-06, "loss": 0.3748, "step": 5413 }, { "epoch": 2.417661635503492, "grad_norm": 0.5860352117484561, "learning_rate": 1.6751473943946094e-06, "loss": 0.381, "step": 5414 }, { "epoch": 2.4181121874296014, "grad_norm": 0.5679778122474195, "learning_rate": 1.6725308940090068e-06, "loss": 0.3782, "step": 5415 }, { "epoch": 2.4185627393557105, "grad_norm": 0.5712281458142077, "learning_rate": 1.6699162521529798e-06, "loss": 0.4046, "step": 5416 }, { "epoch": 2.41901329128182, "grad_norm": 0.5836707954744806, "learning_rate": 1.6673034694100655e-06, "loss": 0.406, "step": 5417 }, { "epoch": 2.4194638432079296, "grad_norm": 0.6030910222917485, "learning_rate": 1.6646925463633924e-06, "loss": 0.4195, "step": 5418 }, { "epoch": 2.419914395134039, "grad_norm": 0.5723714645711853, "learning_rate": 1.66208348359566e-06, "loss": 0.3927, "step": 5419 }, { "epoch": 2.4203649470601487, "grad_norm": 0.5760639592560871, "learning_rate": 1.6594762816891653e-06, "loss": 0.3941, "step": 5420 }, { "epoch": 2.4208154989862583, "grad_norm": 0.6080739643713958, "learning_rate": 1.6568709412257823e-06, "loss": 0.4034, "step": 5421 }, { "epoch": 2.421266050912368, "grad_norm": 0.5864887921139922, "learning_rate": 1.6542674627869738e-06, "loss": 0.3553, "step": 5422 }, { "epoch": 2.421716602838477, "grad_norm": 0.5758557247567537, "learning_rate": 1.651665846953786e-06, "loss": 0.402, "step": 5423 }, { "epoch": 2.4221671547645864, "grad_norm": 0.5816741162107062, "learning_rate": 1.6490660943068492e-06, "loss": 0.389, "step": 5424 }, { "epoch": 2.422617706690696, "grad_norm": 0.5784256656376655, "learning_rate": 1.646468205426377e-06, "loss": 0.3967, "step": 5425 }, { "epoch": 2.4230682586168055, "grad_norm": 0.5970799284027679, "learning_rate": 1.6438721808921687e-06, "loss": 0.4145, "step": 5426 }, { "epoch": 2.423518810542915, "grad_norm": 0.5779894197159275, "learning_rate": 1.6412780212836055e-06, "loss": 0.372, "step": 5427 }, { "epoch": 2.4239693624690246, "grad_norm": 0.6032599658307901, "learning_rate": 1.638685727179654e-06, "loss": 0.4142, "step": 5428 }, { "epoch": 2.424419914395134, "grad_norm": 0.5957112259154358, "learning_rate": 1.636095299158864e-06, "loss": 0.4204, "step": 5429 }, { "epoch": 2.4248704663212437, "grad_norm": 0.580261158326216, "learning_rate": 1.6335067377993697e-06, "loss": 0.3753, "step": 5430 }, { "epoch": 2.4253210182473532, "grad_norm": 0.5822678367446522, "learning_rate": 1.6309200436788864e-06, "loss": 0.4098, "step": 5431 }, { "epoch": 2.4257715701734623, "grad_norm": 0.5707860264883201, "learning_rate": 1.6283352173747148e-06, "loss": 0.3878, "step": 5432 }, { "epoch": 2.426222122099572, "grad_norm": 0.5931917694321235, "learning_rate": 1.6257522594637365e-06, "loss": 0.4136, "step": 5433 }, { "epoch": 2.4266726740256814, "grad_norm": 0.5715546496634129, "learning_rate": 1.6231711705224228e-06, "loss": 0.3868, "step": 5434 }, { "epoch": 2.427123225951791, "grad_norm": 0.6084138322525409, "learning_rate": 1.620591951126813e-06, "loss": 0.3769, "step": 5435 }, { "epoch": 2.4275737778779005, "grad_norm": 0.5981512129080885, "learning_rate": 1.61801460185255e-06, "loss": 0.4129, "step": 5436 }, { "epoch": 2.42802432980401, "grad_norm": 0.5974085256785736, "learning_rate": 1.6154391232748367e-06, "loss": 0.384, "step": 5437 }, { "epoch": 2.4284748817301196, "grad_norm": 0.6070963570601317, "learning_rate": 1.6128655159684824e-06, "loss": 0.4111, "step": 5438 }, { "epoch": 2.4289254336562287, "grad_norm": 0.5782054993945689, "learning_rate": 1.6102937805078544e-06, "loss": 0.4019, "step": 5439 }, { "epoch": 2.4293759855823382, "grad_norm": 0.5886040190781222, "learning_rate": 1.6077239174669245e-06, "loss": 0.3641, "step": 5440 }, { "epoch": 2.429826537508448, "grad_norm": 0.5667004130073947, "learning_rate": 1.6051559274192275e-06, "loss": 0.4004, "step": 5441 }, { "epoch": 2.4302770894345573, "grad_norm": 0.5789822479411743, "learning_rate": 1.6025898109378967e-06, "loss": 0.4092, "step": 5442 }, { "epoch": 2.430727641360667, "grad_norm": 0.571050288055405, "learning_rate": 1.6000255685956312e-06, "loss": 0.3867, "step": 5443 }, { "epoch": 2.4311781932867764, "grad_norm": 0.5982510351193303, "learning_rate": 1.5974632009647295e-06, "loss": 0.3959, "step": 5444 }, { "epoch": 2.431628745212886, "grad_norm": 0.5618772240166842, "learning_rate": 1.5949027086170555e-06, "loss": 0.3869, "step": 5445 }, { "epoch": 2.432079297138995, "grad_norm": 0.5889733273738749, "learning_rate": 1.5923440921240641e-06, "loss": 0.402, "step": 5446 }, { "epoch": 2.4325298490651046, "grad_norm": 0.5878430884497187, "learning_rate": 1.5897873520567875e-06, "loss": 0.3926, "step": 5447 }, { "epoch": 2.432980400991214, "grad_norm": 0.598449656371215, "learning_rate": 1.5872324889858415e-06, "loss": 0.3678, "step": 5448 }, { "epoch": 2.4334309529173237, "grad_norm": 0.6046800538401215, "learning_rate": 1.584679503481421e-06, "loss": 0.395, "step": 5449 }, { "epoch": 2.4338815048434332, "grad_norm": 0.586092717756696, "learning_rate": 1.582128396113305e-06, "loss": 0.4032, "step": 5450 }, { "epoch": 2.434332056769543, "grad_norm": 0.6040117260061514, "learning_rate": 1.5795791674508488e-06, "loss": 0.4134, "step": 5451 }, { "epoch": 2.4347826086956523, "grad_norm": 0.5756414369423003, "learning_rate": 1.5770318180629918e-06, "loss": 0.3842, "step": 5452 }, { "epoch": 2.4352331606217614, "grad_norm": 0.5848800939312908, "learning_rate": 1.5744863485182537e-06, "loss": 0.3977, "step": 5453 }, { "epoch": 2.435683712547871, "grad_norm": 0.5889910675248765, "learning_rate": 1.5719427593847325e-06, "loss": 0.3951, "step": 5454 }, { "epoch": 2.4361342644739805, "grad_norm": 0.5950307636192946, "learning_rate": 1.5694010512301095e-06, "loss": 0.4068, "step": 5455 }, { "epoch": 2.43658481640009, "grad_norm": 0.5857014022044827, "learning_rate": 1.5668612246216431e-06, "loss": 0.371, "step": 5456 }, { "epoch": 2.4370353683261996, "grad_norm": 0.5872387239660937, "learning_rate": 1.5643232801261731e-06, "loss": 0.3942, "step": 5457 }, { "epoch": 2.437485920252309, "grad_norm": 0.5927592020965857, "learning_rate": 1.5617872183101223e-06, "loss": 0.4184, "step": 5458 }, { "epoch": 2.4379364721784187, "grad_norm": 0.5900463101846036, "learning_rate": 1.5592530397394878e-06, "loss": 0.4085, "step": 5459 }, { "epoch": 2.4383870241045282, "grad_norm": 0.6094343033985489, "learning_rate": 1.5567207449798517e-06, "loss": 0.4409, "step": 5460 }, { "epoch": 2.4388375760306378, "grad_norm": 0.5924917895577857, "learning_rate": 1.554190334596367e-06, "loss": 0.3883, "step": 5461 }, { "epoch": 2.439288127956747, "grad_norm": 0.6205701521134828, "learning_rate": 1.5516618091537815e-06, "loss": 0.3754, "step": 5462 }, { "epoch": 2.4397386798828564, "grad_norm": 0.5867222673037611, "learning_rate": 1.5491351692164037e-06, "loss": 0.4111, "step": 5463 }, { "epoch": 2.440189231808966, "grad_norm": 0.6224289156987338, "learning_rate": 1.5466104153481387e-06, "loss": 0.4225, "step": 5464 }, { "epoch": 2.4406397837350755, "grad_norm": 0.6001764710685482, "learning_rate": 1.544087548112454e-06, "loss": 0.3945, "step": 5465 }, { "epoch": 2.441090335661185, "grad_norm": 0.6158894470065248, "learning_rate": 1.5415665680724123e-06, "loss": 0.4215, "step": 5466 }, { "epoch": 2.4415408875872946, "grad_norm": 0.5888428220287244, "learning_rate": 1.5390474757906449e-06, "loss": 0.3975, "step": 5467 }, { "epoch": 2.441991439513404, "grad_norm": 0.5748797542155321, "learning_rate": 1.5365302718293639e-06, "loss": 0.4036, "step": 5468 }, { "epoch": 2.4424419914395132, "grad_norm": 0.6039058060218322, "learning_rate": 1.5340149567503637e-06, "loss": 0.3973, "step": 5469 }, { "epoch": 2.442892543365623, "grad_norm": 0.6037544665535286, "learning_rate": 1.531501531115005e-06, "loss": 0.399, "step": 5470 }, { "epoch": 2.4433430952917323, "grad_norm": 0.6109201018420226, "learning_rate": 1.5289899954842457e-06, "loss": 0.3685, "step": 5471 }, { "epoch": 2.443793647217842, "grad_norm": 0.5583045447682592, "learning_rate": 1.5264803504186033e-06, "loss": 0.4094, "step": 5472 }, { "epoch": 2.4442441991439514, "grad_norm": 0.5808930250380029, "learning_rate": 1.5239725964781914e-06, "loss": 0.3708, "step": 5473 }, { "epoch": 2.444694751070061, "grad_norm": 0.5878442629977103, "learning_rate": 1.5214667342226818e-06, "loss": 0.383, "step": 5474 }, { "epoch": 2.4451453029961705, "grad_norm": 0.597392379778878, "learning_rate": 1.5189627642113436e-06, "loss": 0.3794, "step": 5475 }, { "epoch": 2.4455958549222796, "grad_norm": 0.6134474098727821, "learning_rate": 1.5164606870030053e-06, "loss": 0.4128, "step": 5476 }, { "epoch": 2.446046406848389, "grad_norm": 0.6041380011046018, "learning_rate": 1.513960503156091e-06, "loss": 0.4127, "step": 5477 }, { "epoch": 2.4464969587744987, "grad_norm": 0.6051595368101397, "learning_rate": 1.511462213228585e-06, "loss": 0.4001, "step": 5478 }, { "epoch": 2.4469475107006082, "grad_norm": 0.597299220147051, "learning_rate": 1.5089658177780653e-06, "loss": 0.4037, "step": 5479 }, { "epoch": 2.4473980626267178, "grad_norm": 0.5798396881923146, "learning_rate": 1.506471317361673e-06, "loss": 0.4006, "step": 5480 }, { "epoch": 2.4478486145528273, "grad_norm": 0.5893352869074142, "learning_rate": 1.5039787125361327e-06, "loss": 0.4046, "step": 5481 }, { "epoch": 2.448299166478937, "grad_norm": 0.5787425485498837, "learning_rate": 1.5014880038577485e-06, "loss": 0.3932, "step": 5482 }, { "epoch": 2.448749718405046, "grad_norm": 0.5663496555271672, "learning_rate": 1.4989991918823954e-06, "loss": 0.3896, "step": 5483 }, { "epoch": 2.4492002703311555, "grad_norm": 0.5911359983919069, "learning_rate": 1.4965122771655295e-06, "loss": 0.4123, "step": 5484 }, { "epoch": 2.449650822257265, "grad_norm": 0.5904776787005964, "learning_rate": 1.4940272602621808e-06, "loss": 0.4067, "step": 5485 }, { "epoch": 2.4501013741833746, "grad_norm": 0.5758256068348093, "learning_rate": 1.491544141726957e-06, "loss": 0.387, "step": 5486 }, { "epoch": 2.450551926109484, "grad_norm": 0.5790937154607549, "learning_rate": 1.489062922114044e-06, "loss": 0.3917, "step": 5487 }, { "epoch": 2.4510024780355937, "grad_norm": 0.5744234461181744, "learning_rate": 1.4865836019771995e-06, "loss": 0.3979, "step": 5488 }, { "epoch": 2.4514530299617032, "grad_norm": 0.5992423880288271, "learning_rate": 1.4841061818697612e-06, "loss": 0.4224, "step": 5489 }, { "epoch": 2.4519035818878123, "grad_norm": 0.5957899461839026, "learning_rate": 1.4816306623446397e-06, "loss": 0.4145, "step": 5490 }, { "epoch": 2.452354133813922, "grad_norm": 0.5769579154536658, "learning_rate": 1.4791570439543246e-06, "loss": 0.3972, "step": 5491 }, { "epoch": 2.4528046857400314, "grad_norm": 0.5964580233006983, "learning_rate": 1.4766853272508785e-06, "loss": 0.4202, "step": 5492 }, { "epoch": 2.453255237666141, "grad_norm": 0.5936744032667857, "learning_rate": 1.4742155127859404e-06, "loss": 0.3904, "step": 5493 }, { "epoch": 2.4537057895922505, "grad_norm": 0.5800580364921156, "learning_rate": 1.4717476011107256e-06, "loss": 0.3935, "step": 5494 }, { "epoch": 2.45415634151836, "grad_norm": 0.6036073365541248, "learning_rate": 1.4692815927760274e-06, "loss": 0.3906, "step": 5495 }, { "epoch": 2.4546068934444696, "grad_norm": 0.606729945889346, "learning_rate": 1.4668174883322017e-06, "loss": 0.3791, "step": 5496 }, { "epoch": 2.455057445370579, "grad_norm": 0.6060089915107281, "learning_rate": 1.4643552883292001e-06, "loss": 0.4012, "step": 5497 }, { "epoch": 2.4555079972966887, "grad_norm": 0.5864364140846814, "learning_rate": 1.4618949933165272e-06, "loss": 0.401, "step": 5498 }, { "epoch": 2.4559585492227978, "grad_norm": 0.5825753734420593, "learning_rate": 1.4594366038432838e-06, "loss": 0.3953, "step": 5499 }, { "epoch": 2.4564091011489073, "grad_norm": 0.5950246768781854, "learning_rate": 1.4569801204581246e-06, "loss": 0.3764, "step": 5500 }, { "epoch": 2.456859653075017, "grad_norm": 0.5709520831439581, "learning_rate": 1.4545255437092976e-06, "loss": 0.3756, "step": 5501 }, { "epoch": 2.4573102050011264, "grad_norm": 0.6052913624010491, "learning_rate": 1.4520728741446087e-06, "loss": 0.4127, "step": 5502 }, { "epoch": 2.457760756927236, "grad_norm": 0.5926753229056448, "learning_rate": 1.4496221123114562e-06, "loss": 0.4116, "step": 5503 }, { "epoch": 2.4582113088533455, "grad_norm": 0.591954824174321, "learning_rate": 1.447173258756791e-06, "loss": 0.3923, "step": 5504 }, { "epoch": 2.458661860779455, "grad_norm": 0.5694453485555812, "learning_rate": 1.4447263140271606e-06, "loss": 0.3935, "step": 5505 }, { "epoch": 2.459112412705564, "grad_norm": 0.6184775111917145, "learning_rate": 1.442281278668668e-06, "loss": 0.4122, "step": 5506 }, { "epoch": 2.4595629646316737, "grad_norm": 0.5867382412460614, "learning_rate": 1.4398381532270001e-06, "loss": 0.3951, "step": 5507 }, { "epoch": 2.460013516557783, "grad_norm": 0.5684819866846892, "learning_rate": 1.437396938247415e-06, "loss": 0.3972, "step": 5508 }, { "epoch": 2.4604640684838928, "grad_norm": 0.5906339442893541, "learning_rate": 1.4349576342747462e-06, "loss": 0.3921, "step": 5509 }, { "epoch": 2.4609146204100023, "grad_norm": 0.5863809999408218, "learning_rate": 1.4325202418533957e-06, "loss": 0.3888, "step": 5510 }, { "epoch": 2.461365172336112, "grad_norm": 0.5918072324738924, "learning_rate": 1.4300847615273449e-06, "loss": 0.393, "step": 5511 }, { "epoch": 2.4618157242622214, "grad_norm": 0.5735810201121231, "learning_rate": 1.4276511938401449e-06, "loss": 0.3798, "step": 5512 }, { "epoch": 2.4622662761883305, "grad_norm": 0.5877732510612991, "learning_rate": 1.425219539334921e-06, "loss": 0.3811, "step": 5513 }, { "epoch": 2.46271682811444, "grad_norm": 0.5970876210093439, "learning_rate": 1.4227897985543714e-06, "loss": 0.4, "step": 5514 }, { "epoch": 2.4631673800405496, "grad_norm": 0.5679314289489583, "learning_rate": 1.4203619720407657e-06, "loss": 0.3736, "step": 5515 }, { "epoch": 2.463617931966659, "grad_norm": 0.5965136363581581, "learning_rate": 1.4179360603359504e-06, "loss": 0.3683, "step": 5516 }, { "epoch": 2.4640684838927687, "grad_norm": 0.5763275258059674, "learning_rate": 1.4155120639813392e-06, "loss": 0.3787, "step": 5517 }, { "epoch": 2.464519035818878, "grad_norm": 0.6158896678269274, "learning_rate": 1.4130899835179234e-06, "loss": 0.3959, "step": 5518 }, { "epoch": 2.4649695877449878, "grad_norm": 0.6024555971231995, "learning_rate": 1.4106698194862623e-06, "loss": 0.4361, "step": 5519 }, { "epoch": 2.465420139671097, "grad_norm": 0.5925456857913678, "learning_rate": 1.408251572426491e-06, "loss": 0.4141, "step": 5520 }, { "epoch": 2.4658706915972064, "grad_norm": 0.5955314932003166, "learning_rate": 1.405835242878314e-06, "loss": 0.4153, "step": 5521 }, { "epoch": 2.466321243523316, "grad_norm": 0.5964827163758643, "learning_rate": 1.4034208313810116e-06, "loss": 0.3842, "step": 5522 }, { "epoch": 2.4667717954494255, "grad_norm": 0.5922605074790582, "learning_rate": 1.4010083384734308e-06, "loss": 0.3733, "step": 5523 }, { "epoch": 2.467222347375535, "grad_norm": 0.5916528810779093, "learning_rate": 1.3985977646939952e-06, "loss": 0.3811, "step": 5524 }, { "epoch": 2.4676728993016446, "grad_norm": 0.6044152881609854, "learning_rate": 1.3961891105806967e-06, "loss": 0.4084, "step": 5525 }, { "epoch": 2.468123451227754, "grad_norm": 0.5863207171285276, "learning_rate": 1.3937823766711012e-06, "loss": 0.3912, "step": 5526 }, { "epoch": 2.4685740031538637, "grad_norm": 0.5834668766721408, "learning_rate": 1.391377563502344e-06, "loss": 0.378, "step": 5527 }, { "epoch": 2.469024555079973, "grad_norm": 0.5913370625354439, "learning_rate": 1.3889746716111341e-06, "loss": 0.3796, "step": 5528 }, { "epoch": 2.4694751070060823, "grad_norm": 0.5930799112381909, "learning_rate": 1.38657370153375e-06, "loss": 0.4015, "step": 5529 }, { "epoch": 2.469925658932192, "grad_norm": 0.6203652562441766, "learning_rate": 1.384174653806044e-06, "loss": 0.3847, "step": 5530 }, { "epoch": 2.4703762108583014, "grad_norm": 0.5900712785563403, "learning_rate": 1.3817775289634283e-06, "loss": 0.3979, "step": 5531 }, { "epoch": 2.470826762784411, "grad_norm": 0.5986032167453681, "learning_rate": 1.3793823275409068e-06, "loss": 0.3879, "step": 5532 }, { "epoch": 2.4712773147105205, "grad_norm": 0.5851435313701108, "learning_rate": 1.376989050073031e-06, "loss": 0.3882, "step": 5533 }, { "epoch": 2.47172786663663, "grad_norm": 0.5699172058143327, "learning_rate": 1.3745976970939423e-06, "loss": 0.3851, "step": 5534 }, { "epoch": 2.4721784185627396, "grad_norm": 0.5619502668550284, "learning_rate": 1.3722082691373383e-06, "loss": 0.3723, "step": 5535 }, { "epoch": 2.4726289704888487, "grad_norm": 0.6010906642152426, "learning_rate": 1.3698207667364983e-06, "loss": 0.3994, "step": 5536 }, { "epoch": 2.473079522414958, "grad_norm": 0.5752679789756705, "learning_rate": 1.367435190424261e-06, "loss": 0.3926, "step": 5537 }, { "epoch": 2.4735300743410678, "grad_norm": 0.6060451455255224, "learning_rate": 1.3650515407330467e-06, "loss": 0.3917, "step": 5538 }, { "epoch": 2.4739806262671773, "grad_norm": 0.5773607798413803, "learning_rate": 1.3626698181948327e-06, "loss": 0.3763, "step": 5539 }, { "epoch": 2.474431178193287, "grad_norm": 0.590103025693219, "learning_rate": 1.360290023341182e-06, "loss": 0.3941, "step": 5540 }, { "epoch": 2.4748817301193964, "grad_norm": 0.5624828572329967, "learning_rate": 1.3579121567032116e-06, "loss": 0.4019, "step": 5541 }, { "epoch": 2.475332282045506, "grad_norm": 0.5742306232477754, "learning_rate": 1.3555362188116173e-06, "loss": 0.4114, "step": 5542 }, { "epoch": 2.475782833971615, "grad_norm": 0.5805878249719336, "learning_rate": 1.3531622101966634e-06, "loss": 0.3696, "step": 5543 }, { "epoch": 2.4762333858977246, "grad_norm": 0.6165280632616178, "learning_rate": 1.350790131388181e-06, "loss": 0.3812, "step": 5544 }, { "epoch": 2.476683937823834, "grad_norm": 0.5997353178856323, "learning_rate": 1.348419982915572e-06, "loss": 0.3793, "step": 5545 }, { "epoch": 2.4771344897499437, "grad_norm": 0.594580632712986, "learning_rate": 1.3460517653078098e-06, "loss": 0.3669, "step": 5546 }, { "epoch": 2.477585041676053, "grad_norm": 0.5887319892205387, "learning_rate": 1.343685479093433e-06, "loss": 0.4225, "step": 5547 }, { "epoch": 2.4780355936021627, "grad_norm": 0.5999004326534991, "learning_rate": 1.3413211248005498e-06, "loss": 0.4024, "step": 5548 }, { "epoch": 2.4784861455282723, "grad_norm": 0.5974154709374867, "learning_rate": 1.3389587029568407e-06, "loss": 0.3974, "step": 5549 }, { "epoch": 2.4789366974543814, "grad_norm": 0.5937173503832119, "learning_rate": 1.3365982140895506e-06, "loss": 0.3776, "step": 5550 }, { "epoch": 2.4789366974543814, "eval_loss": 0.661859929561615, "eval_runtime": 24.4071, "eval_samples_per_second": 11.431, "eval_steps_per_second": 0.492, "step": 5550 }, { "epoch": 2.479387249380491, "grad_norm": 0.5862902398977636, "learning_rate": 1.3342396587254957e-06, "loss": 0.4045, "step": 5551 }, { "epoch": 2.4798378013066005, "grad_norm": 0.5719241721693242, "learning_rate": 1.3318830373910595e-06, "loss": 0.4012, "step": 5552 }, { "epoch": 2.48028835323271, "grad_norm": 0.6044186523241057, "learning_rate": 1.3295283506121936e-06, "loss": 0.4002, "step": 5553 }, { "epoch": 2.4807389051588196, "grad_norm": 0.5840621786249947, "learning_rate": 1.3271755989144197e-06, "loss": 0.3897, "step": 5554 }, { "epoch": 2.481189457084929, "grad_norm": 0.5691204806492786, "learning_rate": 1.3248247828228244e-06, "loss": 0.403, "step": 5555 }, { "epoch": 2.4816400090110387, "grad_norm": 0.5655526568576471, "learning_rate": 1.3224759028620693e-06, "loss": 0.3602, "step": 5556 }, { "epoch": 2.482090560937148, "grad_norm": 0.5751633235411114, "learning_rate": 1.3201289595563693e-06, "loss": 0.3835, "step": 5557 }, { "epoch": 2.4825411128632577, "grad_norm": 0.5907818029065242, "learning_rate": 1.317783953429528e-06, "loss": 0.3995, "step": 5558 }, { "epoch": 2.482991664789367, "grad_norm": 0.6083105064941676, "learning_rate": 1.3154408850048939e-06, "loss": 0.4045, "step": 5559 }, { "epoch": 2.4834422167154764, "grad_norm": 0.5872110124896781, "learning_rate": 1.3130997548054048e-06, "loss": 0.4029, "step": 5560 }, { "epoch": 2.483892768641586, "grad_norm": 0.5923847332085956, "learning_rate": 1.3107605633535469e-06, "loss": 0.3915, "step": 5561 }, { "epoch": 2.4843433205676955, "grad_norm": 0.5818899008272822, "learning_rate": 1.3084233111713906e-06, "loss": 0.3805, "step": 5562 }, { "epoch": 2.484793872493805, "grad_norm": 0.567534974710301, "learning_rate": 1.3060879987805563e-06, "loss": 0.3949, "step": 5563 }, { "epoch": 2.4852444244199146, "grad_norm": 0.5788107647909755, "learning_rate": 1.3037546267022505e-06, "loss": 0.3916, "step": 5564 }, { "epoch": 2.485694976346024, "grad_norm": 0.5844134426138665, "learning_rate": 1.3014231954572287e-06, "loss": 0.37, "step": 5565 }, { "epoch": 2.486145528272133, "grad_norm": 0.6015345251110137, "learning_rate": 1.2990937055658205e-06, "loss": 0.389, "step": 5566 }, { "epoch": 2.4865960801982427, "grad_norm": 0.5739903053357597, "learning_rate": 1.2967661575479318e-06, "loss": 0.3981, "step": 5567 }, { "epoch": 2.4870466321243523, "grad_norm": 0.5803848081225148, "learning_rate": 1.2944405519230153e-06, "loss": 0.3921, "step": 5568 }, { "epoch": 2.487497184050462, "grad_norm": 0.5644458454314247, "learning_rate": 1.29211688921011e-06, "loss": 0.3654, "step": 5569 }, { "epoch": 2.4879477359765714, "grad_norm": 0.5844417282220502, "learning_rate": 1.2897951699278044e-06, "loss": 0.3915, "step": 5570 }, { "epoch": 2.488398287902681, "grad_norm": 0.5871460047328237, "learning_rate": 1.2874753945942697e-06, "loss": 0.3958, "step": 5571 }, { "epoch": 2.4888488398287905, "grad_norm": 0.5832762110957779, "learning_rate": 1.2851575637272262e-06, "loss": 0.3826, "step": 5572 }, { "epoch": 2.4892993917548996, "grad_norm": 0.5824895035760291, "learning_rate": 1.2828416778439778e-06, "loss": 0.4164, "step": 5573 }, { "epoch": 2.489749943681009, "grad_norm": 0.5781261307525236, "learning_rate": 1.2805277374613744e-06, "loss": 0.3847, "step": 5574 }, { "epoch": 2.4902004956071186, "grad_norm": 0.5784329848060592, "learning_rate": 1.2782157430958553e-06, "loss": 0.3907, "step": 5575 }, { "epoch": 2.490651047533228, "grad_norm": 0.5841309002015285, "learning_rate": 1.275905695263402e-06, "loss": 0.3909, "step": 5576 }, { "epoch": 2.4911015994593377, "grad_norm": 0.5888019966672132, "learning_rate": 1.2735975944795775e-06, "loss": 0.3903, "step": 5577 }, { "epoch": 2.4915521513854473, "grad_norm": 0.579973072318031, "learning_rate": 1.2712914412595046e-06, "loss": 0.4073, "step": 5578 }, { "epoch": 2.492002703311557, "grad_norm": 0.5556025967227182, "learning_rate": 1.26898723611787e-06, "loss": 0.3929, "step": 5579 }, { "epoch": 2.492453255237666, "grad_norm": 0.5894721968921263, "learning_rate": 1.2666849795689306e-06, "loss": 0.3901, "step": 5580 }, { "epoch": 2.4929038071637755, "grad_norm": 0.5737827837605306, "learning_rate": 1.2643846721265029e-06, "loss": 0.3508, "step": 5581 }, { "epoch": 2.493354359089885, "grad_norm": 0.5910177471130065, "learning_rate": 1.262086314303973e-06, "loss": 0.3799, "step": 5582 }, { "epoch": 2.4938049110159946, "grad_norm": 0.5993089011536701, "learning_rate": 1.2597899066142882e-06, "loss": 0.3946, "step": 5583 }, { "epoch": 2.494255462942104, "grad_norm": 0.6049683235793041, "learning_rate": 1.257495449569962e-06, "loss": 0.4095, "step": 5584 }, { "epoch": 2.4947060148682136, "grad_norm": 0.5951977016871213, "learning_rate": 1.2552029436830737e-06, "loss": 0.4196, "step": 5585 }, { "epoch": 2.495156566794323, "grad_norm": 0.5892731020647233, "learning_rate": 1.2529123894652661e-06, "loss": 0.3924, "step": 5586 }, { "epoch": 2.4956071187204323, "grad_norm": 0.5808388203833484, "learning_rate": 1.250623787427746e-06, "loss": 0.3859, "step": 5587 }, { "epoch": 2.496057670646542, "grad_norm": 0.6154455504577999, "learning_rate": 1.248337138081286e-06, "loss": 0.3894, "step": 5588 }, { "epoch": 2.4965082225726514, "grad_norm": 0.5887449912158892, "learning_rate": 1.2460524419362196e-06, "loss": 0.3893, "step": 5589 }, { "epoch": 2.496958774498761, "grad_norm": 0.577085494234772, "learning_rate": 1.2437696995024495e-06, "loss": 0.3947, "step": 5590 }, { "epoch": 2.4974093264248705, "grad_norm": 0.5752067400817411, "learning_rate": 1.241488911289439e-06, "loss": 0.3723, "step": 5591 }, { "epoch": 2.49785987835098, "grad_norm": 0.5850626920388585, "learning_rate": 1.2392100778062123e-06, "loss": 0.3924, "step": 5592 }, { "epoch": 2.4983104302770895, "grad_norm": 0.6206094841232304, "learning_rate": 1.2369331995613664e-06, "loss": 0.3939, "step": 5593 }, { "epoch": 2.498760982203199, "grad_norm": 0.5812302035125027, "learning_rate": 1.2346582770630489e-06, "loss": 0.3743, "step": 5594 }, { "epoch": 2.4992115341293086, "grad_norm": 0.5810855462379013, "learning_rate": 1.2323853108189876e-06, "loss": 0.3757, "step": 5595 }, { "epoch": 2.4996620860554177, "grad_norm": 0.5978245750999536, "learning_rate": 1.2301143013364559e-06, "loss": 0.397, "step": 5596 }, { "epoch": 2.5001126379815273, "grad_norm": 0.5813958240320761, "learning_rate": 1.227845249122307e-06, "loss": 0.4095, "step": 5597 }, { "epoch": 2.500563189907637, "grad_norm": 0.5943038904160559, "learning_rate": 1.2255781546829403e-06, "loss": 0.3741, "step": 5598 }, { "epoch": 2.5010137418337464, "grad_norm": 0.6000628991872758, "learning_rate": 1.223313018524338e-06, "loss": 0.3952, "step": 5599 }, { "epoch": 2.501464293759856, "grad_norm": 0.5877662556322794, "learning_rate": 1.2210498411520256e-06, "loss": 0.3977, "step": 5600 }, { "epoch": 2.5019148456859654, "grad_norm": 0.5838524994827851, "learning_rate": 1.2187886230711044e-06, "loss": 0.389, "step": 5601 }, { "epoch": 2.502365397612075, "grad_norm": 0.5838363799315419, "learning_rate": 1.216529364786233e-06, "loss": 0.4024, "step": 5602 }, { "epoch": 2.502815949538184, "grad_norm": 0.5914157984635217, "learning_rate": 1.214272066801635e-06, "loss": 0.3872, "step": 5603 }, { "epoch": 2.5032665014642936, "grad_norm": 0.5941942936559784, "learning_rate": 1.212016729621095e-06, "loss": 0.409, "step": 5604 }, { "epoch": 2.503717053390403, "grad_norm": 0.618041518425173, "learning_rate": 1.2097633537479604e-06, "loss": 0.395, "step": 5605 }, { "epoch": 2.5041676053165127, "grad_norm": 0.5937076300399328, "learning_rate": 1.207511939685142e-06, "loss": 0.4053, "step": 5606 }, { "epoch": 2.5046181572426223, "grad_norm": 0.5797469532835334, "learning_rate": 1.2052624879351105e-06, "loss": 0.3778, "step": 5607 }, { "epoch": 2.505068709168732, "grad_norm": 0.5822111216848167, "learning_rate": 1.2030149989999007e-06, "loss": 0.4064, "step": 5608 }, { "epoch": 2.5055192610948414, "grad_norm": 0.5915249508796011, "learning_rate": 1.2007694733811082e-06, "loss": 0.3801, "step": 5609 }, { "epoch": 2.5059698130209505, "grad_norm": 0.5761010194289137, "learning_rate": 1.198525911579891e-06, "loss": 0.3889, "step": 5610 }, { "epoch": 2.50642036494706, "grad_norm": 0.6148426484677341, "learning_rate": 1.1962843140969683e-06, "loss": 0.399, "step": 5611 }, { "epoch": 2.5068709168731695, "grad_norm": 0.5772325063700119, "learning_rate": 1.19404468143262e-06, "loss": 0.3635, "step": 5612 }, { "epoch": 2.507321468799279, "grad_norm": 0.5932373888551328, "learning_rate": 1.1918070140866921e-06, "loss": 0.3899, "step": 5613 }, { "epoch": 2.5077720207253886, "grad_norm": 0.5880912988815438, "learning_rate": 1.1895713125585851e-06, "loss": 0.4114, "step": 5614 }, { "epoch": 2.508222572651498, "grad_norm": 0.5761003270211055, "learning_rate": 1.1873375773472662e-06, "loss": 0.3937, "step": 5615 }, { "epoch": 2.5086731245776077, "grad_norm": 0.5885416488600254, "learning_rate": 1.1851058089512601e-06, "loss": 0.3943, "step": 5616 }, { "epoch": 2.509123676503717, "grad_norm": 0.5928153806216174, "learning_rate": 1.1828760078686563e-06, "loss": 0.3858, "step": 5617 }, { "epoch": 2.509574228429827, "grad_norm": 0.5857257857972705, "learning_rate": 1.1806481745971021e-06, "loss": 0.3864, "step": 5618 }, { "epoch": 2.510024780355936, "grad_norm": 0.5743203758474448, "learning_rate": 1.1784223096338065e-06, "loss": 0.3933, "step": 5619 }, { "epoch": 2.5104753322820454, "grad_norm": 0.5779220340376645, "learning_rate": 1.1761984134755388e-06, "loss": 0.3656, "step": 5620 }, { "epoch": 2.510925884208155, "grad_norm": 0.6023814655429514, "learning_rate": 1.1739764866186309e-06, "loss": 0.3943, "step": 5621 }, { "epoch": 2.5113764361342645, "grad_norm": 0.5584662513615177, "learning_rate": 1.1717565295589728e-06, "loss": 0.3733, "step": 5622 }, { "epoch": 2.511826988060374, "grad_norm": 0.5597446339377666, "learning_rate": 1.1695385427920159e-06, "loss": 0.4003, "step": 5623 }, { "epoch": 2.512277539986483, "grad_norm": 0.6099155094109941, "learning_rate": 1.167322526812772e-06, "loss": 0.4239, "step": 5624 }, { "epoch": 2.512728091912593, "grad_norm": 0.5740812011055565, "learning_rate": 1.1651084821158133e-06, "loss": 0.4244, "step": 5625 }, { "epoch": 2.5131786438387023, "grad_norm": 0.5748185816271687, "learning_rate": 1.1628964091952732e-06, "loss": 0.3921, "step": 5626 }, { "epoch": 2.513629195764812, "grad_norm": 0.5969131147254249, "learning_rate": 1.1606863085448383e-06, "loss": 0.4178, "step": 5627 }, { "epoch": 2.5140797476909214, "grad_norm": 0.617161356423394, "learning_rate": 1.158478180657766e-06, "loss": 0.4205, "step": 5628 }, { "epoch": 2.514530299617031, "grad_norm": 0.5748473625326167, "learning_rate": 1.1562720260268624e-06, "loss": 0.3958, "step": 5629 }, { "epoch": 2.5149808515431404, "grad_norm": 0.5939388939642825, "learning_rate": 1.1540678451445043e-06, "loss": 0.4207, "step": 5630 }, { "epoch": 2.51543140346925, "grad_norm": 0.5807646354827868, "learning_rate": 1.151865638502615e-06, "loss": 0.4175, "step": 5631 }, { "epoch": 2.5158819553953595, "grad_norm": 0.6133206360773944, "learning_rate": 1.1496654065926927e-06, "loss": 0.3956, "step": 5632 }, { "epoch": 2.5163325073214686, "grad_norm": 0.5928714239185302, "learning_rate": 1.1474671499057766e-06, "loss": 0.3776, "step": 5633 }, { "epoch": 2.516783059247578, "grad_norm": 0.591609943714373, "learning_rate": 1.1452708689324855e-06, "loss": 0.4101, "step": 5634 }, { "epoch": 2.5172336111736877, "grad_norm": 0.5825438608001552, "learning_rate": 1.143076564162977e-06, "loss": 0.4005, "step": 5635 }, { "epoch": 2.5176841630997973, "grad_norm": 0.5804667428969599, "learning_rate": 1.1408842360869864e-06, "loss": 0.3894, "step": 5636 }, { "epoch": 2.518134715025907, "grad_norm": 0.5897666851642761, "learning_rate": 1.1386938851937924e-06, "loss": 0.3888, "step": 5637 }, { "epoch": 2.5185852669520163, "grad_norm": 0.5836408618673894, "learning_rate": 1.1365055119722401e-06, "loss": 0.3795, "step": 5638 }, { "epoch": 2.519035818878126, "grad_norm": 0.5957207359545066, "learning_rate": 1.1343191169107336e-06, "loss": 0.3985, "step": 5639 }, { "epoch": 2.519486370804235, "grad_norm": 0.5831896080790371, "learning_rate": 1.1321347004972316e-06, "loss": 0.3601, "step": 5640 }, { "epoch": 2.5199369227303445, "grad_norm": 0.5926483191422952, "learning_rate": 1.129952263219256e-06, "loss": 0.3871, "step": 5641 }, { "epoch": 2.520387474656454, "grad_norm": 0.5899648673893843, "learning_rate": 1.127771805563882e-06, "loss": 0.4056, "step": 5642 }, { "epoch": 2.5208380265825636, "grad_norm": 0.6012843641512466, "learning_rate": 1.1255933280177479e-06, "loss": 0.388, "step": 5643 }, { "epoch": 2.521288578508673, "grad_norm": 0.5829901763821261, "learning_rate": 1.1234168310670457e-06, "loss": 0.3791, "step": 5644 }, { "epoch": 2.5217391304347827, "grad_norm": 0.5671101741277658, "learning_rate": 1.1212423151975283e-06, "loss": 0.4175, "step": 5645 }, { "epoch": 2.5221896823608922, "grad_norm": 0.5686250864658733, "learning_rate": 1.119069780894504e-06, "loss": 0.3896, "step": 5646 }, { "epoch": 2.5226402342870013, "grad_norm": 0.5950270493927846, "learning_rate": 1.1168992286428425e-06, "loss": 0.389, "step": 5647 }, { "epoch": 2.5230907862131113, "grad_norm": 0.5850255436191654, "learning_rate": 1.114730658926968e-06, "loss": 0.4057, "step": 5648 }, { "epoch": 2.5235413381392204, "grad_norm": 0.608687511580999, "learning_rate": 1.112564072230863e-06, "loss": 0.4216, "step": 5649 }, { "epoch": 2.52399189006533, "grad_norm": 0.5758582351787225, "learning_rate": 1.110399469038068e-06, "loss": 0.4013, "step": 5650 }, { "epoch": 2.5244424419914395, "grad_norm": 0.5838110744511038, "learning_rate": 1.1082368498316798e-06, "loss": 0.4, "step": 5651 }, { "epoch": 2.524892993917549, "grad_norm": 0.5812528086528661, "learning_rate": 1.106076215094355e-06, "loss": 0.3979, "step": 5652 }, { "epoch": 2.5253435458436586, "grad_norm": 0.5658383414165543, "learning_rate": 1.1039175653083012e-06, "loss": 0.3936, "step": 5653 }, { "epoch": 2.5257940977697677, "grad_norm": 0.58255674924095, "learning_rate": 1.101760900955292e-06, "loss": 0.3862, "step": 5654 }, { "epoch": 2.5262446496958777, "grad_norm": 0.5820904054162469, "learning_rate": 1.0996062225166481e-06, "loss": 0.3991, "step": 5655 }, { "epoch": 2.526695201621987, "grad_norm": 0.5602426965937334, "learning_rate": 1.0974535304732581e-06, "loss": 0.3841, "step": 5656 }, { "epoch": 2.5271457535480963, "grad_norm": 0.5774390169902429, "learning_rate": 1.0953028253055541e-06, "loss": 0.405, "step": 5657 }, { "epoch": 2.527596305474206, "grad_norm": 0.5699685630022762, "learning_rate": 1.0931541074935392e-06, "loss": 0.3982, "step": 5658 }, { "epoch": 2.5280468574003154, "grad_norm": 0.5803886781012121, "learning_rate": 1.0910073775167585e-06, "loss": 0.3835, "step": 5659 }, { "epoch": 2.528497409326425, "grad_norm": 0.5738875683600023, "learning_rate": 1.0888626358543265e-06, "loss": 0.3892, "step": 5660 }, { "epoch": 2.5289479612525345, "grad_norm": 0.5850715954701429, "learning_rate": 1.0867198829849035e-06, "loss": 0.4139, "step": 5661 }, { "epoch": 2.529398513178644, "grad_norm": 0.590484219902995, "learning_rate": 1.0845791193867117e-06, "loss": 0.357, "step": 5662 }, { "epoch": 2.529849065104753, "grad_norm": 0.5685996220843069, "learning_rate": 1.0824403455375287e-06, "loss": 0.3512, "step": 5663 }, { "epoch": 2.5302996170308627, "grad_norm": 0.5830099554067284, "learning_rate": 1.0803035619146873e-06, "loss": 0.4057, "step": 5664 }, { "epoch": 2.5307501689569722, "grad_norm": 0.5960733208749627, "learning_rate": 1.078168768995075e-06, "loss": 0.4104, "step": 5665 }, { "epoch": 2.531200720883082, "grad_norm": 0.5658738514106425, "learning_rate": 1.0760359672551357e-06, "loss": 0.3689, "step": 5666 }, { "epoch": 2.5316512728091913, "grad_norm": 0.5712163330450246, "learning_rate": 1.0739051571708736e-06, "loss": 0.4011, "step": 5667 }, { "epoch": 2.532101824735301, "grad_norm": 0.5871950463866437, "learning_rate": 1.0717763392178381e-06, "loss": 0.4187, "step": 5668 }, { "epoch": 2.5325523766614104, "grad_norm": 0.6108501045654584, "learning_rate": 1.0696495138711472e-06, "loss": 0.3945, "step": 5669 }, { "epoch": 2.5330029285875195, "grad_norm": 0.567105662285161, "learning_rate": 1.0675246816054585e-06, "loss": 0.377, "step": 5670 }, { "epoch": 2.533453480513629, "grad_norm": 0.5811737934592398, "learning_rate": 1.065401842895003e-06, "loss": 0.3943, "step": 5671 }, { "epoch": 2.5339040324397386, "grad_norm": 0.5851730389643821, "learning_rate": 1.0632809982135494e-06, "loss": 0.4357, "step": 5672 }, { "epoch": 2.534354584365848, "grad_norm": 0.5813522431378364, "learning_rate": 1.0611621480344315e-06, "loss": 0.3958, "step": 5673 }, { "epoch": 2.5348051362919577, "grad_norm": 0.5764919595386523, "learning_rate": 1.059045292830536e-06, "loss": 0.3965, "step": 5674 }, { "epoch": 2.5352556882180672, "grad_norm": 0.5801075732020656, "learning_rate": 1.0569304330743036e-06, "loss": 0.3854, "step": 5675 }, { "epoch": 2.535706240144177, "grad_norm": 0.5833405326939287, "learning_rate": 1.0548175692377305e-06, "loss": 0.4109, "step": 5676 }, { "epoch": 2.536156792070286, "grad_norm": 0.5857210016633321, "learning_rate": 1.0527067017923654e-06, "loss": 0.3589, "step": 5677 }, { "epoch": 2.5366073439963954, "grad_norm": 0.595878885415125, "learning_rate": 1.0505978312093156e-06, "loss": 0.4072, "step": 5678 }, { "epoch": 2.537057895922505, "grad_norm": 0.5916939383388944, "learning_rate": 1.0484909579592372e-06, "loss": 0.4072, "step": 5679 }, { "epoch": 2.5375084478486145, "grad_norm": 0.5911363707109053, "learning_rate": 1.0463860825123463e-06, "loss": 0.4018, "step": 5680 }, { "epoch": 2.537958999774724, "grad_norm": 0.5746557992275975, "learning_rate": 1.0442832053384077e-06, "loss": 0.3694, "step": 5681 }, { "epoch": 2.5384095517008336, "grad_norm": 0.5877770337238506, "learning_rate": 1.0421823269067443e-06, "loss": 0.4062, "step": 5682 }, { "epoch": 2.538860103626943, "grad_norm": 0.5864797160539604, "learning_rate": 1.040083447686231e-06, "loss": 0.4064, "step": 5683 }, { "epoch": 2.5393106555530522, "grad_norm": 0.6004381965520448, "learning_rate": 1.037986568145297e-06, "loss": 0.3472, "step": 5684 }, { "epoch": 2.5397612074791622, "grad_norm": 0.5961662762511883, "learning_rate": 1.0358916887519243e-06, "loss": 0.3885, "step": 5685 }, { "epoch": 2.5402117594052713, "grad_norm": 0.5749508329798622, "learning_rate": 1.0337988099736519e-06, "loss": 0.4048, "step": 5686 }, { "epoch": 2.540662311331381, "grad_norm": 0.5851527668692729, "learning_rate": 1.031707932277568e-06, "loss": 0.3902, "step": 5687 }, { "epoch": 2.5411128632574904, "grad_norm": 0.581571345421258, "learning_rate": 1.0296190561303132e-06, "loss": 0.432, "step": 5688 }, { "epoch": 2.5415634151836, "grad_norm": 0.5872360522393149, "learning_rate": 1.027532181998091e-06, "loss": 0.3813, "step": 5689 }, { "epoch": 2.5420139671097095, "grad_norm": 0.5938596762894112, "learning_rate": 1.0254473103466421e-06, "loss": 0.3879, "step": 5690 }, { "epoch": 2.5424645190358186, "grad_norm": 0.5961714744126105, "learning_rate": 1.0233644416412792e-06, "loss": 0.4118, "step": 5691 }, { "epoch": 2.5429150709619286, "grad_norm": 0.5852422263212432, "learning_rate": 1.0212835763468488e-06, "loss": 0.4036, "step": 5692 }, { "epoch": 2.5433656228880377, "grad_norm": 0.6004346438850074, "learning_rate": 1.0192047149277684e-06, "loss": 0.402, "step": 5693 }, { "epoch": 2.5438161748141472, "grad_norm": 0.5744711642441763, "learning_rate": 1.0171278578479925e-06, "loss": 0.377, "step": 5694 }, { "epoch": 2.544266726740257, "grad_norm": 0.5870715604319362, "learning_rate": 1.0150530055710405e-06, "loss": 0.4157, "step": 5695 }, { "epoch": 2.5447172786663663, "grad_norm": 0.5839665688022945, "learning_rate": 1.0129801585599763e-06, "loss": 0.3767, "step": 5696 }, { "epoch": 2.545167830592476, "grad_norm": 0.564879166588146, "learning_rate": 1.0109093172774187e-06, "loss": 0.3897, "step": 5697 }, { "epoch": 2.5456183825185854, "grad_norm": 0.5705098552141984, "learning_rate": 1.0088404821855414e-06, "loss": 0.3884, "step": 5698 }, { "epoch": 2.546068934444695, "grad_norm": 0.5848738010541824, "learning_rate": 1.0067736537460671e-06, "loss": 0.406, "step": 5699 }, { "epoch": 2.546519486370804, "grad_norm": 0.5617644958586584, "learning_rate": 1.0047088324202714e-06, "loss": 0.3733, "step": 5700 }, { "epoch": 2.5469700382969136, "grad_norm": 0.5934995239674327, "learning_rate": 1.0026460186689834e-06, "loss": 0.4186, "step": 5701 }, { "epoch": 2.547420590223023, "grad_norm": 0.5819334033711999, "learning_rate": 1.0005852129525817e-06, "loss": 0.3951, "step": 5702 }, { "epoch": 2.5478711421491327, "grad_norm": 0.6011490470548863, "learning_rate": 9.985264157310005e-07, "loss": 0.3781, "step": 5703 }, { "epoch": 2.5483216940752422, "grad_norm": 0.5712882405034241, "learning_rate": 9.964696274637208e-07, "loss": 0.3863, "step": 5704 }, { "epoch": 2.5487722460013518, "grad_norm": 0.5668827163539629, "learning_rate": 9.944148486097793e-07, "loss": 0.3662, "step": 5705 }, { "epoch": 2.5492227979274613, "grad_norm": 0.5489950087446045, "learning_rate": 9.923620796277632e-07, "loss": 0.3915, "step": 5706 }, { "epoch": 2.5496733498535704, "grad_norm": 0.5688740158891799, "learning_rate": 9.903113209758098e-07, "loss": 0.3796, "step": 5707 }, { "epoch": 2.55012390177968, "grad_norm": 0.5703303962034335, "learning_rate": 9.88262573111608e-07, "loss": 0.3734, "step": 5708 }, { "epoch": 2.5505744537057895, "grad_norm": 0.5687146629194789, "learning_rate": 9.862158364924002e-07, "loss": 0.3784, "step": 5709 }, { "epoch": 2.551025005631899, "grad_norm": 0.5863268719375909, "learning_rate": 9.841711115749776e-07, "loss": 0.407, "step": 5710 }, { "epoch": 2.5514755575580086, "grad_norm": 0.6005985363260441, "learning_rate": 9.821283988156838e-07, "loss": 0.3717, "step": 5711 }, { "epoch": 2.551926109484118, "grad_norm": 0.5819124044053258, "learning_rate": 9.800876986704111e-07, "loss": 0.3839, "step": 5712 }, { "epoch": 2.5523766614102277, "grad_norm": 0.574165142077513, "learning_rate": 9.780490115946074e-07, "loss": 0.3751, "step": 5713 }, { "epoch": 2.5528272133363368, "grad_norm": 0.5668306594366005, "learning_rate": 9.760123380432617e-07, "loss": 0.3788, "step": 5714 }, { "epoch": 2.5532777652624468, "grad_norm": 0.5982652362595691, "learning_rate": 9.739776784709254e-07, "loss": 0.3899, "step": 5715 }, { "epoch": 2.553728317188556, "grad_norm": 0.5993788083257539, "learning_rate": 9.719450333316949e-07, "loss": 0.3911, "step": 5716 }, { "epoch": 2.5541788691146654, "grad_norm": 0.5828118830605155, "learning_rate": 9.699144030792163e-07, "loss": 0.3951, "step": 5717 }, { "epoch": 2.554629421040775, "grad_norm": 0.5661055550966823, "learning_rate": 9.67885788166686e-07, "loss": 0.3936, "step": 5718 }, { "epoch": 2.5550799729668845, "grad_norm": 0.5670590555336162, "learning_rate": 9.658591890468515e-07, "loss": 0.3802, "step": 5719 }, { "epoch": 2.555530524892994, "grad_norm": 0.5910490000278636, "learning_rate": 9.638346061720116e-07, "loss": 0.3914, "step": 5720 }, { "epoch": 2.555981076819103, "grad_norm": 0.5872971310781404, "learning_rate": 9.618120399940145e-07, "loss": 0.3631, "step": 5721 }, { "epoch": 2.556431628745213, "grad_norm": 0.5657307871711782, "learning_rate": 9.59791490964257e-07, "loss": 0.3884, "step": 5722 }, { "epoch": 2.5568821806713222, "grad_norm": 0.5647418665193519, "learning_rate": 9.577729595336827e-07, "loss": 0.4018, "step": 5723 }, { "epoch": 2.5573327325974318, "grad_norm": 0.5810809382889265, "learning_rate": 9.557564461527956e-07, "loss": 0.403, "step": 5724 }, { "epoch": 2.5577832845235413, "grad_norm": 0.5971311661635128, "learning_rate": 9.537419512716362e-07, "loss": 0.4329, "step": 5725 }, { "epoch": 2.558233836449651, "grad_norm": 0.5658995769569307, "learning_rate": 9.517294753398066e-07, "loss": 0.4093, "step": 5726 }, { "epoch": 2.5586843883757604, "grad_norm": 0.5964157782432848, "learning_rate": 9.497190188064442e-07, "loss": 0.3965, "step": 5727 }, { "epoch": 2.55913494030187, "grad_norm": 0.5995497729196505, "learning_rate": 9.477105821202537e-07, "loss": 0.4191, "step": 5728 }, { "epoch": 2.5595854922279795, "grad_norm": 0.5818424947558125, "learning_rate": 9.457041657294696e-07, "loss": 0.4217, "step": 5729 }, { "epoch": 2.5600360441540886, "grad_norm": 0.577316984388629, "learning_rate": 9.43699770081894e-07, "loss": 0.3761, "step": 5730 }, { "epoch": 2.560486596080198, "grad_norm": 0.5991137425312897, "learning_rate": 9.41697395624862e-07, "loss": 0.4169, "step": 5731 }, { "epoch": 2.5609371480063077, "grad_norm": 0.5864416105856411, "learning_rate": 9.396970428052698e-07, "loss": 0.3932, "step": 5732 }, { "epoch": 2.561387699932417, "grad_norm": 0.5875049501515933, "learning_rate": 9.376987120695547e-07, "loss": 0.4163, "step": 5733 }, { "epoch": 2.5618382518585268, "grad_norm": 0.5804374866442421, "learning_rate": 9.357024038637052e-07, "loss": 0.3994, "step": 5734 }, { "epoch": 2.5622888037846363, "grad_norm": 0.6127489394086618, "learning_rate": 9.337081186332597e-07, "loss": 0.3971, "step": 5735 }, { "epoch": 2.562739355710746, "grad_norm": 0.5793781124084981, "learning_rate": 9.317158568233031e-07, "loss": 0.3802, "step": 5736 }, { "epoch": 2.563189907636855, "grad_norm": 0.5994811062234756, "learning_rate": 9.297256188784709e-07, "loss": 0.3969, "step": 5737 }, { "epoch": 2.5636404595629645, "grad_norm": 0.580375533878936, "learning_rate": 9.277374052429444e-07, "loss": 0.388, "step": 5738 }, { "epoch": 2.564091011489074, "grad_norm": 0.5956985792872095, "learning_rate": 9.257512163604543e-07, "loss": 0.4027, "step": 5739 }, { "epoch": 2.5645415634151836, "grad_norm": 0.5712767382162794, "learning_rate": 9.237670526742793e-07, "loss": 0.3946, "step": 5740 }, { "epoch": 2.564992115341293, "grad_norm": 0.5880957085731997, "learning_rate": 9.217849146272473e-07, "loss": 0.3907, "step": 5741 }, { "epoch": 2.5654426672674027, "grad_norm": 0.5994412588749588, "learning_rate": 9.198048026617323e-07, "loss": 0.4058, "step": 5742 }, { "epoch": 2.565893219193512, "grad_norm": 0.6181739548422688, "learning_rate": 9.178267172196575e-07, "loss": 0.3865, "step": 5743 }, { "epoch": 2.5663437711196213, "grad_norm": 0.5498991085789796, "learning_rate": 9.158506587424931e-07, "loss": 0.387, "step": 5744 }, { "epoch": 2.5667943230457313, "grad_norm": 0.5891554294372379, "learning_rate": 9.138766276712552e-07, "loss": 0.3797, "step": 5745 }, { "epoch": 2.5672448749718404, "grad_norm": 0.5856025814062163, "learning_rate": 9.119046244465124e-07, "loss": 0.4139, "step": 5746 }, { "epoch": 2.56769542689795, "grad_norm": 0.6005870169853988, "learning_rate": 9.09934649508375e-07, "loss": 0.3876, "step": 5747 }, { "epoch": 2.5681459788240595, "grad_norm": 0.570154803993629, "learning_rate": 9.079667032965067e-07, "loss": 0.3421, "step": 5748 }, { "epoch": 2.568596530750169, "grad_norm": 0.5705701345996793, "learning_rate": 9.060007862501074e-07, "loss": 0.4227, "step": 5749 }, { "epoch": 2.5690470826762786, "grad_norm": 0.5636259185581639, "learning_rate": 9.040368988079418e-07, "loss": 0.403, "step": 5750 }, { "epoch": 2.5694976346023877, "grad_norm": 0.588454304250057, "learning_rate": 9.020750414083012e-07, "loss": 0.3897, "step": 5751 }, { "epoch": 2.5699481865284977, "grad_norm": 0.5749654857277188, "learning_rate": 9.001152144890435e-07, "loss": 0.3804, "step": 5752 }, { "epoch": 2.5703987384546068, "grad_norm": 0.5670772468368372, "learning_rate": 8.981574184875563e-07, "loss": 0.4005, "step": 5753 }, { "epoch": 2.5708492903807163, "grad_norm": 0.5934944034401153, "learning_rate": 8.96201653840788e-07, "loss": 0.3774, "step": 5754 }, { "epoch": 2.571299842306826, "grad_norm": 0.600333075561414, "learning_rate": 8.942479209852217e-07, "loss": 0.3967, "step": 5755 }, { "epoch": 2.5717503942329354, "grad_norm": 0.5951397940989617, "learning_rate": 8.922962203568986e-07, "loss": 0.3917, "step": 5756 }, { "epoch": 2.572200946159045, "grad_norm": 0.5893676874474933, "learning_rate": 8.903465523913957e-07, "loss": 0.3987, "step": 5757 }, { "epoch": 2.5726514980851545, "grad_norm": 0.6279902314383187, "learning_rate": 8.883989175238428e-07, "loss": 0.4363, "step": 5758 }, { "epoch": 2.573102050011264, "grad_norm": 0.5797863563911384, "learning_rate": 8.864533161889133e-07, "loss": 0.3812, "step": 5759 }, { "epoch": 2.573552601937373, "grad_norm": 0.564387200846793, "learning_rate": 8.845097488208288e-07, "loss": 0.3827, "step": 5760 }, { "epoch": 2.5740031538634827, "grad_norm": 0.5791933644891388, "learning_rate": 8.825682158533555e-07, "loss": 0.3592, "step": 5761 }, { "epoch": 2.574453705789592, "grad_norm": 0.5714245575093825, "learning_rate": 8.806287177198058e-07, "loss": 0.3782, "step": 5762 }, { "epoch": 2.5749042577157017, "grad_norm": 0.564771059158164, "learning_rate": 8.78691254853039e-07, "loss": 0.3862, "step": 5763 }, { "epoch": 2.5753548096418113, "grad_norm": 0.5900232582652558, "learning_rate": 8.767558276854548e-07, "loss": 0.4309, "step": 5764 }, { "epoch": 2.575805361567921, "grad_norm": 0.5715176083152541, "learning_rate": 8.748224366490121e-07, "loss": 0.3737, "step": 5765 }, { "epoch": 2.5762559134940304, "grad_norm": 0.5644962028850644, "learning_rate": 8.728910821751957e-07, "loss": 0.3948, "step": 5766 }, { "epoch": 2.5767064654201395, "grad_norm": 0.5934471258802981, "learning_rate": 8.709617646950563e-07, "loss": 0.4365, "step": 5767 }, { "epoch": 2.577157017346249, "grad_norm": 0.5976547444408506, "learning_rate": 8.69034484639173e-07, "loss": 0.4031, "step": 5768 }, { "epoch": 2.5776075692723586, "grad_norm": 0.6204069609658833, "learning_rate": 8.671092424376793e-07, "loss": 0.3974, "step": 5769 }, { "epoch": 2.578058121198468, "grad_norm": 0.5648037474953326, "learning_rate": 8.651860385202527e-07, "loss": 0.4085, "step": 5770 }, { "epoch": 2.5785086731245777, "grad_norm": 0.5856124880215474, "learning_rate": 8.632648733161142e-07, "loss": 0.3984, "step": 5771 }, { "epoch": 2.578959225050687, "grad_norm": 0.5907918875555904, "learning_rate": 8.613457472540321e-07, "loss": 0.4035, "step": 5772 }, { "epoch": 2.5794097769767967, "grad_norm": 0.5923900637454081, "learning_rate": 8.594286607623159e-07, "loss": 0.4199, "step": 5773 }, { "epoch": 2.579860328902906, "grad_norm": 0.5935494788403547, "learning_rate": 8.575136142688234e-07, "loss": 0.3669, "step": 5774 }, { "epoch": 2.5803108808290154, "grad_norm": 0.5805740991628616, "learning_rate": 8.55600608200956e-07, "loss": 0.3689, "step": 5775 }, { "epoch": 2.580761432755125, "grad_norm": 0.6020618163210854, "learning_rate": 8.536896429856589e-07, "loss": 0.3843, "step": 5776 }, { "epoch": 2.5812119846812345, "grad_norm": 0.5875461732989049, "learning_rate": 8.517807190494232e-07, "loss": 0.3906, "step": 5777 }, { "epoch": 2.581662536607344, "grad_norm": 0.5884898171947802, "learning_rate": 8.498738368182824e-07, "loss": 0.3783, "step": 5778 }, { "epoch": 2.5821130885334536, "grad_norm": 0.5952503042562544, "learning_rate": 8.479689967178173e-07, "loss": 0.3735, "step": 5779 }, { "epoch": 2.582563640459563, "grad_norm": 0.5866171067587468, "learning_rate": 8.4606619917315e-07, "loss": 0.392, "step": 5780 }, { "epoch": 2.583014192385672, "grad_norm": 0.5858575566330602, "learning_rate": 8.441654446089498e-07, "loss": 0.3904, "step": 5781 }, { "epoch": 2.583464744311782, "grad_norm": 0.590629275505218, "learning_rate": 8.42266733449425e-07, "loss": 0.3842, "step": 5782 }, { "epoch": 2.5839152962378913, "grad_norm": 0.5620414778126656, "learning_rate": 8.403700661183356e-07, "loss": 0.3915, "step": 5783 }, { "epoch": 2.584365848164001, "grad_norm": 0.5690976997738074, "learning_rate": 8.384754430389752e-07, "loss": 0.3939, "step": 5784 }, { "epoch": 2.5848164000901104, "grad_norm": 0.591656643148139, "learning_rate": 8.365828646341933e-07, "loss": 0.4335, "step": 5785 }, { "epoch": 2.58526695201622, "grad_norm": 0.5830836298761619, "learning_rate": 8.3469233132637e-07, "loss": 0.3898, "step": 5786 }, { "epoch": 2.5857175039423295, "grad_norm": 0.5744320137462717, "learning_rate": 8.32803843537443e-07, "loss": 0.3959, "step": 5787 }, { "epoch": 2.5861680558684386, "grad_norm": 0.573169117195964, "learning_rate": 8.309174016888788e-07, "loss": 0.3903, "step": 5788 }, { "epoch": 2.5866186077945486, "grad_norm": 0.6045743112127676, "learning_rate": 8.290330062017015e-07, "loss": 0.4065, "step": 5789 }, { "epoch": 2.5870691597206577, "grad_norm": 0.5753956101762416, "learning_rate": 8.271506574964638e-07, "loss": 0.386, "step": 5790 }, { "epoch": 2.587519711646767, "grad_norm": 0.5596815781721666, "learning_rate": 8.252703559932785e-07, "loss": 0.3874, "step": 5791 }, { "epoch": 2.5879702635728767, "grad_norm": 0.5674961355995958, "learning_rate": 8.233921021117863e-07, "loss": 0.3831, "step": 5792 }, { "epoch": 2.5884208154989863, "grad_norm": 0.5892338404037349, "learning_rate": 8.215158962711778e-07, "loss": 0.4198, "step": 5793 }, { "epoch": 2.588871367425096, "grad_norm": 0.5783317226456004, "learning_rate": 8.196417388901867e-07, "loss": 0.4201, "step": 5794 }, { "epoch": 2.5893219193512054, "grad_norm": 0.5962210264240773, "learning_rate": 8.177696303870875e-07, "loss": 0.388, "step": 5795 }, { "epoch": 2.589772471277315, "grad_norm": 0.5764557714836407, "learning_rate": 8.158995711797002e-07, "loss": 0.3987, "step": 5796 }, { "epoch": 2.590223023203424, "grad_norm": 0.5713597392031344, "learning_rate": 8.140315616853856e-07, "loss": 0.3963, "step": 5797 }, { "epoch": 2.5906735751295336, "grad_norm": 0.5774565961780872, "learning_rate": 8.121656023210456e-07, "loss": 0.3593, "step": 5798 }, { "epoch": 2.591124127055643, "grad_norm": 0.582740470023478, "learning_rate": 8.103016935031272e-07, "loss": 0.3884, "step": 5799 }, { "epoch": 2.5915746789817526, "grad_norm": 0.5969932395160917, "learning_rate": 8.084398356476175e-07, "loss": 0.4145, "step": 5800 }, { "epoch": 2.592025230907862, "grad_norm": 0.5841734513557518, "learning_rate": 8.065800291700498e-07, "loss": 0.393, "step": 5801 }, { "epoch": 2.5924757828339717, "grad_norm": 0.5782444738758522, "learning_rate": 8.047222744854943e-07, "loss": 0.3893, "step": 5802 }, { "epoch": 2.5929263347600813, "grad_norm": 0.5852941650751277, "learning_rate": 8.02866572008566e-07, "loss": 0.3866, "step": 5803 }, { "epoch": 2.5933768866861904, "grad_norm": 0.5810069280054511, "learning_rate": 8.010129221534235e-07, "loss": 0.3755, "step": 5804 }, { "epoch": 2.5938274386123, "grad_norm": 0.5703631726362359, "learning_rate": 7.991613253337638e-07, "loss": 0.3853, "step": 5805 }, { "epoch": 2.5942779905384095, "grad_norm": 0.5890665133768355, "learning_rate": 7.973117819628274e-07, "loss": 0.3867, "step": 5806 }, { "epoch": 2.594728542464519, "grad_norm": 0.5870816541065779, "learning_rate": 7.954642924533995e-07, "loss": 0.4003, "step": 5807 }, { "epoch": 2.5951790943906285, "grad_norm": 0.5999296657313085, "learning_rate": 7.936188572177972e-07, "loss": 0.3969, "step": 5808 }, { "epoch": 2.595629646316738, "grad_norm": 0.5595779573352688, "learning_rate": 7.917754766678942e-07, "loss": 0.3613, "step": 5809 }, { "epoch": 2.5960801982428476, "grad_norm": 0.5832421634619636, "learning_rate": 7.899341512150894e-07, "loss": 0.4199, "step": 5810 }, { "epoch": 2.5965307501689567, "grad_norm": 0.5607741432067546, "learning_rate": 7.880948812703382e-07, "loss": 0.3488, "step": 5811 }, { "epoch": 2.5969813020950667, "grad_norm": 0.6243216484116315, "learning_rate": 7.862576672441235e-07, "loss": 0.3903, "step": 5812 }, { "epoch": 2.597431854021176, "grad_norm": 0.5884073679140378, "learning_rate": 7.844225095464819e-07, "loss": 0.4119, "step": 5813 }, { "epoch": 2.5978824059472854, "grad_norm": 0.6117804636378118, "learning_rate": 7.825894085869779e-07, "loss": 0.4103, "step": 5814 }, { "epoch": 2.598332957873395, "grad_norm": 0.5768489686620542, "learning_rate": 7.807583647747308e-07, "loss": 0.3896, "step": 5815 }, { "epoch": 2.5987835097995045, "grad_norm": 0.5944519328397488, "learning_rate": 7.789293785183904e-07, "loss": 0.3607, "step": 5816 }, { "epoch": 2.599234061725614, "grad_norm": 0.5708171610192916, "learning_rate": 7.771024502261526e-07, "loss": 0.3729, "step": 5817 }, { "epoch": 2.599684613651723, "grad_norm": 0.5857031949925802, "learning_rate": 7.752775803057533e-07, "loss": 0.3717, "step": 5818 }, { "epoch": 2.600135165577833, "grad_norm": 0.5856775094068384, "learning_rate": 7.734547691644623e-07, "loss": 0.3908, "step": 5819 }, { "epoch": 2.600585717503942, "grad_norm": 0.5903855342842791, "learning_rate": 7.716340172091042e-07, "loss": 0.4023, "step": 5820 }, { "epoch": 2.6010362694300517, "grad_norm": 0.6016941013334063, "learning_rate": 7.698153248460271e-07, "loss": 0.3899, "step": 5821 }, { "epoch": 2.6014868213561613, "grad_norm": 0.6129465099043451, "learning_rate": 7.679986924811356e-07, "loss": 0.4026, "step": 5822 }, { "epoch": 2.601937373282271, "grad_norm": 0.6117155522967312, "learning_rate": 7.661841205198594e-07, "loss": 0.3873, "step": 5823 }, { "epoch": 2.6023879252083804, "grad_norm": 0.5906988540316623, "learning_rate": 7.643716093671827e-07, "loss": 0.3914, "step": 5824 }, { "epoch": 2.60283847713449, "grad_norm": 0.5954530383087456, "learning_rate": 7.625611594276162e-07, "loss": 0.4054, "step": 5825 }, { "epoch": 2.6032890290605994, "grad_norm": 0.6046858661035753, "learning_rate": 7.607527711052242e-07, "loss": 0.3685, "step": 5826 }, { "epoch": 2.6037395809867085, "grad_norm": 0.6004588285103886, "learning_rate": 7.589464448035988e-07, "loss": 0.3915, "step": 5827 }, { "epoch": 2.604190132912818, "grad_norm": 0.603998987856232, "learning_rate": 7.571421809258783e-07, "loss": 0.38, "step": 5828 }, { "epoch": 2.6046406848389276, "grad_norm": 0.5712513374324757, "learning_rate": 7.553399798747396e-07, "loss": 0.3938, "step": 5829 }, { "epoch": 2.605091236765037, "grad_norm": 0.5710135997688718, "learning_rate": 7.535398420523987e-07, "loss": 0.4032, "step": 5830 }, { "epoch": 2.6055417886911467, "grad_norm": 0.5960352027169272, "learning_rate": 7.517417678606121e-07, "loss": 0.398, "step": 5831 }, { "epoch": 2.6059923406172563, "grad_norm": 0.5845969308255128, "learning_rate": 7.499457577006753e-07, "loss": 0.3634, "step": 5832 }, { "epoch": 2.606442892543366, "grad_norm": 0.5751009694465411, "learning_rate": 7.481518119734211e-07, "loss": 0.3793, "step": 5833 }, { "epoch": 2.606893444469475, "grad_norm": 0.5758945652244509, "learning_rate": 7.463599310792257e-07, "loss": 0.3592, "step": 5834 }, { "epoch": 2.6073439963955845, "grad_norm": 0.5795177489381427, "learning_rate": 7.445701154180008e-07, "loss": 0.3849, "step": 5835 }, { "epoch": 2.607794548321694, "grad_norm": 0.5753079644860941, "learning_rate": 7.427823653891986e-07, "loss": 0.3976, "step": 5836 }, { "epoch": 2.6082451002478035, "grad_norm": 0.580855763346887, "learning_rate": 7.409966813918101e-07, "loss": 0.3773, "step": 5837 }, { "epoch": 2.608695652173913, "grad_norm": 0.5772360136038455, "learning_rate": 7.392130638243667e-07, "loss": 0.3817, "step": 5838 }, { "epoch": 2.6091462041000226, "grad_norm": 0.5922480332193153, "learning_rate": 7.374315130849363e-07, "loss": 0.3786, "step": 5839 }, { "epoch": 2.609596756026132, "grad_norm": 0.572687744528067, "learning_rate": 7.356520295711256e-07, "loss": 0.3671, "step": 5840 }, { "epoch": 2.6100473079522413, "grad_norm": 0.5983231496777642, "learning_rate": 7.338746136800823e-07, "loss": 0.4195, "step": 5841 }, { "epoch": 2.6104978598783513, "grad_norm": 0.57801830208123, "learning_rate": 7.320992658084891e-07, "loss": 0.3794, "step": 5842 }, { "epoch": 2.6109484118044604, "grad_norm": 0.5897418229259797, "learning_rate": 7.303259863525724e-07, "loss": 0.4023, "step": 5843 }, { "epoch": 2.61139896373057, "grad_norm": 0.5940703194384049, "learning_rate": 7.285547757080924e-07, "loss": 0.3728, "step": 5844 }, { "epoch": 2.6118495156566794, "grad_norm": 0.6303078660016977, "learning_rate": 7.267856342703461e-07, "loss": 0.3932, "step": 5845 }, { "epoch": 2.612300067582789, "grad_norm": 0.5928900727029971, "learning_rate": 7.250185624341765e-07, "loss": 0.3914, "step": 5846 }, { "epoch": 2.6127506195088985, "grad_norm": 0.5883115112231749, "learning_rate": 7.232535605939539e-07, "loss": 0.3712, "step": 5847 }, { "epoch": 2.6132011714350076, "grad_norm": 0.5717551616109633, "learning_rate": 7.214906291435997e-07, "loss": 0.3959, "step": 5848 }, { "epoch": 2.6136517233611176, "grad_norm": 0.6053554281620455, "learning_rate": 7.197297684765592e-07, "loss": 0.4308, "step": 5849 }, { "epoch": 2.6141022752872267, "grad_norm": 0.5538658062597505, "learning_rate": 7.17970978985828e-07, "loss": 0.3746, "step": 5850 }, { "epoch": 2.6145528272133363, "grad_norm": 0.5804746976879779, "learning_rate": 7.162142610639278e-07, "loss": 0.396, "step": 5851 }, { "epoch": 2.615003379139446, "grad_norm": 0.5741171418892566, "learning_rate": 7.144596151029304e-07, "loss": 0.3961, "step": 5852 }, { "epoch": 2.6154539310655553, "grad_norm": 0.5701155785245238, "learning_rate": 7.127070414944337e-07, "loss": 0.3597, "step": 5853 }, { "epoch": 2.615904482991665, "grad_norm": 0.5674649071304855, "learning_rate": 7.109565406295804e-07, "loss": 0.4166, "step": 5854 }, { "epoch": 2.6163550349177744, "grad_norm": 0.5902729407654543, "learning_rate": 7.09208112899048e-07, "loss": 0.3979, "step": 5855 }, { "epoch": 2.616805586843884, "grad_norm": 0.5906765276397948, "learning_rate": 7.07461758693051e-07, "loss": 0.3839, "step": 5856 }, { "epoch": 2.617256138769993, "grad_norm": 0.5785268539449063, "learning_rate": 7.057174784013432e-07, "loss": 0.3733, "step": 5857 }, { "epoch": 2.6177066906961026, "grad_norm": 0.5798241855221149, "learning_rate": 7.03975272413212e-07, "loss": 0.3866, "step": 5858 }, { "epoch": 2.618157242622212, "grad_norm": 0.5900119229707281, "learning_rate": 7.022351411174866e-07, "loss": 0.3776, "step": 5859 }, { "epoch": 2.6186077945483217, "grad_norm": 0.5907625617577207, "learning_rate": 7.004970849025294e-07, "loss": 0.3814, "step": 5860 }, { "epoch": 2.6190583464744313, "grad_norm": 0.5712127406095424, "learning_rate": 6.987611041562392e-07, "loss": 0.3881, "step": 5861 }, { "epoch": 2.619508898400541, "grad_norm": 0.5750111731933449, "learning_rate": 6.97027199266056e-07, "loss": 0.3792, "step": 5862 }, { "epoch": 2.6199594503266503, "grad_norm": 0.6005244729413309, "learning_rate": 6.952953706189525e-07, "loss": 0.3887, "step": 5863 }, { "epoch": 2.6204100022527594, "grad_norm": 0.5743779188246216, "learning_rate": 6.935656186014383e-07, "loss": 0.386, "step": 5864 }, { "epoch": 2.620860554178869, "grad_norm": 0.5637342958762813, "learning_rate": 6.918379435995626e-07, "loss": 0.3763, "step": 5865 }, { "epoch": 2.6213111061049785, "grad_norm": 0.5941105696842218, "learning_rate": 6.901123459989068e-07, "loss": 0.3916, "step": 5866 }, { "epoch": 2.621761658031088, "grad_norm": 0.5803108963301288, "learning_rate": 6.883888261845917e-07, "loss": 0.3954, "step": 5867 }, { "epoch": 2.6222122099571976, "grad_norm": 0.5917141972904675, "learning_rate": 6.866673845412742e-07, "loss": 0.3946, "step": 5868 }, { "epoch": 2.622662761883307, "grad_norm": 0.5714901333512945, "learning_rate": 6.849480214531456e-07, "loss": 0.4079, "step": 5869 }, { "epoch": 2.6231133138094167, "grad_norm": 0.5779886567309949, "learning_rate": 6.832307373039349e-07, "loss": 0.3863, "step": 5870 }, { "epoch": 2.623563865735526, "grad_norm": 0.57270224827819, "learning_rate": 6.815155324769063e-07, "loss": 0.3764, "step": 5871 }, { "epoch": 2.6240144176616353, "grad_norm": 0.5871323019690278, "learning_rate": 6.798024073548615e-07, "loss": 0.395, "step": 5872 }, { "epoch": 2.624464969587745, "grad_norm": 0.5870512565047282, "learning_rate": 6.780913623201346e-07, "loss": 0.3998, "step": 5873 }, { "epoch": 2.6249155215138544, "grad_norm": 0.5542691972501073, "learning_rate": 6.763823977545991e-07, "loss": 0.3862, "step": 5874 }, { "epoch": 2.625366073439964, "grad_norm": 0.6198916675392787, "learning_rate": 6.746755140396633e-07, "loss": 0.398, "step": 5875 }, { "epoch": 2.6258166253660735, "grad_norm": 0.5694403337030316, "learning_rate": 6.729707115562689e-07, "loss": 0.3914, "step": 5876 }, { "epoch": 2.626267177292183, "grad_norm": 0.5403958987749942, "learning_rate": 6.712679906848962e-07, "loss": 0.3734, "step": 5877 }, { "epoch": 2.626717729218292, "grad_norm": 0.5712547745801378, "learning_rate": 6.695673518055578e-07, "loss": 0.3725, "step": 5878 }, { "epoch": 2.627168281144402, "grad_norm": 0.5643665202435667, "learning_rate": 6.678687952978069e-07, "loss": 0.3814, "step": 5879 }, { "epoch": 2.6276188330705113, "grad_norm": 0.5668840482520701, "learning_rate": 6.661723215407223e-07, "loss": 0.3802, "step": 5880 }, { "epoch": 2.628069384996621, "grad_norm": 0.5759818830168845, "learning_rate": 6.644779309129312e-07, "loss": 0.3835, "step": 5881 }, { "epoch": 2.6285199369227303, "grad_norm": 0.5977683999553529, "learning_rate": 6.627856237925812e-07, "loss": 0.3762, "step": 5882 }, { "epoch": 2.62897048884884, "grad_norm": 0.5758920148564358, "learning_rate": 6.6109540055737e-07, "loss": 0.3929, "step": 5883 }, { "epoch": 2.6294210407749494, "grad_norm": 0.5889822848576174, "learning_rate": 6.594072615845159e-07, "loss": 0.3833, "step": 5884 }, { "epoch": 2.6298715927010585, "grad_norm": 0.5942377147642254, "learning_rate": 6.577212072507844e-07, "loss": 0.3919, "step": 5885 }, { "epoch": 2.6303221446271685, "grad_norm": 0.5822305005511019, "learning_rate": 6.560372379324654e-07, "loss": 0.3682, "step": 5886 }, { "epoch": 2.6307726965532776, "grad_norm": 0.5761355924600382, "learning_rate": 6.543553540053926e-07, "loss": 0.433, "step": 5887 }, { "epoch": 2.631223248479387, "grad_norm": 0.5824967579076281, "learning_rate": 6.52675555844926e-07, "loss": 0.3655, "step": 5888 }, { "epoch": 2.6316738004054967, "grad_norm": 0.5679753769146296, "learning_rate": 6.509978438259657e-07, "loss": 0.3716, "step": 5889 }, { "epoch": 2.6321243523316062, "grad_norm": 0.6017345249718921, "learning_rate": 6.493222183229442e-07, "loss": 0.3891, "step": 5890 }, { "epoch": 2.632574904257716, "grad_norm": 0.5744406515761337, "learning_rate": 6.476486797098291e-07, "loss": 0.4181, "step": 5891 }, { "epoch": 2.6330254561838253, "grad_norm": 0.5767114506864542, "learning_rate": 6.459772283601218e-07, "loss": 0.3846, "step": 5892 }, { "epoch": 2.633476008109935, "grad_norm": 0.5770604285709338, "learning_rate": 6.44307864646857e-07, "loss": 0.4165, "step": 5893 }, { "epoch": 2.633926560036044, "grad_norm": 0.583240442801985, "learning_rate": 6.426405889426046e-07, "loss": 0.4002, "step": 5894 }, { "epoch": 2.6343771119621535, "grad_norm": 0.5594265525305427, "learning_rate": 6.40975401619468e-07, "loss": 0.3818, "step": 5895 }, { "epoch": 2.634827663888263, "grad_norm": 0.591070194608332, "learning_rate": 6.393123030490856e-07, "loss": 0.4157, "step": 5896 }, { "epoch": 2.6352782158143726, "grad_norm": 0.5663939769369029, "learning_rate": 6.37651293602628e-07, "loss": 0.3704, "step": 5897 }, { "epoch": 2.635728767740482, "grad_norm": 0.5588421137839317, "learning_rate": 6.359923736508011e-07, "loss": 0.3832, "step": 5898 }, { "epoch": 2.6361793196665917, "grad_norm": 0.5692636892681082, "learning_rate": 6.343355435638421e-07, "loss": 0.3569, "step": 5899 }, { "epoch": 2.6366298715927012, "grad_norm": 0.5624916077378953, "learning_rate": 6.326808037115251e-07, "loss": 0.377, "step": 5900 }, { "epoch": 2.6370804235188103, "grad_norm": 0.5908526746058621, "learning_rate": 6.310281544631547e-07, "loss": 0.3931, "step": 5901 }, { "epoch": 2.63753097544492, "grad_norm": 0.5890309591182562, "learning_rate": 6.293775961875704e-07, "loss": 0.3693, "step": 5902 }, { "epoch": 2.6379815273710294, "grad_norm": 0.5911875087025271, "learning_rate": 6.277291292531462e-07, "loss": 0.381, "step": 5903 }, { "epoch": 2.638432079297139, "grad_norm": 0.5890192505440536, "learning_rate": 6.260827540277847e-07, "loss": 0.3751, "step": 5904 }, { "epoch": 2.6388826312232485, "grad_norm": 0.5700297626280949, "learning_rate": 6.244384708789286e-07, "loss": 0.378, "step": 5905 }, { "epoch": 2.639333183149358, "grad_norm": 0.6005109495143767, "learning_rate": 6.227962801735465e-07, "loss": 0.3749, "step": 5906 }, { "epoch": 2.6397837350754676, "grad_norm": 0.5800557322486636, "learning_rate": 6.211561822781476e-07, "loss": 0.377, "step": 5907 }, { "epoch": 2.6402342870015767, "grad_norm": 0.589654052787086, "learning_rate": 6.195181775587655e-07, "loss": 0.3731, "step": 5908 }, { "epoch": 2.6406848389276867, "grad_norm": 0.5951291097115473, "learning_rate": 6.178822663809758e-07, "loss": 0.3996, "step": 5909 }, { "epoch": 2.641135390853796, "grad_norm": 0.5948889281419961, "learning_rate": 6.162484491098764e-07, "loss": 0.3956, "step": 5910 }, { "epoch": 2.6415859427799053, "grad_norm": 0.5814210754176355, "learning_rate": 6.1461672611011e-07, "loss": 0.36, "step": 5911 }, { "epoch": 2.642036494706015, "grad_norm": 0.5870168878221071, "learning_rate": 6.129870977458385e-07, "loss": 0.4144, "step": 5912 }, { "epoch": 2.6424870466321244, "grad_norm": 0.5790817131681004, "learning_rate": 6.113595643807702e-07, "loss": 0.3751, "step": 5913 }, { "epoch": 2.642937598558234, "grad_norm": 0.548219238890773, "learning_rate": 6.097341263781365e-07, "loss": 0.4012, "step": 5914 }, { "epoch": 2.643388150484343, "grad_norm": 0.5763759201728993, "learning_rate": 6.081107841007006e-07, "loss": 0.3836, "step": 5915 }, { "epoch": 2.643838702410453, "grad_norm": 0.575655280644293, "learning_rate": 6.06489537910766e-07, "loss": 0.3845, "step": 5916 }, { "epoch": 2.644289254336562, "grad_norm": 0.5801657572852659, "learning_rate": 6.048703881701579e-07, "loss": 0.4231, "step": 5917 }, { "epoch": 2.6447398062626717, "grad_norm": 0.587095886319562, "learning_rate": 6.032533352402447e-07, "loss": 0.3918, "step": 5918 }, { "epoch": 2.6451903581887812, "grad_norm": 0.5802741866511371, "learning_rate": 6.016383794819169e-07, "loss": 0.421, "step": 5919 }, { "epoch": 2.6456409101148908, "grad_norm": 0.5699062941814799, "learning_rate": 6.00025521255605e-07, "loss": 0.4212, "step": 5920 }, { "epoch": 2.6460914620410003, "grad_norm": 0.5640194260194055, "learning_rate": 5.984147609212631e-07, "loss": 0.3701, "step": 5921 }, { "epoch": 2.64654201396711, "grad_norm": 0.5865578809524183, "learning_rate": 5.968060988383884e-07, "loss": 0.3914, "step": 5922 }, { "epoch": 2.6469925658932194, "grad_norm": 0.5822123272327947, "learning_rate": 5.951995353659956e-07, "loss": 0.3969, "step": 5923 }, { "epoch": 2.6474431178193285, "grad_norm": 0.576889256292293, "learning_rate": 5.935950708626437e-07, "loss": 0.3746, "step": 5924 }, { "epoch": 2.647893669745438, "grad_norm": 0.5769169784182953, "learning_rate": 5.91992705686415e-07, "loss": 0.3778, "step": 5925 }, { "epoch": 2.6483442216715476, "grad_norm": 0.5567202883535984, "learning_rate": 5.903924401949279e-07, "loss": 0.3378, "step": 5926 }, { "epoch": 2.648794773597657, "grad_norm": 0.5745548755864687, "learning_rate": 5.887942747453301e-07, "loss": 0.3601, "step": 5927 }, { "epoch": 2.6492453255237667, "grad_norm": 0.6026461209464704, "learning_rate": 5.871982096943018e-07, "loss": 0.3925, "step": 5928 }, { "epoch": 2.6496958774498762, "grad_norm": 0.5851223390214135, "learning_rate": 5.856042453980526e-07, "loss": 0.3831, "step": 5929 }, { "epoch": 2.6501464293759858, "grad_norm": 0.5840438667671544, "learning_rate": 5.840123822123256e-07, "loss": 0.4091, "step": 5930 }, { "epoch": 2.650596981302095, "grad_norm": 0.5905880965481825, "learning_rate": 5.824226204923933e-07, "loss": 0.4023, "step": 5931 }, { "epoch": 2.6510475332282044, "grad_norm": 0.5696769238421964, "learning_rate": 5.808349605930586e-07, "loss": 0.3719, "step": 5932 }, { "epoch": 2.651498085154314, "grad_norm": 0.5651562802153055, "learning_rate": 5.79249402868659e-07, "loss": 0.385, "step": 5933 }, { "epoch": 2.6519486370804235, "grad_norm": 0.5692039970292084, "learning_rate": 5.776659476730573e-07, "loss": 0.4067, "step": 5934 }, { "epoch": 2.652399189006533, "grad_norm": 0.5769130904682639, "learning_rate": 5.760845953596527e-07, "loss": 0.3799, "step": 5935 }, { "epoch": 2.6528497409326426, "grad_norm": 0.5589980734233455, "learning_rate": 5.745053462813699e-07, "loss": 0.3994, "step": 5936 }, { "epoch": 2.653300292858752, "grad_norm": 0.5689705511072701, "learning_rate": 5.729282007906678e-07, "loss": 0.3855, "step": 5937 }, { "epoch": 2.6537508447848612, "grad_norm": 0.5925043157693931, "learning_rate": 5.71353159239535e-07, "loss": 0.3683, "step": 5938 }, { "epoch": 2.654201396710971, "grad_norm": 0.5870368131468483, "learning_rate": 5.697802219794901e-07, "loss": 0.3758, "step": 5939 }, { "epoch": 2.6546519486370803, "grad_norm": 0.5783642830203052, "learning_rate": 5.682093893615836e-07, "loss": 0.3769, "step": 5940 }, { "epoch": 2.65510250056319, "grad_norm": 0.5585253927255728, "learning_rate": 5.666406617363895e-07, "loss": 0.3692, "step": 5941 }, { "epoch": 2.6555530524892994, "grad_norm": 0.572550742211982, "learning_rate": 5.650740394540255e-07, "loss": 0.3987, "step": 5942 }, { "epoch": 2.656003604415409, "grad_norm": 0.5829065775677741, "learning_rate": 5.63509522864123e-07, "loss": 0.4041, "step": 5943 }, { "epoch": 2.6564541563415185, "grad_norm": 0.5910082815036644, "learning_rate": 5.619471123158581e-07, "loss": 0.4122, "step": 5944 }, { "epoch": 2.6569047082676276, "grad_norm": 0.5884471206718581, "learning_rate": 5.603868081579256e-07, "loss": 0.3831, "step": 5945 }, { "epoch": 2.6573552601937376, "grad_norm": 0.5806887684747466, "learning_rate": 5.58828610738561e-07, "loss": 0.3956, "step": 5946 }, { "epoch": 2.6578058121198467, "grad_norm": 0.5974587087792403, "learning_rate": 5.572725204055173e-07, "loss": 0.3895, "step": 5947 }, { "epoch": 2.658256364045956, "grad_norm": 0.5550168471397302, "learning_rate": 5.557185375060892e-07, "loss": 0.3884, "step": 5948 }, { "epoch": 2.6587069159720658, "grad_norm": 0.5640202832514438, "learning_rate": 5.541666623870923e-07, "loss": 0.4167, "step": 5949 }, { "epoch": 2.6591574678981753, "grad_norm": 0.5756561415947148, "learning_rate": 5.526168953948752e-07, "loss": 0.4255, "step": 5950 }, { "epoch": 2.659608019824285, "grad_norm": 0.581523148040471, "learning_rate": 5.510692368753168e-07, "loss": 0.3855, "step": 5951 }, { "epoch": 2.6600585717503944, "grad_norm": 0.5629826889503746, "learning_rate": 5.495236871738241e-07, "loss": 0.3715, "step": 5952 }, { "epoch": 2.660509123676504, "grad_norm": 0.5693092246818361, "learning_rate": 5.479802466353335e-07, "loss": 0.3862, "step": 5953 }, { "epoch": 2.660959675602613, "grad_norm": 0.5820574861042274, "learning_rate": 5.464389156043115e-07, "loss": 0.4217, "step": 5954 }, { "epoch": 2.6614102275287226, "grad_norm": 0.59087184825337, "learning_rate": 5.448996944247531e-07, "loss": 0.3985, "step": 5955 }, { "epoch": 2.661860779454832, "grad_norm": 0.5837106730190418, "learning_rate": 5.433625834401834e-07, "loss": 0.4013, "step": 5956 }, { "epoch": 2.6623113313809417, "grad_norm": 0.5625563411851541, "learning_rate": 5.418275829936537e-07, "loss": 0.3887, "step": 5957 }, { "epoch": 2.662761883307051, "grad_norm": 0.5782959044407244, "learning_rate": 5.402946934277486e-07, "loss": 0.377, "step": 5958 }, { "epoch": 2.6632124352331608, "grad_norm": 0.5785032826693187, "learning_rate": 5.38763915084578e-07, "loss": 0.4167, "step": 5959 }, { "epoch": 2.6636629871592703, "grad_norm": 0.5847486038996949, "learning_rate": 5.372352483057818e-07, "loss": 0.3892, "step": 5960 }, { "epoch": 2.6641135390853794, "grad_norm": 0.5762591511903098, "learning_rate": 5.357086934325295e-07, "loss": 0.4251, "step": 5961 }, { "epoch": 2.664564091011489, "grad_norm": 0.6037606720119402, "learning_rate": 5.341842508055184e-07, "loss": 0.4148, "step": 5962 }, { "epoch": 2.6650146429375985, "grad_norm": 0.575028198056552, "learning_rate": 5.326619207649741e-07, "loss": 0.3856, "step": 5963 }, { "epoch": 2.665465194863708, "grad_norm": 0.5934528314631912, "learning_rate": 5.311417036506516e-07, "loss": 0.3674, "step": 5964 }, { "epoch": 2.6659157467898176, "grad_norm": 0.5737829637950502, "learning_rate": 5.296235998018339e-07, "loss": 0.3676, "step": 5965 }, { "epoch": 2.666366298715927, "grad_norm": 0.589573273515846, "learning_rate": 5.281076095573312e-07, "loss": 0.3782, "step": 5966 }, { "epoch": 2.6668168506420367, "grad_norm": 0.5731722119010964, "learning_rate": 5.265937332554849e-07, "loss": 0.3774, "step": 5967 }, { "epoch": 2.6672674025681458, "grad_norm": 0.572096848489714, "learning_rate": 5.250819712341626e-07, "loss": 0.4017, "step": 5968 }, { "epoch": 2.6677179544942553, "grad_norm": 0.5910614568413762, "learning_rate": 5.235723238307588e-07, "loss": 0.3733, "step": 5969 }, { "epoch": 2.668168506420365, "grad_norm": 0.5916894306674048, "learning_rate": 5.220647913821975e-07, "loss": 0.3907, "step": 5970 }, { "epoch": 2.6686190583464744, "grad_norm": 0.585910053207616, "learning_rate": 5.205593742249326e-07, "loss": 0.4135, "step": 5971 }, { "epoch": 2.669069610272584, "grad_norm": 0.5725986078016554, "learning_rate": 5.190560726949435e-07, "loss": 0.3705, "step": 5972 }, { "epoch": 2.6695201621986935, "grad_norm": 0.5962527933501358, "learning_rate": 5.175548871277358e-07, "loss": 0.4078, "step": 5973 }, { "epoch": 2.669970714124803, "grad_norm": 0.5728515915622074, "learning_rate": 5.160558178583486e-07, "loss": 0.3781, "step": 5974 }, { "epoch": 2.670421266050912, "grad_norm": 0.5870826047399089, "learning_rate": 5.145588652213429e-07, "loss": 0.3895, "step": 5975 }, { "epoch": 2.670871817977022, "grad_norm": 0.5836966831602028, "learning_rate": 5.130640295508072e-07, "loss": 0.3945, "step": 5976 }, { "epoch": 2.671322369903131, "grad_norm": 0.5849498116509326, "learning_rate": 5.115713111803655e-07, "loss": 0.4078, "step": 5977 }, { "epoch": 2.6717729218292408, "grad_norm": 0.5760957855058968, "learning_rate": 5.100807104431571e-07, "loss": 0.3866, "step": 5978 }, { "epoch": 2.6722234737553503, "grad_norm": 0.5956197899056926, "learning_rate": 5.085922276718613e-07, "loss": 0.3971, "step": 5979 }, { "epoch": 2.67267402568146, "grad_norm": 0.5686245908515467, "learning_rate": 5.071058631986736e-07, "loss": 0.3788, "step": 5980 }, { "epoch": 2.6731245776075694, "grad_norm": 0.5868924203158585, "learning_rate": 5.056216173553263e-07, "loss": 0.4053, "step": 5981 }, { "epoch": 2.6735751295336785, "grad_norm": 0.5840487199369339, "learning_rate": 5.04139490473069e-07, "loss": 0.406, "step": 5982 }, { "epoch": 2.6740256814597885, "grad_norm": 0.5898953603895956, "learning_rate": 5.026594828826892e-07, "loss": 0.4063, "step": 5983 }, { "epoch": 2.6744762333858976, "grad_norm": 0.5666680755893786, "learning_rate": 5.011815949144916e-07, "loss": 0.4, "step": 5984 }, { "epoch": 2.674926785312007, "grad_norm": 0.6211631687879937, "learning_rate": 4.997058268983135e-07, "loss": 0.3883, "step": 5985 }, { "epoch": 2.6753773372381167, "grad_norm": 0.5753681603977565, "learning_rate": 4.982321791635192e-07, "loss": 0.4182, "step": 5986 }, { "epoch": 2.675827889164226, "grad_norm": 0.5761120759159754, "learning_rate": 4.967606520389956e-07, "loss": 0.3588, "step": 5987 }, { "epoch": 2.6762784410903357, "grad_norm": 0.5774937217718846, "learning_rate": 4.952912458531612e-07, "loss": 0.3905, "step": 5988 }, { "epoch": 2.6767289930164453, "grad_norm": 0.5837335954330852, "learning_rate": 4.938239609339579e-07, "loss": 0.4169, "step": 5989 }, { "epoch": 2.677179544942555, "grad_norm": 0.5680175654655665, "learning_rate": 4.923587976088561e-07, "loss": 0.3813, "step": 5990 }, { "epoch": 2.677630096868664, "grad_norm": 0.5961398804434045, "learning_rate": 4.908957562048511e-07, "loss": 0.3906, "step": 5991 }, { "epoch": 2.6780806487947735, "grad_norm": 0.5897422357735056, "learning_rate": 4.894348370484648e-07, "loss": 0.364, "step": 5992 }, { "epoch": 2.678531200720883, "grad_norm": 0.5895439491394704, "learning_rate": 4.879760404657475e-07, "loss": 0.3787, "step": 5993 }, { "epoch": 2.6789817526469926, "grad_norm": 0.5722578095619132, "learning_rate": 4.865193667822732e-07, "loss": 0.381, "step": 5994 }, { "epoch": 2.679432304573102, "grad_norm": 0.557464005571987, "learning_rate": 4.85064816323143e-07, "loss": 0.3735, "step": 5995 }, { "epoch": 2.6798828564992117, "grad_norm": 0.5721118934040171, "learning_rate": 4.836123894129852e-07, "loss": 0.3845, "step": 5996 }, { "epoch": 2.680333408425321, "grad_norm": 0.5856001917620386, "learning_rate": 4.821620863759535e-07, "loss": 0.4053, "step": 5997 }, { "epoch": 2.6807839603514303, "grad_norm": 0.5862972989343515, "learning_rate": 4.80713907535727e-07, "loss": 0.3948, "step": 5998 }, { "epoch": 2.68123451227754, "grad_norm": 0.5842502478057429, "learning_rate": 4.792678532155115e-07, "loss": 0.3888, "step": 5999 }, { "epoch": 2.6816850642036494, "grad_norm": 0.5771436476222339, "learning_rate": 4.778239237380355e-07, "loss": 0.405, "step": 6000 }, { "epoch": 2.682135616129759, "grad_norm": 0.5856640945937727, "learning_rate": 4.7638211942556136e-07, "loss": 0.3742, "step": 6001 }, { "epoch": 2.6825861680558685, "grad_norm": 0.5701150104663573, "learning_rate": 4.7494244059986593e-07, "loss": 0.384, "step": 6002 }, { "epoch": 2.683036719981978, "grad_norm": 0.582485030130425, "learning_rate": 4.735048875822634e-07, "loss": 0.3927, "step": 6003 }, { "epoch": 2.6834872719080876, "grad_norm": 0.5791303899058394, "learning_rate": 4.720694606935816e-07, "loss": 0.3807, "step": 6004 }, { "epoch": 2.6839378238341967, "grad_norm": 0.5857968628281329, "learning_rate": 4.7063616025418647e-07, "loss": 0.4072, "step": 6005 }, { "epoch": 2.6843883757603066, "grad_norm": 0.5864735428595176, "learning_rate": 4.692049865839565e-07, "loss": 0.41, "step": 6006 }, { "epoch": 2.6848389276864157, "grad_norm": 0.5714819442795307, "learning_rate": 4.6777594000230855e-07, "loss": 0.409, "step": 6007 }, { "epoch": 2.6852894796125253, "grad_norm": 0.571219633910986, "learning_rate": 4.6634902082817067e-07, "loss": 0.3929, "step": 6008 }, { "epoch": 2.685740031538635, "grad_norm": 0.5755203415259693, "learning_rate": 4.649242293800105e-07, "loss": 0.4111, "step": 6009 }, { "epoch": 2.6861905834647444, "grad_norm": 0.580351092116235, "learning_rate": 4.6350156597581017e-07, "loss": 0.3828, "step": 6010 }, { "epoch": 2.686641135390854, "grad_norm": 0.5602373396823617, "learning_rate": 4.620810309330803e-07, "loss": 0.37, "step": 6011 }, { "epoch": 2.687091687316963, "grad_norm": 0.5993105732575186, "learning_rate": 4.6066262456885726e-07, "loss": 0.4256, "step": 6012 }, { "epoch": 2.687542239243073, "grad_norm": 0.5645939206750078, "learning_rate": 4.592463471997022e-07, "loss": 0.3762, "step": 6013 }, { "epoch": 2.687992791169182, "grad_norm": 0.5768115549019112, "learning_rate": 4.578321991417023e-07, "loss": 0.3704, "step": 6014 }, { "epoch": 2.6884433430952916, "grad_norm": 0.5947298276399031, "learning_rate": 4.56420180710464e-07, "loss": 0.3791, "step": 6015 }, { "epoch": 2.688893895021401, "grad_norm": 0.5893214788175107, "learning_rate": 4.550102922211275e-07, "loss": 0.3951, "step": 6016 }, { "epoch": 2.6893444469475107, "grad_norm": 0.5715519265316266, "learning_rate": 4.5360253398834765e-07, "loss": 0.417, "step": 6017 }, { "epoch": 2.6897949988736203, "grad_norm": 0.5656506534108965, "learning_rate": 4.521969063263132e-07, "loss": 0.3836, "step": 6018 }, { "epoch": 2.69024555079973, "grad_norm": 0.6101296456962414, "learning_rate": 4.5079340954872987e-07, "loss": 0.3969, "step": 6019 }, { "epoch": 2.6906961027258394, "grad_norm": 0.5908339982234674, "learning_rate": 4.493920439688315e-07, "loss": 0.3952, "step": 6020 }, { "epoch": 2.6911466546519485, "grad_norm": 0.57545105420287, "learning_rate": 4.479928098993758e-07, "loss": 0.3898, "step": 6021 }, { "epoch": 2.691597206578058, "grad_norm": 0.5840301862629823, "learning_rate": 4.4659570765264506e-07, "loss": 0.4132, "step": 6022 }, { "epoch": 2.6920477585041676, "grad_norm": 0.5847121966382672, "learning_rate": 4.4520073754044547e-07, "loss": 0.3654, "step": 6023 }, { "epoch": 2.692498310430277, "grad_norm": 0.5604435546437575, "learning_rate": 4.43807899874108e-07, "loss": 0.3667, "step": 6024 }, { "epoch": 2.6929488623563866, "grad_norm": 0.5772489121105939, "learning_rate": 4.424171949644851e-07, "loss": 0.3682, "step": 6025 }, { "epoch": 2.693399414282496, "grad_norm": 0.5906130833492015, "learning_rate": 4.410286231219563e-07, "loss": 0.3901, "step": 6026 }, { "epoch": 2.6938499662086057, "grad_norm": 0.5871125948571569, "learning_rate": 4.396421846564236e-07, "loss": 0.3854, "step": 6027 }, { "epoch": 2.694300518134715, "grad_norm": 0.56724477535172, "learning_rate": 4.3825787987731405e-07, "loss": 0.3767, "step": 6028 }, { "epoch": 2.6947510700608244, "grad_norm": 0.5709861511558837, "learning_rate": 4.36875709093576e-07, "loss": 0.3921, "step": 6029 }, { "epoch": 2.695201621986934, "grad_norm": 0.5773445531238474, "learning_rate": 4.35495672613685e-07, "loss": 0.3863, "step": 6030 }, { "epoch": 2.6956521739130435, "grad_norm": 0.5675375730235427, "learning_rate": 4.341177707456368e-07, "loss": 0.387, "step": 6031 }, { "epoch": 2.696102725839153, "grad_norm": 0.5910871358144157, "learning_rate": 4.327420037969532e-07, "loss": 0.4229, "step": 6032 }, { "epoch": 2.6965532777652625, "grad_norm": 0.6010377466364109, "learning_rate": 4.313683720746775e-07, "loss": 0.4164, "step": 6033 }, { "epoch": 2.697003829691372, "grad_norm": 0.5799663760234216, "learning_rate": 4.299968758853812e-07, "loss": 0.4168, "step": 6034 }, { "epoch": 2.697454381617481, "grad_norm": 0.581163237173428, "learning_rate": 4.2862751553515047e-07, "loss": 0.366, "step": 6035 }, { "epoch": 2.6979049335435907, "grad_norm": 0.5865096080576558, "learning_rate": 4.272602913296053e-07, "loss": 0.4037, "step": 6036 }, { "epoch": 2.6983554854697003, "grad_norm": 0.5703763364102911, "learning_rate": 4.2589520357387836e-07, "loss": 0.3714, "step": 6037 }, { "epoch": 2.69880603739581, "grad_norm": 0.6091483439848644, "learning_rate": 4.2453225257263585e-07, "loss": 0.4053, "step": 6038 }, { "epoch": 2.6992565893219194, "grad_norm": 0.5928757035685747, "learning_rate": 4.231714386300567e-07, "loss": 0.421, "step": 6039 }, { "epoch": 2.699707141248029, "grad_norm": 0.6118363557559781, "learning_rate": 4.2181276204985356e-07, "loss": 0.3901, "step": 6040 }, { "epoch": 2.7001576931741385, "grad_norm": 0.5882049233969875, "learning_rate": 4.204562231352516e-07, "loss": 0.4101, "step": 6041 }, { "epoch": 2.7006082451002476, "grad_norm": 0.5737129343004032, "learning_rate": 4.1910182218900977e-07, "loss": 0.3743, "step": 6042 }, { "epoch": 2.7010587970263575, "grad_norm": 0.5898514405702647, "learning_rate": 4.1774955951339734e-07, "loss": 0.3925, "step": 6043 }, { "epoch": 2.7015093489524666, "grad_norm": 0.5782209512094396, "learning_rate": 4.1639943541021966e-07, "loss": 0.3882, "step": 6044 }, { "epoch": 2.701959900878576, "grad_norm": 0.588330752141274, "learning_rate": 4.150514501807945e-07, "loss": 0.4002, "step": 6045 }, { "epoch": 2.7024104528046857, "grad_norm": 0.5838387919449651, "learning_rate": 4.137056041259657e-07, "loss": 0.3768, "step": 6046 }, { "epoch": 2.7028610047307953, "grad_norm": 0.5797655456841616, "learning_rate": 4.123618975461008e-07, "loss": 0.3619, "step": 6047 }, { "epoch": 2.703311556656905, "grad_norm": 0.6024242765654773, "learning_rate": 4.1102033074108984e-07, "loss": 0.3759, "step": 6048 }, { "epoch": 2.7037621085830144, "grad_norm": 0.6024948346667992, "learning_rate": 4.0968090401034444e-07, "loss": 0.3903, "step": 6049 }, { "epoch": 2.704212660509124, "grad_norm": 0.584559415811675, "learning_rate": 4.083436176527977e-07, "loss": 0.3975, "step": 6050 }, { "epoch": 2.704663212435233, "grad_norm": 0.588348038736934, "learning_rate": 4.070084719669054e-07, "loss": 0.4164, "step": 6051 }, { "epoch": 2.7051137643613425, "grad_norm": 0.6019965074186363, "learning_rate": 4.056754672506491e-07, "loss": 0.3862, "step": 6052 }, { "epoch": 2.705564316287452, "grad_norm": 0.5932480840108216, "learning_rate": 4.043446038015264e-07, "loss": 0.372, "step": 6053 }, { "epoch": 2.7060148682135616, "grad_norm": 0.575038075817494, "learning_rate": 4.030158819165619e-07, "loss": 0.3901, "step": 6054 }, { "epoch": 2.706465420139671, "grad_norm": 0.5804518286495891, "learning_rate": 4.016893018922996e-07, "loss": 0.3813, "step": 6055 }, { "epoch": 2.7069159720657807, "grad_norm": 0.5791981371883855, "learning_rate": 4.0036486402480704e-07, "loss": 0.4103, "step": 6056 }, { "epoch": 2.7073665239918903, "grad_norm": 0.5711876565207216, "learning_rate": 3.9904256860967436e-07, "loss": 0.4157, "step": 6057 }, { "epoch": 2.7078170759179994, "grad_norm": 0.5848208470865461, "learning_rate": 3.97722415942009e-07, "loss": 0.3917, "step": 6058 }, { "epoch": 2.708267627844109, "grad_norm": 0.5627479161818861, "learning_rate": 3.964044063164463e-07, "loss": 0.3845, "step": 6059 }, { "epoch": 2.7087181797702184, "grad_norm": 0.5670495703105042, "learning_rate": 3.950885400271409e-07, "loss": 0.3871, "step": 6060 }, { "epoch": 2.709168731696328, "grad_norm": 0.5966966533549901, "learning_rate": 3.937748173677647e-07, "loss": 0.4253, "step": 6061 }, { "epoch": 2.7096192836224375, "grad_norm": 0.577200780764197, "learning_rate": 3.924632386315186e-07, "loss": 0.4125, "step": 6062 }, { "epoch": 2.710069835548547, "grad_norm": 0.5833587407229516, "learning_rate": 3.911538041111207e-07, "loss": 0.4041, "step": 6063 }, { "epoch": 2.7105203874746566, "grad_norm": 0.5772488045074634, "learning_rate": 3.898465140988106e-07, "loss": 0.3561, "step": 6064 }, { "epoch": 2.7109709394007657, "grad_norm": 0.6012606222906496, "learning_rate": 3.885413688863515e-07, "loss": 0.3936, "step": 6065 }, { "epoch": 2.7114214913268753, "grad_norm": 0.5699131245151386, "learning_rate": 3.8723836876502494e-07, "loss": 0.3706, "step": 6066 }, { "epoch": 2.711872043252985, "grad_norm": 0.5671306540368021, "learning_rate": 3.8593751402563715e-07, "loss": 0.3808, "step": 6067 }, { "epoch": 2.7123225951790944, "grad_norm": 0.5725071611331906, "learning_rate": 3.8463880495851147e-07, "loss": 0.3849, "step": 6068 }, { "epoch": 2.712773147105204, "grad_norm": 0.5815726382178246, "learning_rate": 3.83342241853496e-07, "loss": 0.407, "step": 6069 }, { "epoch": 2.7132236990313134, "grad_norm": 0.5913499210611224, "learning_rate": 3.820478249999582e-07, "loss": 0.379, "step": 6070 }, { "epoch": 2.713674250957423, "grad_norm": 0.6095332895507486, "learning_rate": 3.80755554686788e-07, "loss": 0.411, "step": 6071 }, { "epoch": 2.714124802883532, "grad_norm": 0.5934142237012743, "learning_rate": 3.7946543120239153e-07, "loss": 0.3943, "step": 6072 }, { "epoch": 2.714575354809642, "grad_norm": 0.5620499548168071, "learning_rate": 3.781774548347039e-07, "loss": 0.3641, "step": 6073 }, { "epoch": 2.715025906735751, "grad_norm": 0.5594865268087056, "learning_rate": 3.768916258711719e-07, "loss": 0.3856, "step": 6074 }, { "epoch": 2.7154764586618607, "grad_norm": 0.5880183217553567, "learning_rate": 3.756079445987715e-07, "loss": 0.3808, "step": 6075 }, { "epoch": 2.7159270105879703, "grad_norm": 0.5769216876161021, "learning_rate": 3.743264113039924e-07, "loss": 0.3917, "step": 6076 }, { "epoch": 2.71637756251408, "grad_norm": 0.5860321489172957, "learning_rate": 3.7304702627285137e-07, "loss": 0.3921, "step": 6077 }, { "epoch": 2.7168281144401893, "grad_norm": 0.5579533671240413, "learning_rate": 3.7176978979087787e-07, "loss": 0.3858, "step": 6078 }, { "epoch": 2.7172786663662984, "grad_norm": 0.5813241907308191, "learning_rate": 3.7049470214313156e-07, "loss": 0.4068, "step": 6079 }, { "epoch": 2.7177292182924084, "grad_norm": 0.5765979982958044, "learning_rate": 3.692217636141826e-07, "loss": 0.3809, "step": 6080 }, { "epoch": 2.7181797702185175, "grad_norm": 0.5889675303959009, "learning_rate": 3.679509744881282e-07, "loss": 0.3894, "step": 6081 }, { "epoch": 2.718630322144627, "grad_norm": 0.5777881489714107, "learning_rate": 3.6668233504858486e-07, "loss": 0.3826, "step": 6082 }, { "epoch": 2.7190808740707366, "grad_norm": 0.5819186823321773, "learning_rate": 3.6541584557868604e-07, "loss": 0.3848, "step": 6083 }, { "epoch": 2.719531425996846, "grad_norm": 0.5858042223690932, "learning_rate": 3.64151506361089e-07, "loss": 0.4178, "step": 6084 }, { "epoch": 2.7199819779229557, "grad_norm": 0.5869380166350161, "learning_rate": 3.628893176779691e-07, "loss": 0.3907, "step": 6085 }, { "epoch": 2.7204325298490653, "grad_norm": 0.5994076397133691, "learning_rate": 3.6162927981102327e-07, "loss": 0.4004, "step": 6086 }, { "epoch": 2.720883081775175, "grad_norm": 0.5690257613083455, "learning_rate": 3.603713930414676e-07, "loss": 0.3714, "step": 6087 }, { "epoch": 2.721333633701284, "grad_norm": 0.5695940603049022, "learning_rate": 3.591156576500365e-07, "loss": 0.3877, "step": 6088 }, { "epoch": 2.7217841856273934, "grad_norm": 0.5894546162284856, "learning_rate": 3.578620739169869e-07, "loss": 0.412, "step": 6089 }, { "epoch": 2.722234737553503, "grad_norm": 0.5828437126867981, "learning_rate": 3.5661064212209494e-07, "loss": 0.3866, "step": 6090 }, { "epoch": 2.7226852894796125, "grad_norm": 0.5673124985306348, "learning_rate": 3.5536136254465393e-07, "loss": 0.377, "step": 6091 }, { "epoch": 2.723135841405722, "grad_norm": 0.5851189046420234, "learning_rate": 3.541142354634808e-07, "loss": 0.4135, "step": 6092 }, { "epoch": 2.7235863933318316, "grad_norm": 0.5588950888669598, "learning_rate": 3.5286926115690843e-07, "loss": 0.4214, "step": 6093 }, { "epoch": 2.724036945257941, "grad_norm": 0.5723017716928714, "learning_rate": 3.516264399027924e-07, "loss": 0.4404, "step": 6094 }, { "epoch": 2.7244874971840503, "grad_norm": 0.5831791392311506, "learning_rate": 3.5038577197850756e-07, "loss": 0.3816, "step": 6095 }, { "epoch": 2.72493804911016, "grad_norm": 0.5639637211596197, "learning_rate": 3.4914725766094227e-07, "loss": 0.387, "step": 6096 }, { "epoch": 2.7253886010362693, "grad_norm": 0.5668610954650609, "learning_rate": 3.4791089722651437e-07, "loss": 0.3744, "step": 6097 }, { "epoch": 2.725839152962379, "grad_norm": 0.5914558008304697, "learning_rate": 3.4667669095114986e-07, "loss": 0.4067, "step": 6098 }, { "epoch": 2.7262897048884884, "grad_norm": 0.5915166309817396, "learning_rate": 3.4544463911030613e-07, "loss": 0.3645, "step": 6099 }, { "epoch": 2.726740256814598, "grad_norm": 0.5829418325229673, "learning_rate": 3.4421474197894655e-07, "loss": 0.3799, "step": 6100 }, { "epoch": 2.7271908087407075, "grad_norm": 0.5835671858148817, "learning_rate": 3.4298699983156713e-07, "loss": 0.3877, "step": 6101 }, { "epoch": 2.7276413606668166, "grad_norm": 0.5812621153643822, "learning_rate": 3.4176141294216867e-07, "loss": 0.3957, "step": 6102 }, { "epoch": 2.7280919125929266, "grad_norm": 0.5796293589116887, "learning_rate": 3.4053798158428575e-07, "loss": 0.4191, "step": 6103 }, { "epoch": 2.7285424645190357, "grad_norm": 0.5913933407048925, "learning_rate": 3.393167060309588e-07, "loss": 0.4549, "step": 6104 }, { "epoch": 2.7289930164451452, "grad_norm": 0.5818761733555377, "learning_rate": 3.380975865547564e-07, "loss": 0.3983, "step": 6105 }, { "epoch": 2.7289930164451452, "eval_loss": 0.6601502299308777, "eval_runtime": 24.3763, "eval_samples_per_second": 11.446, "eval_steps_per_second": 0.492, "step": 6105 } ], "logging_steps": 1, "max_steps": 6657, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 555, "total_flos": 3834631359037440.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }