diff --git "a/e1.0/trainer_state.json" "b/e1.0/trainer_state.json" deleted file mode 100644--- "a/e1.0/trainer_state.json" +++ /dev/null @@ -1,11233 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.005163240866168, - "eval_steps": 500, - "global_step": 3200, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.000628227025541355, - "grad_norm": 37.5, - "learning_rate": 2.5000000000000004e-07, - "loss": 1.9002, - "step": 2 - }, - { - "epoch": 0.00125645405108271, - "grad_norm": 5.65625, - "learning_rate": 5.000000000000001e-07, - "loss": 1.6234, - "step": 4 - }, - { - "epoch": 0.001884681076624065, - "grad_norm": 5.15625, - "learning_rate": 7.5e-07, - "loss": 1.6223, - "step": 6 - }, - { - "epoch": 0.00251290810216542, - "grad_norm": 4.8125, - "learning_rate": 1.0000000000000002e-06, - "loss": 1.6607, - "step": 8 - }, - { - "epoch": 0.003141135127706775, - "grad_norm": 5.03125, - "learning_rate": 1.25e-06, - "loss": 1.6644, - "step": 10 - }, - { - "epoch": 0.00376936215324813, - "grad_norm": 5.71875, - "learning_rate": 1.5e-06, - "loss": 1.8307, - "step": 12 - }, - { - "epoch": 0.004397589178789485, - "grad_norm": 5.28125, - "learning_rate": 1.75e-06, - "loss": 1.623, - "step": 14 - }, - { - "epoch": 0.00502581620433084, - "grad_norm": 3.625, - "learning_rate": 2.0000000000000003e-06, - "loss": 1.6915, - "step": 16 - }, - { - "epoch": 0.005654043229872195, - "grad_norm": 3.609375, - "learning_rate": 2.25e-06, - "loss": 1.8222, - "step": 18 - }, - { - "epoch": 0.00628227025541355, - "grad_norm": 3.21875, - "learning_rate": 2.5e-06, - "loss": 1.6702, - "step": 20 - }, - { - "epoch": 0.006910497280954905, - "grad_norm": 2.140625, - "learning_rate": 2.7500000000000004e-06, - "loss": 1.6118, - "step": 22 - }, - { - "epoch": 0.00753872430649626, - "grad_norm": 2.890625, - "learning_rate": 3e-06, - "loss": 1.6499, - "step": 24 - }, - { - "epoch": 0.008166951332037615, - "grad_norm": 2.421875, - "learning_rate": 3.2500000000000002e-06, - "loss": 1.4785, - "step": 26 - }, - { - "epoch": 0.00879517835757897, - "grad_norm": 2.421875, - "learning_rate": 3.5e-06, - "loss": 1.7235, - "step": 28 - }, - { - "epoch": 0.009423405383120325, - "grad_norm": 2.25, - "learning_rate": 3.7500000000000005e-06, - "loss": 1.6245, - "step": 30 - }, - { - "epoch": 0.01005163240866168, - "grad_norm": 1.6640625, - "learning_rate": 4.000000000000001e-06, - "loss": 1.5486, - "step": 32 - }, - { - "epoch": 0.010679859434203035, - "grad_norm": 2.09375, - "learning_rate": 4.25e-06, - "loss": 1.4448, - "step": 34 - }, - { - "epoch": 0.01130808645974439, - "grad_norm": 1.5078125, - "learning_rate": 4.5e-06, - "loss": 1.48, - "step": 36 - }, - { - "epoch": 0.011936313485285744, - "grad_norm": 1.6796875, - "learning_rate": 4.75e-06, - "loss": 1.5128, - "step": 38 - }, - { - "epoch": 0.0125645405108271, - "grad_norm": 1.5234375, - "learning_rate": 5e-06, - "loss": 1.4774, - "step": 40 - }, - { - "epoch": 0.013192767536368456, - "grad_norm": 1.546875, - "learning_rate": 5.2500000000000006e-06, - "loss": 1.3692, - "step": 42 - }, - { - "epoch": 0.01382099456190981, - "grad_norm": 1.34375, - "learning_rate": 5.500000000000001e-06, - "loss": 1.5056, - "step": 44 - }, - { - "epoch": 0.014449221587451166, - "grad_norm": 1.21875, - "learning_rate": 5.75e-06, - "loss": 1.4744, - "step": 46 - }, - { - "epoch": 0.01507744861299252, - "grad_norm": 1.0078125, - "learning_rate": 6e-06, - "loss": 1.5776, - "step": 48 - }, - { - "epoch": 0.015705675638533874, - "grad_norm": 1.1015625, - "learning_rate": 6.25e-06, - "loss": 1.486, - "step": 50 - }, - { - "epoch": 0.01633390266407523, - "grad_norm": 0.85546875, - "learning_rate": 6.5000000000000004e-06, - "loss": 1.5541, - "step": 52 - }, - { - "epoch": 0.016962129689616585, - "grad_norm": 0.984375, - "learning_rate": 6.750000000000001e-06, - "loss": 1.3999, - "step": 54 - }, - { - "epoch": 0.01759035671515794, - "grad_norm": 0.96875, - "learning_rate": 7e-06, - "loss": 1.4455, - "step": 56 - }, - { - "epoch": 0.018218583740699294, - "grad_norm": 1.0546875, - "learning_rate": 7.25e-06, - "loss": 1.5221, - "step": 58 - }, - { - "epoch": 0.01884681076624065, - "grad_norm": 0.88671875, - "learning_rate": 7.500000000000001e-06, - "loss": 1.4798, - "step": 60 - }, - { - "epoch": 0.019475037791782005, - "grad_norm": 0.9140625, - "learning_rate": 7.75e-06, - "loss": 1.4334, - "step": 62 - }, - { - "epoch": 0.02010326481732336, - "grad_norm": 0.98046875, - "learning_rate": 8.000000000000001e-06, - "loss": 1.3293, - "step": 64 - }, - { - "epoch": 0.020731491842864717, - "grad_norm": 0.83984375, - "learning_rate": 8.25e-06, - "loss": 1.4174, - "step": 66 - }, - { - "epoch": 0.02135971886840607, - "grad_norm": 0.859375, - "learning_rate": 8.5e-06, - "loss": 1.4177, - "step": 68 - }, - { - "epoch": 0.021987945893947425, - "grad_norm": 0.84765625, - "learning_rate": 8.750000000000001e-06, - "loss": 1.3708, - "step": 70 - }, - { - "epoch": 0.02261617291948878, - "grad_norm": 0.90234375, - "learning_rate": 9e-06, - "loss": 1.4062, - "step": 72 - }, - { - "epoch": 0.023244399945030136, - "grad_norm": 0.83984375, - "learning_rate": 9.250000000000001e-06, - "loss": 1.3829, - "step": 74 - }, - { - "epoch": 0.02387262697057149, - "grad_norm": 0.9375, - "learning_rate": 9.5e-06, - "loss": 1.3441, - "step": 76 - }, - { - "epoch": 0.024500853996112845, - "grad_norm": 0.90234375, - "learning_rate": 9.75e-06, - "loss": 1.5625, - "step": 78 - }, - { - "epoch": 0.0251290810216542, - "grad_norm": 0.81640625, - "learning_rate": 1e-05, - "loss": 1.4504, - "step": 80 - }, - { - "epoch": 0.025757308047195556, - "grad_norm": 0.83984375, - "learning_rate": 1.025e-05, - "loss": 1.4672, - "step": 82 - }, - { - "epoch": 0.026385535072736912, - "grad_norm": 0.82421875, - "learning_rate": 1.0500000000000001e-05, - "loss": 1.4405, - "step": 84 - }, - { - "epoch": 0.027013762098278264, - "grad_norm": 1.0546875, - "learning_rate": 1.075e-05, - "loss": 1.3557, - "step": 86 - }, - { - "epoch": 0.02764198912381962, - "grad_norm": 0.90234375, - "learning_rate": 1.1000000000000001e-05, - "loss": 1.4309, - "step": 88 - }, - { - "epoch": 0.028270216149360976, - "grad_norm": 0.78125, - "learning_rate": 1.125e-05, - "loss": 1.3528, - "step": 90 - }, - { - "epoch": 0.02889844317490233, - "grad_norm": 0.96875, - "learning_rate": 1.15e-05, - "loss": 1.4093, - "step": 92 - }, - { - "epoch": 0.029526670200443684, - "grad_norm": 0.87890625, - "learning_rate": 1.1750000000000001e-05, - "loss": 1.4324, - "step": 94 - }, - { - "epoch": 0.03015489722598504, - "grad_norm": 0.875, - "learning_rate": 1.2e-05, - "loss": 1.4622, - "step": 96 - }, - { - "epoch": 0.030783124251526395, - "grad_norm": 0.85546875, - "learning_rate": 1.2250000000000001e-05, - "loss": 1.5166, - "step": 98 - }, - { - "epoch": 0.03141135127706775, - "grad_norm": 0.76953125, - "learning_rate": 1.25e-05, - "loss": 1.4729, - "step": 100 - }, - { - "epoch": 0.032039578302609104, - "grad_norm": 0.8828125, - "learning_rate": 1.275e-05, - "loss": 1.4201, - "step": 102 - }, - { - "epoch": 0.03266780532815046, - "grad_norm": 0.97265625, - "learning_rate": 1.3000000000000001e-05, - "loss": 1.3646, - "step": 104 - }, - { - "epoch": 0.033296032353691815, - "grad_norm": 0.859375, - "learning_rate": 1.325e-05, - "loss": 1.3105, - "step": 106 - }, - { - "epoch": 0.03392425937923317, - "grad_norm": 0.78125, - "learning_rate": 1.3500000000000001e-05, - "loss": 1.5302, - "step": 108 - }, - { - "epoch": 0.03455248640477453, - "grad_norm": 0.875, - "learning_rate": 1.375e-05, - "loss": 1.3979, - "step": 110 - }, - { - "epoch": 0.03518071343031588, - "grad_norm": 0.796875, - "learning_rate": 1.4e-05, - "loss": 1.3961, - "step": 112 - }, - { - "epoch": 0.03580894045585724, - "grad_norm": 0.796875, - "learning_rate": 1.425e-05, - "loss": 1.3645, - "step": 114 - }, - { - "epoch": 0.03643716748139859, - "grad_norm": 0.7421875, - "learning_rate": 1.45e-05, - "loss": 1.306, - "step": 116 - }, - { - "epoch": 0.03706539450693994, - "grad_norm": 0.8828125, - "learning_rate": 1.4750000000000003e-05, - "loss": 1.3799, - "step": 118 - }, - { - "epoch": 0.0376936215324813, - "grad_norm": 0.73828125, - "learning_rate": 1.5000000000000002e-05, - "loss": 1.3281, - "step": 120 - }, - { - "epoch": 0.038321848558022654, - "grad_norm": 0.87890625, - "learning_rate": 1.525e-05, - "loss": 1.4052, - "step": 122 - }, - { - "epoch": 0.03895007558356401, - "grad_norm": 0.8203125, - "learning_rate": 1.55e-05, - "loss": 1.4946, - "step": 124 - }, - { - "epoch": 0.039578302609105366, - "grad_norm": 0.80859375, - "learning_rate": 1.575e-05, - "loss": 1.4292, - "step": 126 - }, - { - "epoch": 0.04020652963464672, - "grad_norm": 1.0078125, - "learning_rate": 1.6000000000000003e-05, - "loss": 1.4858, - "step": 128 - }, - { - "epoch": 0.04083475666018808, - "grad_norm": 0.9765625, - "learning_rate": 1.6250000000000002e-05, - "loss": 1.2745, - "step": 130 - }, - { - "epoch": 0.04146298368572943, - "grad_norm": 0.8046875, - "learning_rate": 1.65e-05, - "loss": 1.4684, - "step": 132 - }, - { - "epoch": 0.04209121071127078, - "grad_norm": 0.80859375, - "learning_rate": 1.675e-05, - "loss": 1.4275, - "step": 134 - }, - { - "epoch": 0.04271943773681214, - "grad_norm": 0.90625, - "learning_rate": 1.7e-05, - "loss": 1.2831, - "step": 136 - }, - { - "epoch": 0.043347664762353494, - "grad_norm": 0.953125, - "learning_rate": 1.7250000000000003e-05, - "loss": 1.445, - "step": 138 - }, - { - "epoch": 0.04397589178789485, - "grad_norm": 0.80859375, - "learning_rate": 1.7500000000000002e-05, - "loss": 1.3457, - "step": 140 - }, - { - "epoch": 0.044604118813436205, - "grad_norm": 0.8046875, - "learning_rate": 1.775e-05, - "loss": 1.3961, - "step": 142 - }, - { - "epoch": 0.04523234583897756, - "grad_norm": 0.75, - "learning_rate": 1.8e-05, - "loss": 1.2985, - "step": 144 - }, - { - "epoch": 0.04586057286451892, - "grad_norm": 0.81640625, - "learning_rate": 1.825e-05, - "loss": 1.3075, - "step": 146 - }, - { - "epoch": 0.04648879989006027, - "grad_norm": 0.76953125, - "learning_rate": 1.8500000000000002e-05, - "loss": 1.3602, - "step": 148 - }, - { - "epoch": 0.04711702691560163, - "grad_norm": 0.8828125, - "learning_rate": 1.8750000000000002e-05, - "loss": 1.4481, - "step": 150 - }, - { - "epoch": 0.04774525394114298, - "grad_norm": 0.80078125, - "learning_rate": 1.9e-05, - "loss": 1.409, - "step": 152 - }, - { - "epoch": 0.04837348096668433, - "grad_norm": 0.80859375, - "learning_rate": 1.925e-05, - "loss": 1.357, - "step": 154 - }, - { - "epoch": 0.04900170799222569, - "grad_norm": 0.77734375, - "learning_rate": 1.95e-05, - "loss": 1.2841, - "step": 156 - }, - { - "epoch": 0.049629935017767045, - "grad_norm": 0.7421875, - "learning_rate": 1.9750000000000002e-05, - "loss": 1.4336, - "step": 158 - }, - { - "epoch": 0.0502581620433084, - "grad_norm": 0.9609375, - "learning_rate": 2e-05, - "loss": 1.3853, - "step": 160 - }, - { - "epoch": 0.050886389068849756, - "grad_norm": 0.89453125, - "learning_rate": 1.9997461123452876e-05, - "loss": 1.3465, - "step": 162 - }, - { - "epoch": 0.05151461609439111, - "grad_norm": 0.84375, - "learning_rate": 1.9994922246905744e-05, - "loss": 1.3669, - "step": 164 - }, - { - "epoch": 0.05214284311993247, - "grad_norm": 0.7421875, - "learning_rate": 1.999238337035862e-05, - "loss": 1.6046, - "step": 166 - }, - { - "epoch": 0.052771070145473824, - "grad_norm": 0.81640625, - "learning_rate": 1.998984449381149e-05, - "loss": 1.2993, - "step": 168 - }, - { - "epoch": 0.05339929717101517, - "grad_norm": 0.79296875, - "learning_rate": 1.9987305617264362e-05, - "loss": 1.4495, - "step": 170 - }, - { - "epoch": 0.05402752419655653, - "grad_norm": 0.7734375, - "learning_rate": 1.9984766740717233e-05, - "loss": 1.3141, - "step": 172 - }, - { - "epoch": 0.054655751222097884, - "grad_norm": 0.71484375, - "learning_rate": 1.9982227864170108e-05, - "loss": 1.4852, - "step": 174 - }, - { - "epoch": 0.05528397824763924, - "grad_norm": 0.875, - "learning_rate": 1.997968898762298e-05, - "loss": 1.4228, - "step": 176 - }, - { - "epoch": 0.055912205273180596, - "grad_norm": 0.80078125, - "learning_rate": 1.997715011107585e-05, - "loss": 1.5617, - "step": 178 - }, - { - "epoch": 0.05654043229872195, - "grad_norm": 0.82421875, - "learning_rate": 1.9974611234528722e-05, - "loss": 1.2938, - "step": 180 - }, - { - "epoch": 0.05716865932426331, - "grad_norm": 0.7421875, - "learning_rate": 1.9972072357981597e-05, - "loss": 1.469, - "step": 182 - }, - { - "epoch": 0.05779688634980466, - "grad_norm": 0.82421875, - "learning_rate": 1.9969533481434465e-05, - "loss": 1.41, - "step": 184 - }, - { - "epoch": 0.05842511337534602, - "grad_norm": 0.7265625, - "learning_rate": 1.996699460488734e-05, - "loss": 1.4414, - "step": 186 - }, - { - "epoch": 0.05905334040088737, - "grad_norm": 0.8828125, - "learning_rate": 1.996445572834021e-05, - "loss": 1.2717, - "step": 188 - }, - { - "epoch": 0.059681567426428724, - "grad_norm": 0.7890625, - "learning_rate": 1.9961916851793083e-05, - "loss": 1.3179, - "step": 190 - }, - { - "epoch": 0.06030979445197008, - "grad_norm": 0.88671875, - "learning_rate": 1.9959377975245954e-05, - "loss": 1.4353, - "step": 192 - }, - { - "epoch": 0.060938021477511435, - "grad_norm": 0.79296875, - "learning_rate": 1.995683909869883e-05, - "loss": 1.4721, - "step": 194 - }, - { - "epoch": 0.06156624850305279, - "grad_norm": 0.7890625, - "learning_rate": 1.99543002221517e-05, - "loss": 1.4394, - "step": 196 - }, - { - "epoch": 0.06219447552859415, - "grad_norm": 0.765625, - "learning_rate": 1.995176134560457e-05, - "loss": 1.4004, - "step": 198 - }, - { - "epoch": 0.0628227025541355, - "grad_norm": 0.703125, - "learning_rate": 1.9949222469057443e-05, - "loss": 1.3159, - "step": 200 - }, - { - "epoch": 0.06345092957967685, - "grad_norm": 0.7890625, - "learning_rate": 1.9946683592510318e-05, - "loss": 1.408, - "step": 202 - }, - { - "epoch": 0.06407915660521821, - "grad_norm": 0.77734375, - "learning_rate": 1.994414471596319e-05, - "loss": 1.362, - "step": 204 - }, - { - "epoch": 0.06470738363075956, - "grad_norm": 0.7421875, - "learning_rate": 1.994160583941606e-05, - "loss": 1.321, - "step": 206 - }, - { - "epoch": 0.06533561065630092, - "grad_norm": 0.76953125, - "learning_rate": 1.9939066962868932e-05, - "loss": 1.3576, - "step": 208 - }, - { - "epoch": 0.06596383768184227, - "grad_norm": 0.77734375, - "learning_rate": 1.9936528086321803e-05, - "loss": 1.4552, - "step": 210 - }, - { - "epoch": 0.06659206470738363, - "grad_norm": 0.875, - "learning_rate": 1.9933989209774675e-05, - "loss": 1.3144, - "step": 212 - }, - { - "epoch": 0.06722029173292499, - "grad_norm": 0.8515625, - "learning_rate": 1.993145033322755e-05, - "loss": 1.3575, - "step": 214 - }, - { - "epoch": 0.06784851875846634, - "grad_norm": 0.828125, - "learning_rate": 1.992891145668042e-05, - "loss": 1.25, - "step": 216 - }, - { - "epoch": 0.0684767457840077, - "grad_norm": 0.75, - "learning_rate": 1.9926372580133292e-05, - "loss": 1.4611, - "step": 218 - }, - { - "epoch": 0.06910497280954905, - "grad_norm": 0.73828125, - "learning_rate": 1.9923833703586163e-05, - "loss": 1.2994, - "step": 220 - }, - { - "epoch": 0.06973319983509041, - "grad_norm": 0.9375, - "learning_rate": 1.9921294827039038e-05, - "loss": 1.2697, - "step": 222 - }, - { - "epoch": 0.07036142686063176, - "grad_norm": 0.81640625, - "learning_rate": 1.991875595049191e-05, - "loss": 1.4699, - "step": 224 - }, - { - "epoch": 0.07098965388617312, - "grad_norm": 0.92578125, - "learning_rate": 1.991621707394478e-05, - "loss": 1.4276, - "step": 226 - }, - { - "epoch": 0.07161788091171448, - "grad_norm": 0.74609375, - "learning_rate": 1.9913678197397652e-05, - "loss": 1.4052, - "step": 228 - }, - { - "epoch": 0.07224610793725583, - "grad_norm": 0.8203125, - "learning_rate": 1.9911139320850527e-05, - "loss": 1.3744, - "step": 230 - }, - { - "epoch": 0.07287433496279717, - "grad_norm": 0.87109375, - "learning_rate": 1.9908600444303395e-05, - "loss": 1.363, - "step": 232 - }, - { - "epoch": 0.07350256198833853, - "grad_norm": 0.859375, - "learning_rate": 1.990606156775627e-05, - "loss": 1.421, - "step": 234 - }, - { - "epoch": 0.07413078901387989, - "grad_norm": 0.76171875, - "learning_rate": 1.990352269120914e-05, - "loss": 1.5131, - "step": 236 - }, - { - "epoch": 0.07475901603942124, - "grad_norm": 1.5625, - "learning_rate": 1.9900983814662013e-05, - "loss": 1.3155, - "step": 238 - }, - { - "epoch": 0.0753872430649626, - "grad_norm": 0.78125, - "learning_rate": 1.9898444938114884e-05, - "loss": 1.3595, - "step": 240 - }, - { - "epoch": 0.07601547009050395, - "grad_norm": 0.7890625, - "learning_rate": 1.989590606156776e-05, - "loss": 1.3706, - "step": 242 - }, - { - "epoch": 0.07664369711604531, - "grad_norm": 0.72265625, - "learning_rate": 1.989336718502063e-05, - "loss": 1.3058, - "step": 244 - }, - { - "epoch": 0.07727192414158666, - "grad_norm": 0.71484375, - "learning_rate": 1.98908283084735e-05, - "loss": 1.3404, - "step": 246 - }, - { - "epoch": 0.07790015116712802, - "grad_norm": 0.9453125, - "learning_rate": 1.9888289431926376e-05, - "loss": 1.235, - "step": 248 - }, - { - "epoch": 0.07852837819266938, - "grad_norm": 0.84765625, - "learning_rate": 1.9885750555379248e-05, - "loss": 1.3668, - "step": 250 - }, - { - "epoch": 0.07915660521821073, - "grad_norm": 0.71875, - "learning_rate": 1.988321167883212e-05, - "loss": 1.3602, - "step": 252 - }, - { - "epoch": 0.07978483224375209, - "grad_norm": 0.828125, - "learning_rate": 1.988067280228499e-05, - "loss": 1.3833, - "step": 254 - }, - { - "epoch": 0.08041305926929344, - "grad_norm": 0.796875, - "learning_rate": 1.9878133925737865e-05, - "loss": 1.4476, - "step": 256 - }, - { - "epoch": 0.0810412862948348, - "grad_norm": 0.68359375, - "learning_rate": 1.9875595049190733e-05, - "loss": 1.4111, - "step": 258 - }, - { - "epoch": 0.08166951332037616, - "grad_norm": 0.88671875, - "learning_rate": 1.9873056172643608e-05, - "loss": 1.3636, - "step": 260 - }, - { - "epoch": 0.08229774034591751, - "grad_norm": 0.78515625, - "learning_rate": 1.987051729609648e-05, - "loss": 1.2524, - "step": 262 - }, - { - "epoch": 0.08292596737145887, - "grad_norm": 0.9609375, - "learning_rate": 1.986797841954935e-05, - "loss": 1.4048, - "step": 264 - }, - { - "epoch": 0.08355419439700022, - "grad_norm": 0.7109375, - "learning_rate": 1.9865439543002222e-05, - "loss": 1.3619, - "step": 266 - }, - { - "epoch": 0.08418242142254156, - "grad_norm": 0.796875, - "learning_rate": 1.9862900666455097e-05, - "loss": 1.4125, - "step": 268 - }, - { - "epoch": 0.08481064844808292, - "grad_norm": 0.6875, - "learning_rate": 1.986036178990797e-05, - "loss": 1.4536, - "step": 270 - }, - { - "epoch": 0.08543887547362428, - "grad_norm": 0.8125, - "learning_rate": 1.985782291336084e-05, - "loss": 1.392, - "step": 272 - }, - { - "epoch": 0.08606710249916563, - "grad_norm": 0.77734375, - "learning_rate": 1.985528403681371e-05, - "loss": 1.393, - "step": 274 - }, - { - "epoch": 0.08669532952470699, - "grad_norm": 0.91015625, - "learning_rate": 1.9852745160266586e-05, - "loss": 1.3635, - "step": 276 - }, - { - "epoch": 0.08732355655024834, - "grad_norm": 0.75, - "learning_rate": 1.9850206283719454e-05, - "loss": 1.4626, - "step": 278 - }, - { - "epoch": 0.0879517835757897, - "grad_norm": 0.8671875, - "learning_rate": 1.984766740717233e-05, - "loss": 1.3507, - "step": 280 - }, - { - "epoch": 0.08858001060133106, - "grad_norm": 0.83203125, - "learning_rate": 1.98451285306252e-05, - "loss": 1.4432, - "step": 282 - }, - { - "epoch": 0.08920823762687241, - "grad_norm": 0.83203125, - "learning_rate": 1.984258965407807e-05, - "loss": 1.3932, - "step": 284 - }, - { - "epoch": 0.08983646465241377, - "grad_norm": 0.8203125, - "learning_rate": 1.9840050777530943e-05, - "loss": 1.391, - "step": 286 - }, - { - "epoch": 0.09046469167795512, - "grad_norm": 0.7109375, - "learning_rate": 1.9837511900983818e-05, - "loss": 1.4163, - "step": 288 - }, - { - "epoch": 0.09109291870349648, - "grad_norm": 1.171875, - "learning_rate": 1.983497302443669e-05, - "loss": 1.4135, - "step": 290 - }, - { - "epoch": 0.09172114572903783, - "grad_norm": 0.8515625, - "learning_rate": 1.983243414788956e-05, - "loss": 1.4099, - "step": 292 - }, - { - "epoch": 0.09234937275457919, - "grad_norm": 0.76171875, - "learning_rate": 1.982989527134243e-05, - "loss": 1.2512, - "step": 294 - }, - { - "epoch": 0.09297759978012055, - "grad_norm": 0.734375, - "learning_rate": 1.9827356394795306e-05, - "loss": 1.255, - "step": 296 - }, - { - "epoch": 0.0936058268056619, - "grad_norm": 0.87109375, - "learning_rate": 1.9824817518248174e-05, - "loss": 1.2295, - "step": 298 - }, - { - "epoch": 0.09423405383120326, - "grad_norm": 0.765625, - "learning_rate": 1.982227864170105e-05, - "loss": 1.4514, - "step": 300 - }, - { - "epoch": 0.09486228085674461, - "grad_norm": 0.8828125, - "learning_rate": 1.981973976515392e-05, - "loss": 1.3137, - "step": 302 - }, - { - "epoch": 0.09549050788228595, - "grad_norm": 0.86328125, - "learning_rate": 1.9817200888606792e-05, - "loss": 1.3511, - "step": 304 - }, - { - "epoch": 0.09611873490782731, - "grad_norm": 0.85546875, - "learning_rate": 1.9814662012059663e-05, - "loss": 1.3035, - "step": 306 - }, - { - "epoch": 0.09674696193336867, - "grad_norm": 0.734375, - "learning_rate": 1.9812123135512538e-05, - "loss": 1.4151, - "step": 308 - }, - { - "epoch": 0.09737518895891002, - "grad_norm": 0.79296875, - "learning_rate": 1.980958425896541e-05, - "loss": 1.3819, - "step": 310 - }, - { - "epoch": 0.09800341598445138, - "grad_norm": 0.76953125, - "learning_rate": 1.980704538241828e-05, - "loss": 1.3212, - "step": 312 - }, - { - "epoch": 0.09863164300999273, - "grad_norm": 0.86328125, - "learning_rate": 1.9804506505871152e-05, - "loss": 1.4313, - "step": 314 - }, - { - "epoch": 0.09925987003553409, - "grad_norm": 0.828125, - "learning_rate": 1.9801967629324027e-05, - "loss": 1.4021, - "step": 316 - }, - { - "epoch": 0.09988809706107545, - "grad_norm": 0.7578125, - "learning_rate": 1.97994287527769e-05, - "loss": 1.3191, - "step": 318 - }, - { - "epoch": 0.1005163240866168, - "grad_norm": 0.734375, - "learning_rate": 1.979688987622977e-05, - "loss": 1.3138, - "step": 320 - }, - { - "epoch": 0.10114455111215816, - "grad_norm": 0.78125, - "learning_rate": 1.979435099968264e-05, - "loss": 1.3881, - "step": 322 - }, - { - "epoch": 0.10177277813769951, - "grad_norm": 0.7265625, - "learning_rate": 1.9791812123135513e-05, - "loss": 1.1786, - "step": 324 - }, - { - "epoch": 0.10240100516324087, - "grad_norm": 0.73828125, - "learning_rate": 1.9789273246588384e-05, - "loss": 1.246, - "step": 326 - }, - { - "epoch": 0.10302923218878222, - "grad_norm": 0.8515625, - "learning_rate": 1.978673437004126e-05, - "loss": 1.359, - "step": 328 - }, - { - "epoch": 0.10365745921432358, - "grad_norm": 0.91796875, - "learning_rate": 1.978419549349413e-05, - "loss": 1.271, - "step": 330 - }, - { - "epoch": 0.10428568623986494, - "grad_norm": 0.75, - "learning_rate": 1.9781656616947e-05, - "loss": 1.4137, - "step": 332 - }, - { - "epoch": 0.10491391326540629, - "grad_norm": 0.75390625, - "learning_rate": 1.9779117740399876e-05, - "loss": 1.3471, - "step": 334 - }, - { - "epoch": 0.10554214029094765, - "grad_norm": 0.70703125, - "learning_rate": 1.9776578863852748e-05, - "loss": 1.4421, - "step": 336 - }, - { - "epoch": 0.10617036731648899, - "grad_norm": 0.73828125, - "learning_rate": 1.977403998730562e-05, - "loss": 1.2823, - "step": 338 - }, - { - "epoch": 0.10679859434203035, - "grad_norm": 0.76171875, - "learning_rate": 1.977150111075849e-05, - "loss": 1.463, - "step": 340 - }, - { - "epoch": 0.1074268213675717, - "grad_norm": 0.74609375, - "learning_rate": 1.9768962234211365e-05, - "loss": 1.2987, - "step": 342 - }, - { - "epoch": 0.10805504839311306, - "grad_norm": 0.88671875, - "learning_rate": 1.9766423357664237e-05, - "loss": 1.4113, - "step": 344 - }, - { - "epoch": 0.10868327541865441, - "grad_norm": 0.7578125, - "learning_rate": 1.9763884481117108e-05, - "loss": 1.4153, - "step": 346 - }, - { - "epoch": 0.10931150244419577, - "grad_norm": 0.703125, - "learning_rate": 1.976134560456998e-05, - "loss": 1.3976, - "step": 348 - }, - { - "epoch": 0.10993972946973712, - "grad_norm": 0.78515625, - "learning_rate": 1.975880672802285e-05, - "loss": 1.395, - "step": 350 - }, - { - "epoch": 0.11056795649527848, - "grad_norm": 0.83984375, - "learning_rate": 1.9756267851475722e-05, - "loss": 1.533, - "step": 352 - }, - { - "epoch": 0.11119618352081984, - "grad_norm": 0.796875, - "learning_rate": 1.9753728974928597e-05, - "loss": 1.3265, - "step": 354 - }, - { - "epoch": 0.11182441054636119, - "grad_norm": 0.76171875, - "learning_rate": 1.9751190098381468e-05, - "loss": 1.4088, - "step": 356 - }, - { - "epoch": 0.11245263757190255, - "grad_norm": 0.8671875, - "learning_rate": 1.974865122183434e-05, - "loss": 1.4432, - "step": 358 - }, - { - "epoch": 0.1130808645974439, - "grad_norm": 0.984375, - "learning_rate": 1.974611234528721e-05, - "loss": 1.2292, - "step": 360 - }, - { - "epoch": 0.11370909162298526, - "grad_norm": 0.73828125, - "learning_rate": 1.9743573468740086e-05, - "loss": 1.3708, - "step": 362 - }, - { - "epoch": 0.11433731864852661, - "grad_norm": 0.73046875, - "learning_rate": 1.9741034592192957e-05, - "loss": 1.2918, - "step": 364 - }, - { - "epoch": 0.11496554567406797, - "grad_norm": 0.79296875, - "learning_rate": 1.973849571564583e-05, - "loss": 1.4335, - "step": 366 - }, - { - "epoch": 0.11559377269960933, - "grad_norm": 0.78515625, - "learning_rate": 1.97359568390987e-05, - "loss": 1.2187, - "step": 368 - }, - { - "epoch": 0.11622199972515068, - "grad_norm": 10.625, - "learning_rate": 1.9733417962551575e-05, - "loss": 1.2494, - "step": 370 - }, - { - "epoch": 0.11685022675069204, - "grad_norm": 0.75, - "learning_rate": 1.9730879086004443e-05, - "loss": 1.2348, - "step": 372 - }, - { - "epoch": 0.11747845377623338, - "grad_norm": 0.6953125, - "learning_rate": 1.9728340209457317e-05, - "loss": 1.3933, - "step": 374 - }, - { - "epoch": 0.11810668080177474, - "grad_norm": 0.8515625, - "learning_rate": 1.972580133291019e-05, - "loss": 1.445, - "step": 376 - }, - { - "epoch": 0.11873490782731609, - "grad_norm": 0.8515625, - "learning_rate": 1.972326245636306e-05, - "loss": 1.3521, - "step": 378 - }, - { - "epoch": 0.11936313485285745, - "grad_norm": 0.69921875, - "learning_rate": 1.972072357981593e-05, - "loss": 1.4533, - "step": 380 - }, - { - "epoch": 0.1199913618783988, - "grad_norm": 0.78515625, - "learning_rate": 1.9718184703268806e-05, - "loss": 1.2909, - "step": 382 - }, - { - "epoch": 0.12061958890394016, - "grad_norm": 0.8203125, - "learning_rate": 1.9715645826721678e-05, - "loss": 1.5502, - "step": 384 - }, - { - "epoch": 0.12124781592948151, - "grad_norm": 0.84765625, - "learning_rate": 1.971310695017455e-05, - "loss": 1.3525, - "step": 386 - }, - { - "epoch": 0.12187604295502287, - "grad_norm": 0.796875, - "learning_rate": 1.971056807362742e-05, - "loss": 1.5028, - "step": 388 - }, - { - "epoch": 0.12250426998056423, - "grad_norm": 6.21875, - "learning_rate": 1.9708029197080295e-05, - "loss": 1.3943, - "step": 390 - }, - { - "epoch": 0.12313249700610558, - "grad_norm": 0.765625, - "learning_rate": 1.9705490320533163e-05, - "loss": 1.5042, - "step": 392 - }, - { - "epoch": 0.12376072403164694, - "grad_norm": 0.79296875, - "learning_rate": 1.9702951443986038e-05, - "loss": 1.3527, - "step": 394 - }, - { - "epoch": 0.1243889510571883, - "grad_norm": 0.734375, - "learning_rate": 1.970041256743891e-05, - "loss": 1.5268, - "step": 396 - }, - { - "epoch": 0.12501717808272964, - "grad_norm": 0.77734375, - "learning_rate": 1.969787369089178e-05, - "loss": 1.2923, - "step": 398 - }, - { - "epoch": 0.125645405108271, - "grad_norm": 0.85546875, - "learning_rate": 1.9695334814344652e-05, - "loss": 1.4865, - "step": 400 - }, - { - "epoch": 0.12627363213381235, - "grad_norm": 0.828125, - "learning_rate": 1.9692795937797527e-05, - "loss": 1.2926, - "step": 402 - }, - { - "epoch": 0.1269018591593537, - "grad_norm": 0.796875, - "learning_rate": 1.96902570612504e-05, - "loss": 1.3763, - "step": 404 - }, - { - "epoch": 0.12753008618489506, - "grad_norm": 0.77734375, - "learning_rate": 1.968771818470327e-05, - "loss": 1.4208, - "step": 406 - }, - { - "epoch": 0.12815831321043641, - "grad_norm": 0.8515625, - "learning_rate": 1.968517930815614e-05, - "loss": 1.2802, - "step": 408 - }, - { - "epoch": 0.12878654023597777, - "grad_norm": 0.7578125, - "learning_rate": 1.9682640431609016e-05, - "loss": 1.3137, - "step": 410 - }, - { - "epoch": 0.12941476726151913, - "grad_norm": 0.734375, - "learning_rate": 1.9680101555061887e-05, - "loss": 1.2313, - "step": 412 - }, - { - "epoch": 0.13004299428706048, - "grad_norm": 0.73828125, - "learning_rate": 1.967756267851476e-05, - "loss": 1.3286, - "step": 414 - }, - { - "epoch": 0.13067122131260184, - "grad_norm": 0.86328125, - "learning_rate": 1.967502380196763e-05, - "loss": 1.3544, - "step": 416 - }, - { - "epoch": 0.1312994483381432, - "grad_norm": 0.796875, - "learning_rate": 1.96724849254205e-05, - "loss": 1.4726, - "step": 418 - }, - { - "epoch": 0.13192767536368455, - "grad_norm": 0.71875, - "learning_rate": 1.9669946048873376e-05, - "loss": 1.3215, - "step": 420 - }, - { - "epoch": 0.1325559023892259, - "grad_norm": 0.78515625, - "learning_rate": 1.9667407172326248e-05, - "loss": 1.5521, - "step": 422 - }, - { - "epoch": 0.13318412941476726, - "grad_norm": 0.796875, - "learning_rate": 1.966486829577912e-05, - "loss": 1.3127, - "step": 424 - }, - { - "epoch": 0.13381235644030862, - "grad_norm": 0.8359375, - "learning_rate": 1.966232941923199e-05, - "loss": 1.2696, - "step": 426 - }, - { - "epoch": 0.13444058346584997, - "grad_norm": 0.8046875, - "learning_rate": 1.9659790542684865e-05, - "loss": 1.2138, - "step": 428 - }, - { - "epoch": 0.13506881049139133, - "grad_norm": 0.7890625, - "learning_rate": 1.9657251666137736e-05, - "loss": 1.4204, - "step": 430 - }, - { - "epoch": 0.13569703751693268, - "grad_norm": 0.75390625, - "learning_rate": 1.9654712789590608e-05, - "loss": 1.2865, - "step": 432 - }, - { - "epoch": 0.13632526454247404, - "grad_norm": 0.73828125, - "learning_rate": 1.965217391304348e-05, - "loss": 1.2856, - "step": 434 - }, - { - "epoch": 0.1369534915680154, - "grad_norm": 0.73046875, - "learning_rate": 1.9649635036496354e-05, - "loss": 1.4284, - "step": 436 - }, - { - "epoch": 0.13758171859355675, - "grad_norm": 0.73828125, - "learning_rate": 1.9647096159949225e-05, - "loss": 1.3569, - "step": 438 - }, - { - "epoch": 0.1382099456190981, - "grad_norm": 0.671875, - "learning_rate": 1.9644557283402097e-05, - "loss": 1.3295, - "step": 440 - }, - { - "epoch": 0.13883817264463946, - "grad_norm": 0.7421875, - "learning_rate": 1.9642018406854968e-05, - "loss": 1.2948, - "step": 442 - }, - { - "epoch": 0.13946639967018082, - "grad_norm": 0.79296875, - "learning_rate": 1.963947953030784e-05, - "loss": 1.4097, - "step": 444 - }, - { - "epoch": 0.14009462669572217, - "grad_norm": 1.0703125, - "learning_rate": 1.963694065376071e-05, - "loss": 1.3674, - "step": 446 - }, - { - "epoch": 0.14072285372126353, - "grad_norm": 0.8671875, - "learning_rate": 1.9634401777213586e-05, - "loss": 1.4544, - "step": 448 - }, - { - "epoch": 0.14135108074680489, - "grad_norm": 0.72265625, - "learning_rate": 1.9631862900666457e-05, - "loss": 1.3385, - "step": 450 - }, - { - "epoch": 0.14197930777234624, - "grad_norm": 0.75390625, - "learning_rate": 1.962932402411933e-05, - "loss": 1.3962, - "step": 452 - }, - { - "epoch": 0.1426075347978876, - "grad_norm": 0.80078125, - "learning_rate": 1.96267851475722e-05, - "loss": 1.3257, - "step": 454 - }, - { - "epoch": 0.14323576182342895, - "grad_norm": 0.8359375, - "learning_rate": 1.9624246271025075e-05, - "loss": 1.3572, - "step": 456 - }, - { - "epoch": 0.1438639888489703, - "grad_norm": 0.73046875, - "learning_rate": 1.9621707394477946e-05, - "loss": 1.5115, - "step": 458 - }, - { - "epoch": 0.14449221587451166, - "grad_norm": 0.7578125, - "learning_rate": 1.9619168517930817e-05, - "loss": 1.3532, - "step": 460 - }, - { - "epoch": 0.14512044290005302, - "grad_norm": 0.8046875, - "learning_rate": 1.961662964138369e-05, - "loss": 1.3612, - "step": 462 - }, - { - "epoch": 0.14574866992559435, - "grad_norm": 0.73828125, - "learning_rate": 1.9614090764836564e-05, - "loss": 1.3881, - "step": 464 - }, - { - "epoch": 0.1463768969511357, - "grad_norm": 0.94140625, - "learning_rate": 1.961155188828943e-05, - "loss": 1.534, - "step": 466 - }, - { - "epoch": 0.14700512397667706, - "grad_norm": 0.71484375, - "learning_rate": 1.9609013011742306e-05, - "loss": 1.4607, - "step": 468 - }, - { - "epoch": 0.14763335100221842, - "grad_norm": 0.72265625, - "learning_rate": 1.9606474135195178e-05, - "loss": 1.3466, - "step": 470 - }, - { - "epoch": 0.14826157802775977, - "grad_norm": 0.7109375, - "learning_rate": 1.960393525864805e-05, - "loss": 1.3187, - "step": 472 - }, - { - "epoch": 0.14888980505330113, - "grad_norm": 0.77734375, - "learning_rate": 1.960139638210092e-05, - "loss": 1.36, - "step": 474 - }, - { - "epoch": 0.14951803207884248, - "grad_norm": 0.859375, - "learning_rate": 1.9598857505553795e-05, - "loss": 1.2185, - "step": 476 - }, - { - "epoch": 0.15014625910438384, - "grad_norm": 0.71484375, - "learning_rate": 1.9596318629006667e-05, - "loss": 1.4085, - "step": 478 - }, - { - "epoch": 0.1507744861299252, - "grad_norm": 0.98046875, - "learning_rate": 1.9593779752459538e-05, - "loss": 1.3917, - "step": 480 - }, - { - "epoch": 0.15140271315546655, - "grad_norm": 0.7421875, - "learning_rate": 1.959124087591241e-05, - "loss": 1.3497, - "step": 482 - }, - { - "epoch": 0.1520309401810079, - "grad_norm": 0.76171875, - "learning_rate": 1.9588701999365284e-05, - "loss": 1.3855, - "step": 484 - }, - { - "epoch": 0.15265916720654926, - "grad_norm": 0.81640625, - "learning_rate": 1.9586163122818152e-05, - "loss": 1.4071, - "step": 486 - }, - { - "epoch": 0.15328739423209062, - "grad_norm": 0.80859375, - "learning_rate": 1.9583624246271027e-05, - "loss": 1.2817, - "step": 488 - }, - { - "epoch": 0.15391562125763197, - "grad_norm": 0.75, - "learning_rate": 1.9581085369723898e-05, - "loss": 1.3758, - "step": 490 - }, - { - "epoch": 0.15454384828317333, - "grad_norm": 0.78125, - "learning_rate": 1.957854649317677e-05, - "loss": 1.4021, - "step": 492 - }, - { - "epoch": 0.15517207530871469, - "grad_norm": 0.75390625, - "learning_rate": 1.957600761662964e-05, - "loss": 1.4163, - "step": 494 - }, - { - "epoch": 0.15580030233425604, - "grad_norm": 0.7890625, - "learning_rate": 1.9573468740082516e-05, - "loss": 1.3127, - "step": 496 - }, - { - "epoch": 0.1564285293597974, - "grad_norm": 0.79296875, - "learning_rate": 1.9570929863535387e-05, - "loss": 1.2817, - "step": 498 - }, - { - "epoch": 0.15705675638533875, - "grad_norm": 0.76171875, - "learning_rate": 1.956839098698826e-05, - "loss": 1.3561, - "step": 500 - }, - { - "epoch": 0.1576849834108801, - "grad_norm": 0.76171875, - "learning_rate": 1.956585211044113e-05, - "loss": 1.317, - "step": 502 - }, - { - "epoch": 0.15831321043642146, - "grad_norm": 1.3671875, - "learning_rate": 1.9563313233894005e-05, - "loss": 1.4507, - "step": 504 - }, - { - "epoch": 0.15894143746196282, - "grad_norm": 1.0703125, - "learning_rate": 1.9560774357346876e-05, - "loss": 1.3715, - "step": 506 - }, - { - "epoch": 0.15956966448750418, - "grad_norm": 0.765625, - "learning_rate": 1.9558235480799747e-05, - "loss": 1.3502, - "step": 508 - }, - { - "epoch": 0.16019789151304553, - "grad_norm": 0.72265625, - "learning_rate": 1.9555696604252622e-05, - "loss": 1.4822, - "step": 510 - }, - { - "epoch": 0.1608261185385869, - "grad_norm": 0.75, - "learning_rate": 1.955315772770549e-05, - "loss": 1.2966, - "step": 512 - }, - { - "epoch": 0.16145434556412824, - "grad_norm": 0.80078125, - "learning_rate": 1.9550618851158365e-05, - "loss": 1.2823, - "step": 514 - }, - { - "epoch": 0.1620825725896696, - "grad_norm": 0.87109375, - "learning_rate": 1.9548079974611236e-05, - "loss": 1.3542, - "step": 516 - }, - { - "epoch": 0.16271079961521095, - "grad_norm": 0.72265625, - "learning_rate": 1.9545541098064108e-05, - "loss": 1.2341, - "step": 518 - }, - { - "epoch": 0.1633390266407523, - "grad_norm": 0.90625, - "learning_rate": 1.954300222151698e-05, - "loss": 1.4864, - "step": 520 - }, - { - "epoch": 0.16396725366629367, - "grad_norm": 0.80078125, - "learning_rate": 1.9540463344969854e-05, - "loss": 1.4035, - "step": 522 - }, - { - "epoch": 0.16459548069183502, - "grad_norm": 0.7734375, - "learning_rate": 1.9537924468422725e-05, - "loss": 1.3945, - "step": 524 - }, - { - "epoch": 0.16522370771737638, - "grad_norm": 0.94921875, - "learning_rate": 1.9535385591875597e-05, - "loss": 1.3526, - "step": 526 - }, - { - "epoch": 0.16585193474291773, - "grad_norm": 0.71484375, - "learning_rate": 1.9532846715328468e-05, - "loss": 1.4006, - "step": 528 - }, - { - "epoch": 0.1664801617684591, - "grad_norm": 0.703125, - "learning_rate": 1.9530307838781343e-05, - "loss": 1.3114, - "step": 530 - }, - { - "epoch": 0.16710838879400045, - "grad_norm": 0.75390625, - "learning_rate": 1.9527768962234214e-05, - "loss": 1.4793, - "step": 532 - }, - { - "epoch": 0.16773661581954177, - "grad_norm": 0.7734375, - "learning_rate": 1.9525230085687086e-05, - "loss": 1.3226, - "step": 534 - }, - { - "epoch": 0.16836484284508313, - "grad_norm": 0.90625, - "learning_rate": 1.9522691209139957e-05, - "loss": 1.3562, - "step": 536 - }, - { - "epoch": 0.16899306987062448, - "grad_norm": 0.91796875, - "learning_rate": 1.952015233259283e-05, - "loss": 1.3007, - "step": 538 - }, - { - "epoch": 0.16962129689616584, - "grad_norm": 0.69140625, - "learning_rate": 1.95176134560457e-05, - "loss": 1.4449, - "step": 540 - }, - { - "epoch": 0.1702495239217072, - "grad_norm": 0.74609375, - "learning_rate": 1.9515074579498575e-05, - "loss": 1.3835, - "step": 542 - }, - { - "epoch": 0.17087775094724855, - "grad_norm": 0.74609375, - "learning_rate": 1.9512535702951446e-05, - "loss": 1.4734, - "step": 544 - }, - { - "epoch": 0.1715059779727899, - "grad_norm": 0.7109375, - "learning_rate": 1.9509996826404317e-05, - "loss": 1.4759, - "step": 546 - }, - { - "epoch": 0.17213420499833126, - "grad_norm": 0.6953125, - "learning_rate": 1.950745794985719e-05, - "loss": 1.3622, - "step": 548 - }, - { - "epoch": 0.17276243202387262, - "grad_norm": 0.703125, - "learning_rate": 1.9504919073310063e-05, - "loss": 1.5, - "step": 550 - }, - { - "epoch": 0.17339065904941398, - "grad_norm": 0.765625, - "learning_rate": 1.9502380196762935e-05, - "loss": 1.4584, - "step": 552 - }, - { - "epoch": 0.17401888607495533, - "grad_norm": 0.78125, - "learning_rate": 1.9499841320215806e-05, - "loss": 1.1847, - "step": 554 - }, - { - "epoch": 0.1746471131004967, - "grad_norm": 0.8046875, - "learning_rate": 1.9497302443668678e-05, - "loss": 1.4887, - "step": 556 - }, - { - "epoch": 0.17527534012603804, - "grad_norm": 0.703125, - "learning_rate": 1.9494763567121552e-05, - "loss": 1.3411, - "step": 558 - }, - { - "epoch": 0.1759035671515794, - "grad_norm": 0.8125, - "learning_rate": 1.949222469057442e-05, - "loss": 1.4525, - "step": 560 - }, - { - "epoch": 0.17653179417712075, - "grad_norm": 0.78125, - "learning_rate": 1.9489685814027295e-05, - "loss": 1.2743, - "step": 562 - }, - { - "epoch": 0.1771600212026621, - "grad_norm": 0.77734375, - "learning_rate": 1.9487146937480167e-05, - "loss": 1.385, - "step": 564 - }, - { - "epoch": 0.17778824822820347, - "grad_norm": 0.89453125, - "learning_rate": 1.9484608060933038e-05, - "loss": 1.3988, - "step": 566 - }, - { - "epoch": 0.17841647525374482, - "grad_norm": 0.78125, - "learning_rate": 1.948206918438591e-05, - "loss": 1.2637, - "step": 568 - }, - { - "epoch": 0.17904470227928618, - "grad_norm": 0.73046875, - "learning_rate": 1.9479530307838784e-05, - "loss": 1.3832, - "step": 570 - }, - { - "epoch": 0.17967292930482753, - "grad_norm": 0.83984375, - "learning_rate": 1.9476991431291655e-05, - "loss": 1.367, - "step": 572 - }, - { - "epoch": 0.1803011563303689, - "grad_norm": 0.85546875, - "learning_rate": 1.9474452554744527e-05, - "loss": 1.3174, - "step": 574 - }, - { - "epoch": 0.18092938335591024, - "grad_norm": 0.6875, - "learning_rate": 1.9471913678197398e-05, - "loss": 1.2966, - "step": 576 - }, - { - "epoch": 0.1815576103814516, - "grad_norm": 0.796875, - "learning_rate": 1.9469374801650273e-05, - "loss": 1.4582, - "step": 578 - }, - { - "epoch": 0.18218583740699296, - "grad_norm": 0.69921875, - "learning_rate": 1.946683592510314e-05, - "loss": 1.3229, - "step": 580 - }, - { - "epoch": 0.1828140644325343, - "grad_norm": 0.734375, - "learning_rate": 1.9464297048556016e-05, - "loss": 1.2895, - "step": 582 - }, - { - "epoch": 0.18344229145807567, - "grad_norm": 0.73046875, - "learning_rate": 1.9461758172008887e-05, - "loss": 1.5382, - "step": 584 - }, - { - "epoch": 0.18407051848361702, - "grad_norm": 0.92578125, - "learning_rate": 1.945921929546176e-05, - "loss": 1.4349, - "step": 586 - }, - { - "epoch": 0.18469874550915838, - "grad_norm": 0.828125, - "learning_rate": 1.945668041891463e-05, - "loss": 1.3861, - "step": 588 - }, - { - "epoch": 0.18532697253469974, - "grad_norm": 0.76953125, - "learning_rate": 1.9454141542367505e-05, - "loss": 1.2897, - "step": 590 - }, - { - "epoch": 0.1859551995602411, - "grad_norm": 0.8671875, - "learning_rate": 1.9451602665820376e-05, - "loss": 1.3362, - "step": 592 - }, - { - "epoch": 0.18658342658578245, - "grad_norm": 0.8046875, - "learning_rate": 1.9449063789273247e-05, - "loss": 1.3954, - "step": 594 - }, - { - "epoch": 0.1872116536113238, - "grad_norm": 0.734375, - "learning_rate": 1.9446524912726122e-05, - "loss": 1.3541, - "step": 596 - }, - { - "epoch": 0.18783988063686516, - "grad_norm": 0.80859375, - "learning_rate": 1.9443986036178994e-05, - "loss": 1.4498, - "step": 598 - }, - { - "epoch": 0.18846810766240651, - "grad_norm": 0.83203125, - "learning_rate": 1.9441447159631865e-05, - "loss": 1.3767, - "step": 600 - }, - { - "epoch": 0.18909633468794787, - "grad_norm": 0.9453125, - "learning_rate": 1.9438908283084736e-05, - "loss": 1.245, - "step": 602 - }, - { - "epoch": 0.18972456171348923, - "grad_norm": 0.66015625, - "learning_rate": 1.943636940653761e-05, - "loss": 1.4371, - "step": 604 - }, - { - "epoch": 0.19035278873903055, - "grad_norm": 0.7890625, - "learning_rate": 1.943383052999048e-05, - "loss": 1.3194, - "step": 606 - }, - { - "epoch": 0.1909810157645719, - "grad_norm": 0.7421875, - "learning_rate": 1.9431291653443354e-05, - "loss": 1.3339, - "step": 608 - }, - { - "epoch": 0.19160924279011327, - "grad_norm": 0.7578125, - "learning_rate": 1.9428752776896225e-05, - "loss": 1.3773, - "step": 610 - }, - { - "epoch": 0.19223746981565462, - "grad_norm": 0.7265625, - "learning_rate": 1.9426213900349097e-05, - "loss": 1.3456, - "step": 612 - }, - { - "epoch": 0.19286569684119598, - "grad_norm": 1.015625, - "learning_rate": 1.9423675023801968e-05, - "loss": 1.3713, - "step": 614 - }, - { - "epoch": 0.19349392386673733, - "grad_norm": 0.73828125, - "learning_rate": 1.9421136147254843e-05, - "loss": 1.4541, - "step": 616 - }, - { - "epoch": 0.1941221508922787, - "grad_norm": 0.71484375, - "learning_rate": 1.9418597270707714e-05, - "loss": 1.4132, - "step": 618 - }, - { - "epoch": 0.19475037791782004, - "grad_norm": 0.90234375, - "learning_rate": 1.9416058394160586e-05, - "loss": 1.461, - "step": 620 - }, - { - "epoch": 0.1953786049433614, - "grad_norm": 0.765625, - "learning_rate": 1.9413519517613457e-05, - "loss": 1.2741, - "step": 622 - }, - { - "epoch": 0.19600683196890276, - "grad_norm": 0.75, - "learning_rate": 1.9410980641066332e-05, - "loss": 1.4783, - "step": 624 - }, - { - "epoch": 0.1966350589944441, - "grad_norm": 0.9296875, - "learning_rate": 1.94084417645192e-05, - "loss": 1.3676, - "step": 626 - }, - { - "epoch": 0.19726328601998547, - "grad_norm": 0.83984375, - "learning_rate": 1.9405902887972074e-05, - "loss": 1.2958, - "step": 628 - }, - { - "epoch": 0.19789151304552682, - "grad_norm": 0.8125, - "learning_rate": 1.9403364011424946e-05, - "loss": 1.3159, - "step": 630 - }, - { - "epoch": 0.19851974007106818, - "grad_norm": 0.7578125, - "learning_rate": 1.9400825134877817e-05, - "loss": 1.3835, - "step": 632 - }, - { - "epoch": 0.19914796709660953, - "grad_norm": 0.890625, - "learning_rate": 1.939828625833069e-05, - "loss": 1.2554, - "step": 634 - }, - { - "epoch": 0.1997761941221509, - "grad_norm": 0.671875, - "learning_rate": 1.9395747381783563e-05, - "loss": 1.3815, - "step": 636 - }, - { - "epoch": 0.20040442114769225, - "grad_norm": 0.78125, - "learning_rate": 1.9393208505236435e-05, - "loss": 1.4323, - "step": 638 - }, - { - "epoch": 0.2010326481732336, - "grad_norm": 0.7265625, - "learning_rate": 1.9390669628689306e-05, - "loss": 1.3292, - "step": 640 - }, - { - "epoch": 0.20166087519877496, - "grad_norm": 0.82421875, - "learning_rate": 1.9388130752142178e-05, - "loss": 1.2865, - "step": 642 - }, - { - "epoch": 0.20228910222431631, - "grad_norm": 1.015625, - "learning_rate": 1.9385591875595052e-05, - "loss": 1.3822, - "step": 644 - }, - { - "epoch": 0.20291732924985767, - "grad_norm": 0.75, - "learning_rate": 1.9383052999047924e-05, - "loss": 1.3657, - "step": 646 - }, - { - "epoch": 0.20354555627539903, - "grad_norm": 0.82421875, - "learning_rate": 1.9380514122500795e-05, - "loss": 1.3554, - "step": 648 - }, - { - "epoch": 0.20417378330094038, - "grad_norm": 0.75, - "learning_rate": 1.9377975245953666e-05, - "loss": 1.331, - "step": 650 - }, - { - "epoch": 0.20480201032648174, - "grad_norm": 0.96484375, - "learning_rate": 1.9375436369406538e-05, - "loss": 1.3798, - "step": 652 - }, - { - "epoch": 0.2054302373520231, - "grad_norm": 0.80078125, - "learning_rate": 1.937289749285941e-05, - "loss": 1.4126, - "step": 654 - }, - { - "epoch": 0.20605846437756445, - "grad_norm": 0.7265625, - "learning_rate": 1.9370358616312284e-05, - "loss": 1.5412, - "step": 656 - }, - { - "epoch": 0.2066866914031058, - "grad_norm": 0.6875, - "learning_rate": 1.9367819739765155e-05, - "loss": 1.4367, - "step": 658 - }, - { - "epoch": 0.20731491842864716, - "grad_norm": 0.8828125, - "learning_rate": 1.9365280863218027e-05, - "loss": 1.3944, - "step": 660 - }, - { - "epoch": 0.20794314545418852, - "grad_norm": 0.74609375, - "learning_rate": 1.9362741986670898e-05, - "loss": 1.4311, - "step": 662 - }, - { - "epoch": 0.20857137247972987, - "grad_norm": 0.73046875, - "learning_rate": 1.9360203110123773e-05, - "loss": 1.452, - "step": 664 - }, - { - "epoch": 0.20919959950527123, - "grad_norm": 0.73828125, - "learning_rate": 1.9357664233576644e-05, - "loss": 1.3529, - "step": 666 - }, - { - "epoch": 0.20982782653081258, - "grad_norm": 0.703125, - "learning_rate": 1.9355125357029516e-05, - "loss": 1.3444, - "step": 668 - }, - { - "epoch": 0.21045605355635394, - "grad_norm": 0.70703125, - "learning_rate": 1.9352586480482387e-05, - "loss": 1.352, - "step": 670 - }, - { - "epoch": 0.2110842805818953, - "grad_norm": 0.81640625, - "learning_rate": 1.9350047603935262e-05, - "loss": 1.455, - "step": 672 - }, - { - "epoch": 0.21171250760743665, - "grad_norm": 0.7578125, - "learning_rate": 1.934750872738813e-05, - "loss": 1.2581, - "step": 674 - }, - { - "epoch": 0.21234073463297798, - "grad_norm": 0.8515625, - "learning_rate": 1.9344969850841005e-05, - "loss": 1.3224, - "step": 676 - }, - { - "epoch": 0.21296896165851933, - "grad_norm": 0.6875, - "learning_rate": 1.9342430974293876e-05, - "loss": 1.4604, - "step": 678 - }, - { - "epoch": 0.2135971886840607, - "grad_norm": 0.75, - "learning_rate": 1.9339892097746747e-05, - "loss": 1.2345, - "step": 680 - }, - { - "epoch": 0.21422541570960205, - "grad_norm": 0.70703125, - "learning_rate": 1.9337353221199622e-05, - "loss": 1.4289, - "step": 682 - }, - { - "epoch": 0.2148536427351434, - "grad_norm": 0.875, - "learning_rate": 1.9334814344652494e-05, - "loss": 1.4216, - "step": 684 - }, - { - "epoch": 0.21548186976068476, - "grad_norm": 0.796875, - "learning_rate": 1.9332275468105365e-05, - "loss": 1.4541, - "step": 686 - }, - { - "epoch": 0.2161100967862261, - "grad_norm": 0.83203125, - "learning_rate": 1.9329736591558236e-05, - "loss": 1.3089, - "step": 688 - }, - { - "epoch": 0.21673832381176747, - "grad_norm": 0.8828125, - "learning_rate": 1.932719771501111e-05, - "loss": 1.3822, - "step": 690 - }, - { - "epoch": 0.21736655083730883, - "grad_norm": 0.78125, - "learning_rate": 1.9324658838463982e-05, - "loss": 1.2839, - "step": 692 - }, - { - "epoch": 0.21799477786285018, - "grad_norm": 0.7421875, - "learning_rate": 1.9322119961916854e-05, - "loss": 1.2813, - "step": 694 - }, - { - "epoch": 0.21862300488839154, - "grad_norm": 0.7265625, - "learning_rate": 1.9319581085369725e-05, - "loss": 1.3437, - "step": 696 - }, - { - "epoch": 0.2192512319139329, - "grad_norm": 0.69140625, - "learning_rate": 1.93170422088226e-05, - "loss": 1.3649, - "step": 698 - }, - { - "epoch": 0.21987945893947425, - "grad_norm": 0.70703125, - "learning_rate": 1.9314503332275468e-05, - "loss": 1.3949, - "step": 700 - }, - { - "epoch": 0.2205076859650156, - "grad_norm": 0.9921875, - "learning_rate": 1.9311964455728343e-05, - "loss": 1.3488, - "step": 702 - }, - { - "epoch": 0.22113591299055696, - "grad_norm": 0.8671875, - "learning_rate": 1.9309425579181214e-05, - "loss": 1.268, - "step": 704 - }, - { - "epoch": 0.22176414001609832, - "grad_norm": 0.875, - "learning_rate": 1.9306886702634085e-05, - "loss": 1.3855, - "step": 706 - }, - { - "epoch": 0.22239236704163967, - "grad_norm": 0.765625, - "learning_rate": 1.9304347826086957e-05, - "loss": 1.2877, - "step": 708 - }, - { - "epoch": 0.22302059406718103, - "grad_norm": 0.7734375, - "learning_rate": 1.930180894953983e-05, - "loss": 1.3324, - "step": 710 - }, - { - "epoch": 0.22364882109272238, - "grad_norm": 0.75390625, - "learning_rate": 1.9299270072992703e-05, - "loss": 1.3602, - "step": 712 - }, - { - "epoch": 0.22427704811826374, - "grad_norm": 0.76171875, - "learning_rate": 1.9296731196445574e-05, - "loss": 1.3006, - "step": 714 - }, - { - "epoch": 0.2249052751438051, - "grad_norm": 0.7578125, - "learning_rate": 1.9294192319898446e-05, - "loss": 1.3667, - "step": 716 - }, - { - "epoch": 0.22553350216934645, - "grad_norm": 0.70703125, - "learning_rate": 1.929165344335132e-05, - "loss": 1.3536, - "step": 718 - }, - { - "epoch": 0.2261617291948878, - "grad_norm": 0.73828125, - "learning_rate": 1.928911456680419e-05, - "loss": 1.4343, - "step": 720 - }, - { - "epoch": 0.22678995622042916, - "grad_norm": 0.76171875, - "learning_rate": 1.9286575690257063e-05, - "loss": 1.355, - "step": 722 - }, - { - "epoch": 0.22741818324597052, - "grad_norm": 0.88671875, - "learning_rate": 1.9284036813709935e-05, - "loss": 1.3999, - "step": 724 - }, - { - "epoch": 0.22804641027151187, - "grad_norm": 0.9140625, - "learning_rate": 1.9281497937162806e-05, - "loss": 1.3638, - "step": 726 - }, - { - "epoch": 0.22867463729705323, - "grad_norm": 0.7265625, - "learning_rate": 1.9278959060615677e-05, - "loss": 1.2724, - "step": 728 - }, - { - "epoch": 0.22930286432259459, - "grad_norm": 1.0, - "learning_rate": 1.9276420184068552e-05, - "loss": 1.3783, - "step": 730 - }, - { - "epoch": 0.22993109134813594, - "grad_norm": 0.7578125, - "learning_rate": 1.9273881307521424e-05, - "loss": 1.2429, - "step": 732 - }, - { - "epoch": 0.2305593183736773, - "grad_norm": 0.76171875, - "learning_rate": 1.9271342430974295e-05, - "loss": 1.4618, - "step": 734 - }, - { - "epoch": 0.23118754539921865, - "grad_norm": 0.70703125, - "learning_rate": 1.9268803554427166e-05, - "loss": 1.3145, - "step": 736 - }, - { - "epoch": 0.23181577242476, - "grad_norm": 0.74609375, - "learning_rate": 1.926626467788004e-05, - "loss": 1.3562, - "step": 738 - }, - { - "epoch": 0.23244399945030136, - "grad_norm": 0.7734375, - "learning_rate": 1.9263725801332913e-05, - "loss": 1.3047, - "step": 740 - }, - { - "epoch": 0.23307222647584272, - "grad_norm": 0.765625, - "learning_rate": 1.9261186924785784e-05, - "loss": 1.4534, - "step": 742 - }, - { - "epoch": 0.23370045350138408, - "grad_norm": 0.84375, - "learning_rate": 1.9258648048238655e-05, - "loss": 1.3435, - "step": 744 - }, - { - "epoch": 0.23432868052692543, - "grad_norm": 0.75, - "learning_rate": 1.9256109171691527e-05, - "loss": 1.3951, - "step": 746 - }, - { - "epoch": 0.23495690755246676, - "grad_norm": 0.68359375, - "learning_rate": 1.9253570295144398e-05, - "loss": 1.3799, - "step": 748 - }, - { - "epoch": 0.23558513457800812, - "grad_norm": 0.8046875, - "learning_rate": 1.9251031418597273e-05, - "loss": 1.5794, - "step": 750 - }, - { - "epoch": 0.23621336160354947, - "grad_norm": 0.73828125, - "learning_rate": 1.9248492542050144e-05, - "loss": 1.3543, - "step": 752 - }, - { - "epoch": 0.23684158862909083, - "grad_norm": 0.71484375, - "learning_rate": 1.9245953665503016e-05, - "loss": 1.2956, - "step": 754 - }, - { - "epoch": 0.23746981565463218, - "grad_norm": 0.8359375, - "learning_rate": 1.9243414788955887e-05, - "loss": 1.2537, - "step": 756 - }, - { - "epoch": 0.23809804268017354, - "grad_norm": 0.83203125, - "learning_rate": 1.9240875912408762e-05, - "loss": 1.3696, - "step": 758 - }, - { - "epoch": 0.2387262697057149, - "grad_norm": 0.8359375, - "learning_rate": 1.9238337035861633e-05, - "loss": 1.4097, - "step": 760 - }, - { - "epoch": 0.23935449673125625, - "grad_norm": 1.015625, - "learning_rate": 1.9235798159314505e-05, - "loss": 1.307, - "step": 762 - }, - { - "epoch": 0.2399827237567976, - "grad_norm": 0.91015625, - "learning_rate": 1.923325928276738e-05, - "loss": 1.2294, - "step": 764 - }, - { - "epoch": 0.24061095078233896, - "grad_norm": 0.796875, - "learning_rate": 1.923072040622025e-05, - "loss": 1.3091, - "step": 766 - }, - { - "epoch": 0.24123917780788032, - "grad_norm": 0.859375, - "learning_rate": 1.9228181529673122e-05, - "loss": 1.3432, - "step": 768 - }, - { - "epoch": 0.24186740483342167, - "grad_norm": 0.80078125, - "learning_rate": 1.9225642653125993e-05, - "loss": 1.3201, - "step": 770 - }, - { - "epoch": 0.24249563185896303, - "grad_norm": 0.80078125, - "learning_rate": 1.9223103776578865e-05, - "loss": 1.4521, - "step": 772 - }, - { - "epoch": 0.24312385888450438, - "grad_norm": 0.95703125, - "learning_rate": 1.9220564900031736e-05, - "loss": 1.4254, - "step": 774 - }, - { - "epoch": 0.24375208591004574, - "grad_norm": 0.8125, - "learning_rate": 1.921802602348461e-05, - "loss": 1.3347, - "step": 776 - }, - { - "epoch": 0.2443803129355871, - "grad_norm": 0.7890625, - "learning_rate": 1.9215487146937482e-05, - "loss": 1.2956, - "step": 778 - }, - { - "epoch": 0.24500853996112845, - "grad_norm": 0.7734375, - "learning_rate": 1.9212948270390354e-05, - "loss": 1.3128, - "step": 780 - }, - { - "epoch": 0.2456367669866698, - "grad_norm": 0.84375, - "learning_rate": 1.9210409393843225e-05, - "loss": 1.3065, - "step": 782 - }, - { - "epoch": 0.24626499401221116, - "grad_norm": 1.015625, - "learning_rate": 1.92078705172961e-05, - "loss": 1.2968, - "step": 784 - }, - { - "epoch": 0.24689322103775252, - "grad_norm": 0.87890625, - "learning_rate": 1.920533164074897e-05, - "loss": 1.3041, - "step": 786 - }, - { - "epoch": 0.24752144806329388, - "grad_norm": 0.8046875, - "learning_rate": 1.9202792764201843e-05, - "loss": 1.4266, - "step": 788 - }, - { - "epoch": 0.24814967508883523, - "grad_norm": 0.68359375, - "learning_rate": 1.9200253887654714e-05, - "loss": 1.4958, - "step": 790 - }, - { - "epoch": 0.2487779021143766, - "grad_norm": 0.9375, - "learning_rate": 1.919771501110759e-05, - "loss": 1.4217, - "step": 792 - }, - { - "epoch": 0.24940612913991794, - "grad_norm": 0.70703125, - "learning_rate": 1.9195176134560457e-05, - "loss": 1.3905, - "step": 794 - }, - { - "epoch": 0.25003435616545927, - "grad_norm": 0.8203125, - "learning_rate": 1.919263725801333e-05, - "loss": 1.3715, - "step": 796 - }, - { - "epoch": 0.25066258319100065, - "grad_norm": 0.9296875, - "learning_rate": 1.9190098381466203e-05, - "loss": 1.4086, - "step": 798 - }, - { - "epoch": 0.251290810216542, - "grad_norm": 0.84375, - "learning_rate": 1.9187559504919074e-05, - "loss": 1.4157, - "step": 800 - }, - { - "epoch": 0.25191903724208337, - "grad_norm": 0.7109375, - "learning_rate": 1.9185020628371946e-05, - "loss": 1.2557, - "step": 802 - }, - { - "epoch": 0.2525472642676247, - "grad_norm": 0.7734375, - "learning_rate": 1.918248175182482e-05, - "loss": 1.3713, - "step": 804 - }, - { - "epoch": 0.2531754912931661, - "grad_norm": 0.7265625, - "learning_rate": 1.9179942875277692e-05, - "loss": 1.3549, - "step": 806 - }, - { - "epoch": 0.2538037183187074, - "grad_norm": 0.7734375, - "learning_rate": 1.9177403998730563e-05, - "loss": 1.4184, - "step": 808 - }, - { - "epoch": 0.2544319453442488, - "grad_norm": 0.828125, - "learning_rate": 1.9174865122183435e-05, - "loss": 1.3112, - "step": 810 - }, - { - "epoch": 0.2550601723697901, - "grad_norm": 0.7421875, - "learning_rate": 1.917232624563631e-05, - "loss": 1.3818, - "step": 812 - }, - { - "epoch": 0.2556883993953315, - "grad_norm": 0.796875, - "learning_rate": 1.9169787369089177e-05, - "loss": 1.4245, - "step": 814 - }, - { - "epoch": 0.25631662642087283, - "grad_norm": 0.91015625, - "learning_rate": 1.9167248492542052e-05, - "loss": 1.3986, - "step": 816 - }, - { - "epoch": 0.2569448534464142, - "grad_norm": 0.7421875, - "learning_rate": 1.9164709615994924e-05, - "loss": 1.3054, - "step": 818 - }, - { - "epoch": 0.25757308047195554, - "grad_norm": 0.8046875, - "learning_rate": 1.9162170739447795e-05, - "loss": 1.3303, - "step": 820 - }, - { - "epoch": 0.2582013074974969, - "grad_norm": 0.77734375, - "learning_rate": 1.9159631862900666e-05, - "loss": 1.3877, - "step": 822 - }, - { - "epoch": 0.25882953452303825, - "grad_norm": 0.8359375, - "learning_rate": 1.915709298635354e-05, - "loss": 1.3464, - "step": 824 - }, - { - "epoch": 0.25945776154857964, - "grad_norm": 0.74609375, - "learning_rate": 1.9154554109806412e-05, - "loss": 1.4358, - "step": 826 - }, - { - "epoch": 0.26008598857412096, - "grad_norm": 0.73046875, - "learning_rate": 1.9152015233259284e-05, - "loss": 1.2982, - "step": 828 - }, - { - "epoch": 0.26071421559966235, - "grad_norm": 0.796875, - "learning_rate": 1.9149476356712155e-05, - "loss": 1.398, - "step": 830 - }, - { - "epoch": 0.2613424426252037, - "grad_norm": 0.69921875, - "learning_rate": 1.914693748016503e-05, - "loss": 1.2641, - "step": 832 - }, - { - "epoch": 0.26197066965074506, - "grad_norm": 0.88671875, - "learning_rate": 1.91443986036179e-05, - "loss": 1.3669, - "step": 834 - }, - { - "epoch": 0.2625988966762864, - "grad_norm": 0.796875, - "learning_rate": 1.9141859727070773e-05, - "loss": 1.3182, - "step": 836 - }, - { - "epoch": 0.26322712370182777, - "grad_norm": 0.734375, - "learning_rate": 1.9139320850523644e-05, - "loss": 1.3939, - "step": 838 - }, - { - "epoch": 0.2638553507273691, - "grad_norm": 0.66796875, - "learning_rate": 1.9136781973976516e-05, - "loss": 1.4948, - "step": 840 - }, - { - "epoch": 0.2644835777529105, - "grad_norm": 0.88671875, - "learning_rate": 1.9134243097429387e-05, - "loss": 1.34, - "step": 842 - }, - { - "epoch": 0.2651118047784518, - "grad_norm": 0.890625, - "learning_rate": 1.913170422088226e-05, - "loss": 1.3576, - "step": 844 - }, - { - "epoch": 0.2657400318039932, - "grad_norm": 0.71875, - "learning_rate": 1.9129165344335133e-05, - "loss": 1.3366, - "step": 846 - }, - { - "epoch": 0.2663682588295345, - "grad_norm": 0.8359375, - "learning_rate": 1.9126626467788004e-05, - "loss": 1.4665, - "step": 848 - }, - { - "epoch": 0.2669964858550759, - "grad_norm": 0.69140625, - "learning_rate": 1.912408759124088e-05, - "loss": 1.4036, - "step": 850 - }, - { - "epoch": 0.26762471288061723, - "grad_norm": 0.73046875, - "learning_rate": 1.912154871469375e-05, - "loss": 1.2714, - "step": 852 - }, - { - "epoch": 0.26825293990615856, - "grad_norm": 0.71875, - "learning_rate": 1.9119009838146622e-05, - "loss": 1.3858, - "step": 854 - }, - { - "epoch": 0.26888116693169994, - "grad_norm": 0.734375, - "learning_rate": 1.9116470961599493e-05, - "loss": 1.4882, - "step": 856 - }, - { - "epoch": 0.26950939395724127, - "grad_norm": 0.77734375, - "learning_rate": 1.9113932085052368e-05, - "loss": 1.2592, - "step": 858 - }, - { - "epoch": 0.27013762098278266, - "grad_norm": 0.75, - "learning_rate": 1.911139320850524e-05, - "loss": 1.4349, - "step": 860 - }, - { - "epoch": 0.270765848008324, - "grad_norm": 0.921875, - "learning_rate": 1.910885433195811e-05, - "loss": 1.2003, - "step": 862 - }, - { - "epoch": 0.27139407503386537, - "grad_norm": 0.703125, - "learning_rate": 1.9106315455410982e-05, - "loss": 1.4485, - "step": 864 - }, - { - "epoch": 0.2720223020594067, - "grad_norm": 0.78125, - "learning_rate": 1.9103776578863854e-05, - "loss": 1.2389, - "step": 866 - }, - { - "epoch": 0.2726505290849481, - "grad_norm": 0.75, - "learning_rate": 1.9101237702316725e-05, - "loss": 1.4348, - "step": 868 - }, - { - "epoch": 0.2732787561104894, - "grad_norm": 0.78125, - "learning_rate": 1.90986988257696e-05, - "loss": 1.4559, - "step": 870 - }, - { - "epoch": 0.2739069831360308, - "grad_norm": 0.796875, - "learning_rate": 1.909615994922247e-05, - "loss": 1.4004, - "step": 872 - }, - { - "epoch": 0.2745352101615721, - "grad_norm": 0.8046875, - "learning_rate": 1.9093621072675343e-05, - "loss": 1.3105, - "step": 874 - }, - { - "epoch": 0.2751634371871135, - "grad_norm": 0.78125, - "learning_rate": 1.9091082196128214e-05, - "loss": 1.2796, - "step": 876 - }, - { - "epoch": 0.27579166421265483, - "grad_norm": 0.74609375, - "learning_rate": 1.908854331958109e-05, - "loss": 1.4628, - "step": 878 - }, - { - "epoch": 0.2764198912381962, - "grad_norm": 0.71484375, - "learning_rate": 1.908600444303396e-05, - "loss": 1.3618, - "step": 880 - }, - { - "epoch": 0.27704811826373754, - "grad_norm": 0.73828125, - "learning_rate": 1.908346556648683e-05, - "loss": 1.3635, - "step": 882 - }, - { - "epoch": 0.2776763452892789, - "grad_norm": 0.69921875, - "learning_rate": 1.9080926689939703e-05, - "loss": 1.3921, - "step": 884 - }, - { - "epoch": 0.27830457231482025, - "grad_norm": 0.70703125, - "learning_rate": 1.9078387813392578e-05, - "loss": 1.3431, - "step": 886 - }, - { - "epoch": 0.27893279934036164, - "grad_norm": 0.796875, - "learning_rate": 1.9075848936845446e-05, - "loss": 1.3725, - "step": 888 - }, - { - "epoch": 0.27956102636590296, - "grad_norm": 0.6640625, - "learning_rate": 1.907331006029832e-05, - "loss": 1.2754, - "step": 890 - }, - { - "epoch": 0.28018925339144435, - "grad_norm": 0.99609375, - "learning_rate": 1.9070771183751192e-05, - "loss": 1.1762, - "step": 892 - }, - { - "epoch": 0.2808174804169857, - "grad_norm": 0.80859375, - "learning_rate": 1.9068232307204063e-05, - "loss": 1.301, - "step": 894 - }, - { - "epoch": 0.28144570744252706, - "grad_norm": 0.68359375, - "learning_rate": 1.9065693430656935e-05, - "loss": 1.2999, - "step": 896 - }, - { - "epoch": 0.2820739344680684, - "grad_norm": 0.76171875, - "learning_rate": 1.906315455410981e-05, - "loss": 1.355, - "step": 898 - }, - { - "epoch": 0.28270216149360977, - "grad_norm": 0.71875, - "learning_rate": 1.906061567756268e-05, - "loss": 1.4332, - "step": 900 - }, - { - "epoch": 0.2833303885191511, - "grad_norm": 0.96875, - "learning_rate": 1.9058076801015552e-05, - "loss": 1.4116, - "step": 902 - }, - { - "epoch": 0.2839586155446925, - "grad_norm": 0.8203125, - "learning_rate": 1.9055537924468423e-05, - "loss": 1.3064, - "step": 904 - }, - { - "epoch": 0.2845868425702338, - "grad_norm": 0.81640625, - "learning_rate": 1.9052999047921298e-05, - "loss": 1.5111, - "step": 906 - }, - { - "epoch": 0.2852150695957752, - "grad_norm": 0.8125, - "learning_rate": 1.9050460171374166e-05, - "loss": 1.2457, - "step": 908 - }, - { - "epoch": 0.2858432966213165, - "grad_norm": 0.78125, - "learning_rate": 1.904792129482704e-05, - "loss": 1.346, - "step": 910 - }, - { - "epoch": 0.2864715236468579, - "grad_norm": 0.75390625, - "learning_rate": 1.9045382418279912e-05, - "loss": 1.3722, - "step": 912 - }, - { - "epoch": 0.28709975067239923, - "grad_norm": 0.8203125, - "learning_rate": 1.9042843541732784e-05, - "loss": 1.3245, - "step": 914 - }, - { - "epoch": 0.2877279776979406, - "grad_norm": 0.87109375, - "learning_rate": 1.9040304665185655e-05, - "loss": 1.42, - "step": 916 - }, - { - "epoch": 0.28835620472348195, - "grad_norm": 0.83984375, - "learning_rate": 1.903776578863853e-05, - "loss": 1.405, - "step": 918 - }, - { - "epoch": 0.28898443174902333, - "grad_norm": 0.703125, - "learning_rate": 1.90352269120914e-05, - "loss": 1.3066, - "step": 920 - }, - { - "epoch": 0.28961265877456466, - "grad_norm": 0.8046875, - "learning_rate": 1.9032688035544273e-05, - "loss": 1.3226, - "step": 922 - }, - { - "epoch": 0.29024088580010604, - "grad_norm": 0.875, - "learning_rate": 1.9030149158997144e-05, - "loss": 1.1937, - "step": 924 - }, - { - "epoch": 0.29086911282564737, - "grad_norm": 0.78125, - "learning_rate": 1.902761028245002e-05, - "loss": 1.3474, - "step": 926 - }, - { - "epoch": 0.2914973398511887, - "grad_norm": 0.76171875, - "learning_rate": 1.9025071405902887e-05, - "loss": 1.3306, - "step": 928 - }, - { - "epoch": 0.2921255668767301, - "grad_norm": 0.89453125, - "learning_rate": 1.902253252935576e-05, - "loss": 1.3498, - "step": 930 - }, - { - "epoch": 0.2927537939022714, - "grad_norm": 0.92578125, - "learning_rate": 1.9019993652808633e-05, - "loss": 1.4435, - "step": 932 - }, - { - "epoch": 0.2933820209278128, - "grad_norm": 0.75, - "learning_rate": 1.9017454776261504e-05, - "loss": 1.3682, - "step": 934 - }, - { - "epoch": 0.2940102479533541, - "grad_norm": 0.8203125, - "learning_rate": 1.901491589971438e-05, - "loss": 1.3391, - "step": 936 - }, - { - "epoch": 0.2946384749788955, - "grad_norm": 0.7109375, - "learning_rate": 1.901237702316725e-05, - "loss": 1.5098, - "step": 938 - }, - { - "epoch": 0.29526670200443683, - "grad_norm": 0.7109375, - "learning_rate": 1.9009838146620122e-05, - "loss": 1.4432, - "step": 940 - }, - { - "epoch": 0.2958949290299782, - "grad_norm": 0.703125, - "learning_rate": 1.9007299270072993e-05, - "loss": 1.3789, - "step": 942 - }, - { - "epoch": 0.29652315605551954, - "grad_norm": 0.703125, - "learning_rate": 1.9004760393525868e-05, - "loss": 1.2943, - "step": 944 - }, - { - "epoch": 0.2971513830810609, - "grad_norm": 0.8671875, - "learning_rate": 1.900222151697874e-05, - "loss": 1.3753, - "step": 946 - }, - { - "epoch": 0.29777961010660225, - "grad_norm": 0.70703125, - "learning_rate": 1.899968264043161e-05, - "loss": 1.3704, - "step": 948 - }, - { - "epoch": 0.29840783713214364, - "grad_norm": 0.78515625, - "learning_rate": 1.8997143763884482e-05, - "loss": 1.4176, - "step": 950 - }, - { - "epoch": 0.29903606415768497, - "grad_norm": 0.7890625, - "learning_rate": 1.8994604887337357e-05, - "loss": 1.2448, - "step": 952 - }, - { - "epoch": 0.29966429118322635, - "grad_norm": 0.7421875, - "learning_rate": 1.8992066010790225e-05, - "loss": 1.2357, - "step": 954 - }, - { - "epoch": 0.3002925182087677, - "grad_norm": 0.72265625, - "learning_rate": 1.89895271342431e-05, - "loss": 1.4002, - "step": 956 - }, - { - "epoch": 0.30092074523430906, - "grad_norm": 0.796875, - "learning_rate": 1.898698825769597e-05, - "loss": 1.3756, - "step": 958 - }, - { - "epoch": 0.3015489722598504, - "grad_norm": 0.75390625, - "learning_rate": 1.8984449381148842e-05, - "loss": 1.2851, - "step": 960 - }, - { - "epoch": 0.3021771992853918, - "grad_norm": 0.79296875, - "learning_rate": 1.8981910504601714e-05, - "loss": 1.3339, - "step": 962 - }, - { - "epoch": 0.3028054263109331, - "grad_norm": 0.6953125, - "learning_rate": 1.897937162805459e-05, - "loss": 1.4284, - "step": 964 - }, - { - "epoch": 0.3034336533364745, - "grad_norm": 0.83203125, - "learning_rate": 1.897683275150746e-05, - "loss": 1.3142, - "step": 966 - }, - { - "epoch": 0.3040618803620158, - "grad_norm": 0.76953125, - "learning_rate": 1.897429387496033e-05, - "loss": 1.4217, - "step": 968 - }, - { - "epoch": 0.3046901073875572, - "grad_norm": 0.7890625, - "learning_rate": 1.8971754998413203e-05, - "loss": 1.4308, - "step": 970 - }, - { - "epoch": 0.3053183344130985, - "grad_norm": 0.75390625, - "learning_rate": 1.8969216121866078e-05, - "loss": 1.2463, - "step": 972 - }, - { - "epoch": 0.3059465614386399, - "grad_norm": 0.72265625, - "learning_rate": 1.896667724531895e-05, - "loss": 1.3149, - "step": 974 - }, - { - "epoch": 0.30657478846418124, - "grad_norm": 0.91796875, - "learning_rate": 1.896413836877182e-05, - "loss": 1.3623, - "step": 976 - }, - { - "epoch": 0.3072030154897226, - "grad_norm": 0.69921875, - "learning_rate": 1.8961599492224692e-05, - "loss": 1.5585, - "step": 978 - }, - { - "epoch": 0.30783124251526395, - "grad_norm": 0.71875, - "learning_rate": 1.8959060615677563e-05, - "loss": 1.2155, - "step": 980 - }, - { - "epoch": 0.30845946954080533, - "grad_norm": 0.703125, - "learning_rate": 1.8956521739130434e-05, - "loss": 1.384, - "step": 982 - }, - { - "epoch": 0.30908769656634666, - "grad_norm": 0.890625, - "learning_rate": 1.895398286258331e-05, - "loss": 1.2724, - "step": 984 - }, - { - "epoch": 0.30971592359188804, - "grad_norm": 0.75, - "learning_rate": 1.895144398603618e-05, - "loss": 1.3504, - "step": 986 - }, - { - "epoch": 0.31034415061742937, - "grad_norm": 0.7578125, - "learning_rate": 1.8948905109489052e-05, - "loss": 1.3144, - "step": 988 - }, - { - "epoch": 0.31097237764297075, - "grad_norm": 0.71484375, - "learning_rate": 1.8946366232941923e-05, - "loss": 1.3399, - "step": 990 - }, - { - "epoch": 0.3116006046685121, - "grad_norm": 0.796875, - "learning_rate": 1.8943827356394798e-05, - "loss": 1.3355, - "step": 992 - }, - { - "epoch": 0.31222883169405347, - "grad_norm": 0.78125, - "learning_rate": 1.894128847984767e-05, - "loss": 1.2823, - "step": 994 - }, - { - "epoch": 0.3128570587195948, - "grad_norm": 0.78125, - "learning_rate": 1.893874960330054e-05, - "loss": 1.4969, - "step": 996 - }, - { - "epoch": 0.3134852857451361, - "grad_norm": 0.7890625, - "learning_rate": 1.8936210726753412e-05, - "loss": 1.3046, - "step": 998 - }, - { - "epoch": 0.3141135127706775, - "grad_norm": 0.7109375, - "learning_rate": 1.8933671850206287e-05, - "loss": 1.4317, - "step": 1000 - }, - { - "epoch": 0.31474173979621883, - "grad_norm": 0.71875, - "learning_rate": 1.8931132973659155e-05, - "loss": 1.3786, - "step": 1002 - }, - { - "epoch": 0.3153699668217602, - "grad_norm": 0.73046875, - "learning_rate": 1.892859409711203e-05, - "loss": 1.3259, - "step": 1004 - }, - { - "epoch": 0.31599819384730155, - "grad_norm": 0.72265625, - "learning_rate": 1.89260552205649e-05, - "loss": 1.3619, - "step": 1006 - }, - { - "epoch": 0.31662642087284293, - "grad_norm": 0.7578125, - "learning_rate": 1.8923516344017773e-05, - "loss": 1.4299, - "step": 1008 - }, - { - "epoch": 0.31725464789838426, - "grad_norm": 0.78515625, - "learning_rate": 1.8920977467470644e-05, - "loss": 1.389, - "step": 1010 - }, - { - "epoch": 0.31788287492392564, - "grad_norm": 0.8046875, - "learning_rate": 1.891843859092352e-05, - "loss": 1.3459, - "step": 1012 - }, - { - "epoch": 0.31851110194946697, - "grad_norm": 0.765625, - "learning_rate": 1.891589971437639e-05, - "loss": 1.4309, - "step": 1014 - }, - { - "epoch": 0.31913932897500835, - "grad_norm": 0.76953125, - "learning_rate": 1.891336083782926e-05, - "loss": 1.3712, - "step": 1016 - }, - { - "epoch": 0.3197675560005497, - "grad_norm": 0.80859375, - "learning_rate": 1.8910821961282133e-05, - "loss": 1.3044, - "step": 1018 - }, - { - "epoch": 0.32039578302609106, - "grad_norm": 0.6796875, - "learning_rate": 1.8908283084735008e-05, - "loss": 1.3589, - "step": 1020 - }, - { - "epoch": 0.3210240100516324, - "grad_norm": 0.69140625, - "learning_rate": 1.890574420818788e-05, - "loss": 1.2593, - "step": 1022 - }, - { - "epoch": 0.3216522370771738, - "grad_norm": 0.87109375, - "learning_rate": 1.890320533164075e-05, - "loss": 1.3657, - "step": 1024 - }, - { - "epoch": 0.3222804641027151, - "grad_norm": 0.6796875, - "learning_rate": 1.8900666455093625e-05, - "loss": 1.2129, - "step": 1026 - }, - { - "epoch": 0.3229086911282565, - "grad_norm": 0.71875, - "learning_rate": 1.8898127578546493e-05, - "loss": 1.09, - "step": 1028 - }, - { - "epoch": 0.3235369181537978, - "grad_norm": 0.8671875, - "learning_rate": 1.8895588701999368e-05, - "loss": 1.3569, - "step": 1030 - }, - { - "epoch": 0.3241651451793392, - "grad_norm": 0.78515625, - "learning_rate": 1.889304982545224e-05, - "loss": 1.4419, - "step": 1032 - }, - { - "epoch": 0.3247933722048805, - "grad_norm": 0.7578125, - "learning_rate": 1.889051094890511e-05, - "loss": 1.3802, - "step": 1034 - }, - { - "epoch": 0.3254215992304219, - "grad_norm": 0.75390625, - "learning_rate": 1.8887972072357982e-05, - "loss": 1.312, - "step": 1036 - }, - { - "epoch": 0.32604982625596324, - "grad_norm": 0.74609375, - "learning_rate": 1.8885433195810857e-05, - "loss": 1.4378, - "step": 1038 - }, - { - "epoch": 0.3266780532815046, - "grad_norm": 0.83203125, - "learning_rate": 1.8882894319263728e-05, - "loss": 1.2541, - "step": 1040 - }, - { - "epoch": 0.32730628030704595, - "grad_norm": 0.7421875, - "learning_rate": 1.88803554427166e-05, - "loss": 1.3656, - "step": 1042 - }, - { - "epoch": 0.32793450733258733, - "grad_norm": 0.7578125, - "learning_rate": 1.887781656616947e-05, - "loss": 1.4039, - "step": 1044 - }, - { - "epoch": 0.32856273435812866, - "grad_norm": 0.72265625, - "learning_rate": 1.8875277689622346e-05, - "loss": 1.3563, - "step": 1046 - }, - { - "epoch": 0.32919096138367004, - "grad_norm": 0.88671875, - "learning_rate": 1.8872738813075214e-05, - "loss": 1.285, - "step": 1048 - }, - { - "epoch": 0.3298191884092114, - "grad_norm": 0.84375, - "learning_rate": 1.887019993652809e-05, - "loss": 1.2465, - "step": 1050 - }, - { - "epoch": 0.33044741543475276, - "grad_norm": 0.92578125, - "learning_rate": 1.886766105998096e-05, - "loss": 1.2184, - "step": 1052 - }, - { - "epoch": 0.3310756424602941, - "grad_norm": 0.69921875, - "learning_rate": 1.886512218343383e-05, - "loss": 1.3098, - "step": 1054 - }, - { - "epoch": 0.33170386948583547, - "grad_norm": 0.76171875, - "learning_rate": 1.8862583306886703e-05, - "loss": 1.318, - "step": 1056 - }, - { - "epoch": 0.3323320965113768, - "grad_norm": 0.91015625, - "learning_rate": 1.8860044430339577e-05, - "loss": 1.2984, - "step": 1058 - }, - { - "epoch": 0.3329603235369182, - "grad_norm": 0.78515625, - "learning_rate": 1.885750555379245e-05, - "loss": 1.4075, - "step": 1060 - }, - { - "epoch": 0.3335885505624595, - "grad_norm": 0.94140625, - "learning_rate": 1.885496667724532e-05, - "loss": 1.354, - "step": 1062 - }, - { - "epoch": 0.3342167775880009, - "grad_norm": 0.74609375, - "learning_rate": 1.885242780069819e-05, - "loss": 1.2434, - "step": 1064 - }, - { - "epoch": 0.3348450046135422, - "grad_norm": 0.8359375, - "learning_rate": 1.8849888924151066e-05, - "loss": 1.4308, - "step": 1066 - }, - { - "epoch": 0.33547323163908355, - "grad_norm": 0.8984375, - "learning_rate": 1.8847350047603938e-05, - "loss": 1.2561, - "step": 1068 - }, - { - "epoch": 0.33610145866462493, - "grad_norm": 0.875, - "learning_rate": 1.884481117105681e-05, - "loss": 1.4753, - "step": 1070 - }, - { - "epoch": 0.33672968569016626, - "grad_norm": 0.69921875, - "learning_rate": 1.884227229450968e-05, - "loss": 1.369, - "step": 1072 - }, - { - "epoch": 0.33735791271570764, - "grad_norm": 0.76171875, - "learning_rate": 1.8839733417962552e-05, - "loss": 1.4776, - "step": 1074 - }, - { - "epoch": 0.33798613974124897, - "grad_norm": 0.73046875, - "learning_rate": 1.8837194541415423e-05, - "loss": 1.3619, - "step": 1076 - }, - { - "epoch": 0.33861436676679035, - "grad_norm": 0.77734375, - "learning_rate": 1.8834655664868298e-05, - "loss": 1.2684, - "step": 1078 - }, - { - "epoch": 0.3392425937923317, - "grad_norm": 0.7421875, - "learning_rate": 1.883211678832117e-05, - "loss": 1.4172, - "step": 1080 - }, - { - "epoch": 0.33987082081787306, - "grad_norm": 0.890625, - "learning_rate": 1.882957791177404e-05, - "loss": 1.501, - "step": 1082 - }, - { - "epoch": 0.3404990478434144, - "grad_norm": 0.82421875, - "learning_rate": 1.8827039035226912e-05, - "loss": 1.4823, - "step": 1084 - }, - { - "epoch": 0.3411272748689558, - "grad_norm": 0.8828125, - "learning_rate": 1.8824500158679787e-05, - "loss": 1.2784, - "step": 1086 - }, - { - "epoch": 0.3417555018944971, - "grad_norm": 0.76171875, - "learning_rate": 1.882196128213266e-05, - "loss": 1.359, - "step": 1088 - }, - { - "epoch": 0.3423837289200385, - "grad_norm": 0.79296875, - "learning_rate": 1.881942240558553e-05, - "loss": 1.2725, - "step": 1090 - }, - { - "epoch": 0.3430119559455798, - "grad_norm": 0.76171875, - "learning_rate": 1.88168835290384e-05, - "loss": 1.2185, - "step": 1092 - }, - { - "epoch": 0.3436401829711212, - "grad_norm": 0.703125, - "learning_rate": 1.8814344652491276e-05, - "loss": 1.3709, - "step": 1094 - }, - { - "epoch": 0.3442684099966625, - "grad_norm": 0.79296875, - "learning_rate": 1.8811805775944144e-05, - "loss": 1.4139, - "step": 1096 - }, - { - "epoch": 0.3448966370222039, - "grad_norm": 0.69921875, - "learning_rate": 1.880926689939702e-05, - "loss": 1.5253, - "step": 1098 - }, - { - "epoch": 0.34552486404774524, - "grad_norm": 0.72265625, - "learning_rate": 1.880672802284989e-05, - "loss": 1.2929, - "step": 1100 - }, - { - "epoch": 0.3461530910732866, - "grad_norm": 0.90625, - "learning_rate": 1.880418914630276e-05, - "loss": 1.3314, - "step": 1102 - }, - { - "epoch": 0.34678131809882795, - "grad_norm": 0.70703125, - "learning_rate": 1.8801650269755633e-05, - "loss": 1.1409, - "step": 1104 - }, - { - "epoch": 0.34740954512436933, - "grad_norm": 0.765625, - "learning_rate": 1.8799111393208508e-05, - "loss": 1.4453, - "step": 1106 - }, - { - "epoch": 0.34803777214991066, - "grad_norm": 0.671875, - "learning_rate": 1.879657251666138e-05, - "loss": 1.3495, - "step": 1108 - }, - { - "epoch": 0.34866599917545205, - "grad_norm": 0.77734375, - "learning_rate": 1.879403364011425e-05, - "loss": 1.3406, - "step": 1110 - }, - { - "epoch": 0.3492942262009934, - "grad_norm": 0.85546875, - "learning_rate": 1.8791494763567125e-05, - "loss": 1.2358, - "step": 1112 - }, - { - "epoch": 0.34992245322653476, - "grad_norm": 0.83984375, - "learning_rate": 1.8788955887019997e-05, - "loss": 1.3972, - "step": 1114 - }, - { - "epoch": 0.3505506802520761, - "grad_norm": 0.72265625, - "learning_rate": 1.8786417010472868e-05, - "loss": 1.3597, - "step": 1116 - }, - { - "epoch": 0.35117890727761747, - "grad_norm": 0.66015625, - "learning_rate": 1.878387813392574e-05, - "loss": 1.3003, - "step": 1118 - }, - { - "epoch": 0.3518071343031588, - "grad_norm": 0.86328125, - "learning_rate": 1.8781339257378614e-05, - "loss": 1.2663, - "step": 1120 - }, - { - "epoch": 0.3524353613287002, - "grad_norm": 0.73828125, - "learning_rate": 1.8778800380831482e-05, - "loss": 1.4089, - "step": 1122 - }, - { - "epoch": 0.3530635883542415, - "grad_norm": 0.828125, - "learning_rate": 1.8776261504284357e-05, - "loss": 1.3793, - "step": 1124 - }, - { - "epoch": 0.3536918153797829, - "grad_norm": 0.796875, - "learning_rate": 1.8773722627737228e-05, - "loss": 1.4041, - "step": 1126 - }, - { - "epoch": 0.3543200424053242, - "grad_norm": 0.8046875, - "learning_rate": 1.87711837511901e-05, - "loss": 1.252, - "step": 1128 - }, - { - "epoch": 0.3549482694308656, - "grad_norm": 0.76953125, - "learning_rate": 1.876864487464297e-05, - "loss": 1.3771, - "step": 1130 - }, - { - "epoch": 0.35557649645640693, - "grad_norm": 0.86328125, - "learning_rate": 1.8766105998095846e-05, - "loss": 1.2952, - "step": 1132 - }, - { - "epoch": 0.3562047234819483, - "grad_norm": 0.7734375, - "learning_rate": 1.8763567121548717e-05, - "loss": 1.2377, - "step": 1134 - }, - { - "epoch": 0.35683295050748964, - "grad_norm": 0.78125, - "learning_rate": 1.876102824500159e-05, - "loss": 1.429, - "step": 1136 - }, - { - "epoch": 0.35746117753303097, - "grad_norm": 0.7734375, - "learning_rate": 1.875848936845446e-05, - "loss": 1.3617, - "step": 1138 - }, - { - "epoch": 0.35808940455857236, - "grad_norm": 0.7109375, - "learning_rate": 1.8755950491907335e-05, - "loss": 1.4136, - "step": 1140 - }, - { - "epoch": 0.3587176315841137, - "grad_norm": 0.80859375, - "learning_rate": 1.8753411615360203e-05, - "loss": 1.2859, - "step": 1142 - }, - { - "epoch": 0.35934585860965507, - "grad_norm": 0.6796875, - "learning_rate": 1.8750872738813077e-05, - "loss": 1.2145, - "step": 1144 - }, - { - "epoch": 0.3599740856351964, - "grad_norm": 0.70703125, - "learning_rate": 1.874833386226595e-05, - "loss": 1.294, - "step": 1146 - }, - { - "epoch": 0.3606023126607378, - "grad_norm": 0.8203125, - "learning_rate": 1.874579498571882e-05, - "loss": 1.1749, - "step": 1148 - }, - { - "epoch": 0.3612305396862791, - "grad_norm": 0.75, - "learning_rate": 1.874325610917169e-05, - "loss": 1.2759, - "step": 1150 - }, - { - "epoch": 0.3618587667118205, - "grad_norm": 0.76953125, - "learning_rate": 1.8740717232624566e-05, - "loss": 1.2798, - "step": 1152 - }, - { - "epoch": 0.3624869937373618, - "grad_norm": 0.83203125, - "learning_rate": 1.8738178356077438e-05, - "loss": 1.3493, - "step": 1154 - }, - { - "epoch": 0.3631152207629032, - "grad_norm": 0.76953125, - "learning_rate": 1.873563947953031e-05, - "loss": 1.4311, - "step": 1156 - }, - { - "epoch": 0.36374344778844453, - "grad_norm": 0.765625, - "learning_rate": 1.873310060298318e-05, - "loss": 1.2613, - "step": 1158 - }, - { - "epoch": 0.3643716748139859, - "grad_norm": 1.0078125, - "learning_rate": 1.8730561726436055e-05, - "loss": 1.3474, - "step": 1160 - }, - { - "epoch": 0.36499990183952724, - "grad_norm": 0.7109375, - "learning_rate": 1.8728022849888923e-05, - "loss": 1.4257, - "step": 1162 - }, - { - "epoch": 0.3656281288650686, - "grad_norm": 0.78125, - "learning_rate": 1.8725483973341798e-05, - "loss": 1.3411, - "step": 1164 - }, - { - "epoch": 0.36625635589060995, - "grad_norm": 0.734375, - "learning_rate": 1.872294509679467e-05, - "loss": 1.3309, - "step": 1166 - }, - { - "epoch": 0.36688458291615134, - "grad_norm": 0.8984375, - "learning_rate": 1.872040622024754e-05, - "loss": 1.4467, - "step": 1168 - }, - { - "epoch": 0.36751280994169266, - "grad_norm": 0.8515625, - "learning_rate": 1.8717867343700412e-05, - "loss": 1.2754, - "step": 1170 - }, - { - "epoch": 0.36814103696723405, - "grad_norm": 0.7890625, - "learning_rate": 1.8715328467153287e-05, - "loss": 1.4556, - "step": 1172 - }, - { - "epoch": 0.3687692639927754, - "grad_norm": 0.84375, - "learning_rate": 1.871278959060616e-05, - "loss": 1.3598, - "step": 1174 - }, - { - "epoch": 0.36939749101831676, - "grad_norm": 0.6875, - "learning_rate": 1.871025071405903e-05, - "loss": 1.2428, - "step": 1176 - }, - { - "epoch": 0.3700257180438581, - "grad_norm": 0.8046875, - "learning_rate": 1.87077118375119e-05, - "loss": 1.3761, - "step": 1178 - }, - { - "epoch": 0.37065394506939947, - "grad_norm": 0.78515625, - "learning_rate": 1.8705172960964776e-05, - "loss": 1.3929, - "step": 1180 - }, - { - "epoch": 0.3712821720949408, - "grad_norm": 0.8671875, - "learning_rate": 1.8702634084417647e-05, - "loss": 1.2633, - "step": 1182 - }, - { - "epoch": 0.3719103991204822, - "grad_norm": 0.828125, - "learning_rate": 1.870009520787052e-05, - "loss": 1.4286, - "step": 1184 - }, - { - "epoch": 0.3725386261460235, - "grad_norm": 0.7734375, - "learning_rate": 1.869755633132339e-05, - "loss": 1.2967, - "step": 1186 - }, - { - "epoch": 0.3731668531715649, - "grad_norm": 1.015625, - "learning_rate": 1.869501745477626e-05, - "loss": 1.3566, - "step": 1188 - }, - { - "epoch": 0.3737950801971062, - "grad_norm": 0.71875, - "learning_rate": 1.8692478578229133e-05, - "loss": 1.3837, - "step": 1190 - }, - { - "epoch": 0.3744233072226476, - "grad_norm": 0.9296875, - "learning_rate": 1.8689939701682008e-05, - "loss": 1.369, - "step": 1192 - }, - { - "epoch": 0.37505153424818893, - "grad_norm": 0.71484375, - "learning_rate": 1.868740082513488e-05, - "loss": 1.4206, - "step": 1194 - }, - { - "epoch": 0.3756797612737303, - "grad_norm": 0.75390625, - "learning_rate": 1.868486194858775e-05, - "loss": 1.3345, - "step": 1196 - }, - { - "epoch": 0.37630798829927165, - "grad_norm": 0.84765625, - "learning_rate": 1.8682323072040625e-05, - "loss": 1.3843, - "step": 1198 - }, - { - "epoch": 0.37693621532481303, - "grad_norm": 0.71484375, - "learning_rate": 1.8679784195493496e-05, - "loss": 1.4273, - "step": 1200 - }, - { - "epoch": 0.37756444235035436, - "grad_norm": 0.7734375, - "learning_rate": 1.8677245318946368e-05, - "loss": 1.3729, - "step": 1202 - }, - { - "epoch": 0.37819266937589574, - "grad_norm": 1.15625, - "learning_rate": 1.867470644239924e-05, - "loss": 1.1632, - "step": 1204 - }, - { - "epoch": 0.37882089640143707, - "grad_norm": 0.6796875, - "learning_rate": 1.8672167565852114e-05, - "loss": 1.3493, - "step": 1206 - }, - { - "epoch": 0.37944912342697845, - "grad_norm": 0.7578125, - "learning_rate": 1.8669628689304985e-05, - "loss": 1.3056, - "step": 1208 - }, - { - "epoch": 0.3800773504525198, - "grad_norm": 0.7265625, - "learning_rate": 1.8667089812757857e-05, - "loss": 1.414, - "step": 1210 - }, - { - "epoch": 0.3807055774780611, - "grad_norm": 0.8359375, - "learning_rate": 1.8664550936210728e-05, - "loss": 1.33, - "step": 1212 - }, - { - "epoch": 0.3813338045036025, - "grad_norm": 0.80859375, - "learning_rate": 1.86620120596636e-05, - "loss": 1.378, - "step": 1214 - }, - { - "epoch": 0.3819620315291438, - "grad_norm": 0.95703125, - "learning_rate": 1.865947318311647e-05, - "loss": 1.2628, - "step": 1216 - }, - { - "epoch": 0.3825902585546852, - "grad_norm": 0.73046875, - "learning_rate": 1.8656934306569346e-05, - "loss": 1.2875, - "step": 1218 - }, - { - "epoch": 0.38321848558022653, - "grad_norm": 0.78515625, - "learning_rate": 1.8654395430022217e-05, - "loss": 1.3463, - "step": 1220 - }, - { - "epoch": 0.3838467126057679, - "grad_norm": 0.80078125, - "learning_rate": 1.865185655347509e-05, - "loss": 1.3272, - "step": 1222 - }, - { - "epoch": 0.38447493963130924, - "grad_norm": 0.71484375, - "learning_rate": 1.864931767692796e-05, - "loss": 1.3908, - "step": 1224 - }, - { - "epoch": 0.3851031666568506, - "grad_norm": 0.6796875, - "learning_rate": 1.8646778800380835e-05, - "loss": 1.3235, - "step": 1226 - }, - { - "epoch": 0.38573139368239195, - "grad_norm": 0.74609375, - "learning_rate": 1.8644239923833706e-05, - "loss": 1.2354, - "step": 1228 - }, - { - "epoch": 0.38635962070793334, - "grad_norm": 0.88671875, - "learning_rate": 1.8641701047286577e-05, - "loss": 1.2592, - "step": 1230 - }, - { - "epoch": 0.38698784773347467, - "grad_norm": 0.7265625, - "learning_rate": 1.863916217073945e-05, - "loss": 1.3272, - "step": 1232 - }, - { - "epoch": 0.38761607475901605, - "grad_norm": 0.77734375, - "learning_rate": 1.8636623294192323e-05, - "loss": 1.2147, - "step": 1234 - }, - { - "epoch": 0.3882443017845574, - "grad_norm": 0.7734375, - "learning_rate": 1.863408441764519e-05, - "loss": 1.3168, - "step": 1236 - }, - { - "epoch": 0.38887252881009876, - "grad_norm": 0.73828125, - "learning_rate": 1.8631545541098066e-05, - "loss": 1.2581, - "step": 1238 - }, - { - "epoch": 0.3895007558356401, - "grad_norm": 0.84375, - "learning_rate": 1.8629006664550938e-05, - "loss": 1.404, - "step": 1240 - }, - { - "epoch": 0.3901289828611815, - "grad_norm": 0.79296875, - "learning_rate": 1.862646778800381e-05, - "loss": 1.3546, - "step": 1242 - }, - { - "epoch": 0.3907572098867228, - "grad_norm": 0.74609375, - "learning_rate": 1.862392891145668e-05, - "loss": 1.2896, - "step": 1244 - }, - { - "epoch": 0.3913854369122642, - "grad_norm": 0.74609375, - "learning_rate": 1.8621390034909555e-05, - "loss": 1.3196, - "step": 1246 - }, - { - "epoch": 0.3920136639378055, - "grad_norm": 0.72265625, - "learning_rate": 1.8618851158362427e-05, - "loss": 1.3084, - "step": 1248 - }, - { - "epoch": 0.3926418909633469, - "grad_norm": 0.75390625, - "learning_rate": 1.8616312281815298e-05, - "loss": 1.2459, - "step": 1250 - }, - { - "epoch": 0.3932701179888882, - "grad_norm": 0.73828125, - "learning_rate": 1.861377340526817e-05, - "loss": 1.3642, - "step": 1252 - }, - { - "epoch": 0.3938983450144296, - "grad_norm": 0.9140625, - "learning_rate": 1.8611234528721044e-05, - "loss": 1.2232, - "step": 1254 - }, - { - "epoch": 0.39452657203997094, - "grad_norm": 0.6875, - "learning_rate": 1.8608695652173912e-05, - "loss": 1.2384, - "step": 1256 - }, - { - "epoch": 0.3951547990655123, - "grad_norm": 0.6640625, - "learning_rate": 1.8606156775626787e-05, - "loss": 1.3031, - "step": 1258 - }, - { - "epoch": 0.39578302609105365, - "grad_norm": 0.67578125, - "learning_rate": 1.8603617899079658e-05, - "loss": 1.3142, - "step": 1260 - }, - { - "epoch": 0.39641125311659503, - "grad_norm": 0.875, - "learning_rate": 1.860107902253253e-05, - "loss": 1.2851, - "step": 1262 - }, - { - "epoch": 0.39703948014213636, - "grad_norm": 0.73828125, - "learning_rate": 1.85985401459854e-05, - "loss": 1.3063, - "step": 1264 - }, - { - "epoch": 0.39766770716767774, - "grad_norm": 0.7578125, - "learning_rate": 1.8596001269438276e-05, - "loss": 1.4062, - "step": 1266 - }, - { - "epoch": 0.39829593419321907, - "grad_norm": 0.78125, - "learning_rate": 1.8593462392891147e-05, - "loss": 1.2698, - "step": 1268 - }, - { - "epoch": 0.39892416121876045, - "grad_norm": 0.6796875, - "learning_rate": 1.859092351634402e-05, - "loss": 1.3242, - "step": 1270 - }, - { - "epoch": 0.3995523882443018, - "grad_norm": 0.70703125, - "learning_rate": 1.858838463979689e-05, - "loss": 1.3655, - "step": 1272 - }, - { - "epoch": 0.40018061526984317, - "grad_norm": 0.75, - "learning_rate": 1.8585845763249765e-05, - "loss": 1.259, - "step": 1274 - }, - { - "epoch": 0.4008088422953845, - "grad_norm": 0.8984375, - "learning_rate": 1.8583306886702636e-05, - "loss": 1.2373, - "step": 1276 - }, - { - "epoch": 0.4014370693209259, - "grad_norm": 0.75390625, - "learning_rate": 1.8580768010155507e-05, - "loss": 1.3231, - "step": 1278 - }, - { - "epoch": 0.4020652963464672, - "grad_norm": 0.7421875, - "learning_rate": 1.8578229133608382e-05, - "loss": 1.3715, - "step": 1280 - }, - { - "epoch": 0.40269352337200853, - "grad_norm": 0.91015625, - "learning_rate": 1.857569025706125e-05, - "loss": 1.4227, - "step": 1282 - }, - { - "epoch": 0.4033217503975499, - "grad_norm": 0.72265625, - "learning_rate": 1.8573151380514125e-05, - "loss": 1.4352, - "step": 1284 - }, - { - "epoch": 0.40394997742309124, - "grad_norm": 0.8359375, - "learning_rate": 1.8570612503966996e-05, - "loss": 1.3358, - "step": 1286 - }, - { - "epoch": 0.40457820444863263, - "grad_norm": 0.7734375, - "learning_rate": 1.8568073627419868e-05, - "loss": 1.3508, - "step": 1288 - }, - { - "epoch": 0.40520643147417396, - "grad_norm": 0.94921875, - "learning_rate": 1.856553475087274e-05, - "loss": 1.4527, - "step": 1290 - }, - { - "epoch": 0.40583465849971534, - "grad_norm": 0.68359375, - "learning_rate": 1.8562995874325614e-05, - "loss": 1.4456, - "step": 1292 - }, - { - "epoch": 0.40646288552525667, - "grad_norm": 0.90625, - "learning_rate": 1.8560456997778485e-05, - "loss": 1.3093, - "step": 1294 - }, - { - "epoch": 0.40709111255079805, - "grad_norm": 0.74609375, - "learning_rate": 1.8557918121231357e-05, - "loss": 1.4534, - "step": 1296 - }, - { - "epoch": 0.4077193395763394, - "grad_norm": 0.9609375, - "learning_rate": 1.8555379244684228e-05, - "loss": 1.2337, - "step": 1298 - }, - { - "epoch": 0.40834756660188076, - "grad_norm": 0.71875, - "learning_rate": 1.8552840368137103e-05, - "loss": 1.212, - "step": 1300 - }, - { - "epoch": 0.4089757936274221, - "grad_norm": 0.70703125, - "learning_rate": 1.8550301491589974e-05, - "loss": 1.3673, - "step": 1302 - }, - { - "epoch": 0.4096040206529635, - "grad_norm": 0.6875, - "learning_rate": 1.8547762615042846e-05, - "loss": 1.3345, - "step": 1304 - }, - { - "epoch": 0.4102322476785048, - "grad_norm": 0.70703125, - "learning_rate": 1.8545223738495717e-05, - "loss": 1.3542, - "step": 1306 - }, - { - "epoch": 0.4108604747040462, - "grad_norm": 0.828125, - "learning_rate": 1.854268486194859e-05, - "loss": 1.4953, - "step": 1308 - }, - { - "epoch": 0.4114887017295875, - "grad_norm": 0.7421875, - "learning_rate": 1.854014598540146e-05, - "loss": 1.4254, - "step": 1310 - }, - { - "epoch": 0.4121169287551289, - "grad_norm": 0.71875, - "learning_rate": 1.8537607108854335e-05, - "loss": 1.3089, - "step": 1312 - }, - { - "epoch": 0.4127451557806702, - "grad_norm": 0.73046875, - "learning_rate": 1.8535068232307206e-05, - "loss": 1.3985, - "step": 1314 - }, - { - "epoch": 0.4133733828062116, - "grad_norm": 0.828125, - "learning_rate": 1.8532529355760077e-05, - "loss": 1.45, - "step": 1316 - }, - { - "epoch": 0.41400160983175294, - "grad_norm": 0.71875, - "learning_rate": 1.852999047921295e-05, - "loss": 1.472, - "step": 1318 - }, - { - "epoch": 0.4146298368572943, - "grad_norm": 0.69140625, - "learning_rate": 1.8527451602665823e-05, - "loss": 1.4135, - "step": 1320 - }, - { - "epoch": 0.41525806388283565, - "grad_norm": 0.76171875, - "learning_rate": 1.8524912726118695e-05, - "loss": 1.2985, - "step": 1322 - }, - { - "epoch": 0.41588629090837703, - "grad_norm": 0.84375, - "learning_rate": 1.8522373849571566e-05, - "loss": 1.292, - "step": 1324 - }, - { - "epoch": 0.41651451793391836, - "grad_norm": 0.73046875, - "learning_rate": 1.8519834973024438e-05, - "loss": 1.3459, - "step": 1326 - }, - { - "epoch": 0.41714274495945974, - "grad_norm": 0.72265625, - "learning_rate": 1.8517296096477312e-05, - "loss": 1.3259, - "step": 1328 - }, - { - "epoch": 0.41777097198500107, - "grad_norm": 0.70703125, - "learning_rate": 1.851475721993018e-05, - "loss": 1.3027, - "step": 1330 - }, - { - "epoch": 0.41839919901054246, - "grad_norm": 0.671875, - "learning_rate": 1.8512218343383055e-05, - "loss": 1.3385, - "step": 1332 - }, - { - "epoch": 0.4190274260360838, - "grad_norm": 0.7109375, - "learning_rate": 1.8509679466835926e-05, - "loss": 1.3775, - "step": 1334 - }, - { - "epoch": 0.41965565306162517, - "grad_norm": 0.79296875, - "learning_rate": 1.8507140590288798e-05, - "loss": 1.1561, - "step": 1336 - }, - { - "epoch": 0.4202838800871665, - "grad_norm": 0.8125, - "learning_rate": 1.850460171374167e-05, - "loss": 1.2644, - "step": 1338 - }, - { - "epoch": 0.4209121071127079, - "grad_norm": 0.72265625, - "learning_rate": 1.8502062837194544e-05, - "loss": 1.3686, - "step": 1340 - }, - { - "epoch": 0.4215403341382492, - "grad_norm": 0.79296875, - "learning_rate": 1.8499523960647415e-05, - "loss": 1.4161, - "step": 1342 - }, - { - "epoch": 0.4221685611637906, - "grad_norm": 0.796875, - "learning_rate": 1.8496985084100287e-05, - "loss": 1.3431, - "step": 1344 - }, - { - "epoch": 0.4227967881893319, - "grad_norm": 0.80859375, - "learning_rate": 1.8494446207553158e-05, - "loss": 1.3203, - "step": 1346 - }, - { - "epoch": 0.4234250152148733, - "grad_norm": 0.8359375, - "learning_rate": 1.8491907331006033e-05, - "loss": 1.3866, - "step": 1348 - }, - { - "epoch": 0.42405324224041463, - "grad_norm": 0.73828125, - "learning_rate": 1.84893684544589e-05, - "loss": 1.307, - "step": 1350 - }, - { - "epoch": 0.42468146926595596, - "grad_norm": 0.75390625, - "learning_rate": 1.8486829577911776e-05, - "loss": 1.3054, - "step": 1352 - }, - { - "epoch": 0.42530969629149734, - "grad_norm": 0.73828125, - "learning_rate": 1.8484290701364647e-05, - "loss": 1.263, - "step": 1354 - }, - { - "epoch": 0.42593792331703867, - "grad_norm": 0.7421875, - "learning_rate": 1.848175182481752e-05, - "loss": 1.2961, - "step": 1356 - }, - { - "epoch": 0.42656615034258005, - "grad_norm": 0.70703125, - "learning_rate": 1.847921294827039e-05, - "loss": 1.386, - "step": 1358 - }, - { - "epoch": 0.4271943773681214, - "grad_norm": 0.79296875, - "learning_rate": 1.8476674071723265e-05, - "loss": 1.2587, - "step": 1360 - }, - { - "epoch": 0.42782260439366276, - "grad_norm": 0.80078125, - "learning_rate": 1.8474135195176136e-05, - "loss": 1.3613, - "step": 1362 - }, - { - "epoch": 0.4284508314192041, - "grad_norm": 0.734375, - "learning_rate": 1.8471596318629007e-05, - "loss": 1.4578, - "step": 1364 - }, - { - "epoch": 0.4290790584447455, - "grad_norm": 0.75, - "learning_rate": 1.8469057442081882e-05, - "loss": 1.4915, - "step": 1366 - }, - { - "epoch": 0.4297072854702868, - "grad_norm": 0.984375, - "learning_rate": 1.8466518565534754e-05, - "loss": 1.2513, - "step": 1368 - }, - { - "epoch": 0.4303355124958282, - "grad_norm": 0.78125, - "learning_rate": 1.8463979688987625e-05, - "loss": 1.3317, - "step": 1370 - }, - { - "epoch": 0.4309637395213695, - "grad_norm": 0.76171875, - "learning_rate": 1.8461440812440496e-05, - "loss": 1.3281, - "step": 1372 - }, - { - "epoch": 0.4315919665469109, - "grad_norm": 0.89453125, - "learning_rate": 1.845890193589337e-05, - "loss": 1.2836, - "step": 1374 - }, - { - "epoch": 0.4322201935724522, - "grad_norm": 0.96875, - "learning_rate": 1.845636305934624e-05, - "loss": 1.3258, - "step": 1376 - }, - { - "epoch": 0.4328484205979936, - "grad_norm": 0.703125, - "learning_rate": 1.8453824182799114e-05, - "loss": 1.3192, - "step": 1378 - }, - { - "epoch": 0.43347664762353494, - "grad_norm": 0.7890625, - "learning_rate": 1.8451285306251985e-05, - "loss": 1.2383, - "step": 1380 - }, - { - "epoch": 0.4341048746490763, - "grad_norm": 0.6953125, - "learning_rate": 1.8448746429704857e-05, - "loss": 1.4198, - "step": 1382 - }, - { - "epoch": 0.43473310167461765, - "grad_norm": 0.84375, - "learning_rate": 1.8446207553157728e-05, - "loss": 1.3262, - "step": 1384 - }, - { - "epoch": 0.43536132870015903, - "grad_norm": 0.90234375, - "learning_rate": 1.8443668676610603e-05, - "loss": 1.3783, - "step": 1386 - }, - { - "epoch": 0.43598955572570036, - "grad_norm": 0.8046875, - "learning_rate": 1.8441129800063474e-05, - "loss": 1.3803, - "step": 1388 - }, - { - "epoch": 0.43661778275124175, - "grad_norm": 0.8359375, - "learning_rate": 1.8438590923516346e-05, - "loss": 1.2537, - "step": 1390 - }, - { - "epoch": 0.4372460097767831, - "grad_norm": 0.74609375, - "learning_rate": 1.8436052046969217e-05, - "loss": 1.4251, - "step": 1392 - }, - { - "epoch": 0.43787423680232446, - "grad_norm": 0.80078125, - "learning_rate": 1.843351317042209e-05, - "loss": 1.3708, - "step": 1394 - }, - { - "epoch": 0.4385024638278658, - "grad_norm": 0.81640625, - "learning_rate": 1.8430974293874963e-05, - "loss": 1.3983, - "step": 1396 - }, - { - "epoch": 0.43913069085340717, - "grad_norm": 0.703125, - "learning_rate": 1.8428435417327834e-05, - "loss": 1.3208, - "step": 1398 - }, - { - "epoch": 0.4397589178789485, - "grad_norm": 0.6484375, - "learning_rate": 1.8425896540780706e-05, - "loss": 1.2447, - "step": 1400 - }, - { - "epoch": 0.4403871449044899, - "grad_norm": 0.7265625, - "learning_rate": 1.8423357664233577e-05, - "loss": 1.4995, - "step": 1402 - }, - { - "epoch": 0.4410153719300312, - "grad_norm": 0.69140625, - "learning_rate": 1.842081878768645e-05, - "loss": 1.2333, - "step": 1404 - }, - { - "epoch": 0.4416435989555726, - "grad_norm": 0.72265625, - "learning_rate": 1.8418279911139323e-05, - "loss": 1.438, - "step": 1406 - }, - { - "epoch": 0.4422718259811139, - "grad_norm": 0.6796875, - "learning_rate": 1.8415741034592195e-05, - "loss": 1.3648, - "step": 1408 - }, - { - "epoch": 0.4429000530066553, - "grad_norm": 0.87890625, - "learning_rate": 1.8413202158045066e-05, - "loss": 1.3982, - "step": 1410 - }, - { - "epoch": 0.44352828003219663, - "grad_norm": 0.7734375, - "learning_rate": 1.8410663281497937e-05, - "loss": 1.2714, - "step": 1412 - }, - { - "epoch": 0.444156507057738, - "grad_norm": 0.66015625, - "learning_rate": 1.8408124404950812e-05, - "loss": 1.3464, - "step": 1414 - }, - { - "epoch": 0.44478473408327934, - "grad_norm": 0.671875, - "learning_rate": 1.8405585528403684e-05, - "loss": 1.3379, - "step": 1416 - }, - { - "epoch": 0.4454129611088207, - "grad_norm": 0.73046875, - "learning_rate": 1.8403046651856555e-05, - "loss": 1.3022, - "step": 1418 - }, - { - "epoch": 0.44604118813436205, - "grad_norm": 0.765625, - "learning_rate": 1.8400507775309426e-05, - "loss": 1.3677, - "step": 1420 - }, - { - "epoch": 0.4466694151599034, - "grad_norm": 0.6796875, - "learning_rate": 1.83979688987623e-05, - "loss": 1.3101, - "step": 1422 - }, - { - "epoch": 0.44729764218544477, - "grad_norm": 0.94140625, - "learning_rate": 1.839543002221517e-05, - "loss": 1.2118, - "step": 1424 - }, - { - "epoch": 0.4479258692109861, - "grad_norm": 2.84375, - "learning_rate": 1.8392891145668044e-05, - "loss": 1.2927, - "step": 1426 - }, - { - "epoch": 0.4485540962365275, - "grad_norm": 0.88671875, - "learning_rate": 1.8390352269120915e-05, - "loss": 1.4683, - "step": 1428 - }, - { - "epoch": 0.4491823232620688, - "grad_norm": 0.75, - "learning_rate": 1.8387813392573787e-05, - "loss": 1.2949, - "step": 1430 - }, - { - "epoch": 0.4498105502876102, - "grad_norm": 0.75, - "learning_rate": 1.8385274516026658e-05, - "loss": 1.3789, - "step": 1432 - }, - { - "epoch": 0.4504387773131515, - "grad_norm": 0.7265625, - "learning_rate": 1.8382735639479533e-05, - "loss": 1.3308, - "step": 1434 - }, - { - "epoch": 0.4510670043386929, - "grad_norm": 0.78125, - "learning_rate": 1.8380196762932404e-05, - "loss": 1.3221, - "step": 1436 - }, - { - "epoch": 0.45169523136423423, - "grad_norm": 0.703125, - "learning_rate": 1.8377657886385276e-05, - "loss": 1.353, - "step": 1438 - }, - { - "epoch": 0.4523234583897756, - "grad_norm": 0.84765625, - "learning_rate": 1.8375119009838147e-05, - "loss": 1.2386, - "step": 1440 - }, - { - "epoch": 0.45295168541531694, - "grad_norm": 0.70703125, - "learning_rate": 1.8372580133291022e-05, - "loss": 1.5192, - "step": 1442 - }, - { - "epoch": 0.4535799124408583, - "grad_norm": 0.7890625, - "learning_rate": 1.837004125674389e-05, - "loss": 1.4076, - "step": 1444 - }, - { - "epoch": 0.45420813946639965, - "grad_norm": 0.6953125, - "learning_rate": 1.8367502380196765e-05, - "loss": 1.4394, - "step": 1446 - }, - { - "epoch": 0.45483636649194104, - "grad_norm": 0.96484375, - "learning_rate": 1.836496350364964e-05, - "loss": 1.3238, - "step": 1448 - }, - { - "epoch": 0.45546459351748236, - "grad_norm": 0.75390625, - "learning_rate": 1.8362424627102507e-05, - "loss": 1.2685, - "step": 1450 - }, - { - "epoch": 0.45609282054302375, - "grad_norm": 0.7890625, - "learning_rate": 1.8359885750555382e-05, - "loss": 1.3883, - "step": 1452 - }, - { - "epoch": 0.4567210475685651, - "grad_norm": 0.71484375, - "learning_rate": 1.8357346874008253e-05, - "loss": 1.3735, - "step": 1454 - }, - { - "epoch": 0.45734927459410646, - "grad_norm": 0.7265625, - "learning_rate": 1.8354807997461125e-05, - "loss": 1.4432, - "step": 1456 - }, - { - "epoch": 0.4579775016196478, - "grad_norm": 0.8046875, - "learning_rate": 1.8352269120913996e-05, - "loss": 1.3395, - "step": 1458 - }, - { - "epoch": 0.45860572864518917, - "grad_norm": 0.78515625, - "learning_rate": 1.834973024436687e-05, - "loss": 1.2355, - "step": 1460 - }, - { - "epoch": 0.4592339556707305, - "grad_norm": 0.703125, - "learning_rate": 1.8347191367819742e-05, - "loss": 1.4257, - "step": 1462 - }, - { - "epoch": 0.4598621826962719, - "grad_norm": 0.78515625, - "learning_rate": 1.8344652491272614e-05, - "loss": 1.4014, - "step": 1464 - }, - { - "epoch": 0.4604904097218132, - "grad_norm": 0.66015625, - "learning_rate": 1.8342113614725485e-05, - "loss": 1.4452, - "step": 1466 - }, - { - "epoch": 0.4611186367473546, - "grad_norm": 0.7578125, - "learning_rate": 1.833957473817836e-05, - "loss": 1.2609, - "step": 1468 - }, - { - "epoch": 0.4617468637728959, - "grad_norm": 3.109375, - "learning_rate": 1.8337035861631228e-05, - "loss": 1.3392, - "step": 1470 - }, - { - "epoch": 0.4623750907984373, - "grad_norm": 0.8359375, - "learning_rate": 1.8334496985084103e-05, - "loss": 1.4992, - "step": 1472 - }, - { - "epoch": 0.46300331782397863, - "grad_norm": 0.71875, - "learning_rate": 1.8331958108536974e-05, - "loss": 1.3606, - "step": 1474 - }, - { - "epoch": 0.46363154484952, - "grad_norm": 0.73046875, - "learning_rate": 1.8329419231989845e-05, - "loss": 1.4007, - "step": 1476 - }, - { - "epoch": 0.46425977187506134, - "grad_norm": 0.71484375, - "learning_rate": 1.8326880355442717e-05, - "loss": 1.3096, - "step": 1478 - }, - { - "epoch": 0.46488799890060273, - "grad_norm": 0.75, - "learning_rate": 1.832434147889559e-05, - "loss": 1.3873, - "step": 1480 - }, - { - "epoch": 0.46551622592614406, - "grad_norm": 0.75390625, - "learning_rate": 1.8321802602348463e-05, - "loss": 1.3962, - "step": 1482 - }, - { - "epoch": 0.46614445295168544, - "grad_norm": 1.1171875, - "learning_rate": 1.8319263725801334e-05, - "loss": 1.3512, - "step": 1484 - }, - { - "epoch": 0.46677267997722677, - "grad_norm": 0.671875, - "learning_rate": 1.8316724849254206e-05, - "loss": 1.3546, - "step": 1486 - }, - { - "epoch": 0.46740090700276815, - "grad_norm": 0.76953125, - "learning_rate": 1.831418597270708e-05, - "loss": 1.3739, - "step": 1488 - }, - { - "epoch": 0.4680291340283095, - "grad_norm": 0.71875, - "learning_rate": 1.831164709615995e-05, - "loss": 1.3045, - "step": 1490 - }, - { - "epoch": 0.46865736105385086, - "grad_norm": 0.72265625, - "learning_rate": 1.8309108219612823e-05, - "loss": 1.385, - "step": 1492 - }, - { - "epoch": 0.4692855880793922, - "grad_norm": 0.765625, - "learning_rate": 1.8306569343065695e-05, - "loss": 1.282, - "step": 1494 - }, - { - "epoch": 0.4699138151049335, - "grad_norm": 0.671875, - "learning_rate": 1.8304030466518566e-05, - "loss": 1.5008, - "step": 1496 - }, - { - "epoch": 0.4705420421304749, - "grad_norm": 0.78515625, - "learning_rate": 1.8301491589971437e-05, - "loss": 1.422, - "step": 1498 - }, - { - "epoch": 0.47117026915601623, - "grad_norm": 0.70703125, - "learning_rate": 1.8298952713424312e-05, - "loss": 1.3419, - "step": 1500 - }, - { - "epoch": 0.4717984961815576, - "grad_norm": 0.72265625, - "learning_rate": 1.8296413836877184e-05, - "loss": 1.3878, - "step": 1502 - }, - { - "epoch": 0.47242672320709894, - "grad_norm": 0.7109375, - "learning_rate": 1.8293874960330055e-05, - "loss": 1.4378, - "step": 1504 - }, - { - "epoch": 0.4730549502326403, - "grad_norm": 0.69140625, - "learning_rate": 1.8291336083782926e-05, - "loss": 1.4115, - "step": 1506 - }, - { - "epoch": 0.47368317725818165, - "grad_norm": 0.75, - "learning_rate": 1.82887972072358e-05, - "loss": 1.2909, - "step": 1508 - }, - { - "epoch": 0.47431140428372304, - "grad_norm": 0.74609375, - "learning_rate": 1.8286258330688672e-05, - "loss": 1.3813, - "step": 1510 - }, - { - "epoch": 0.47493963130926437, - "grad_norm": 0.7890625, - "learning_rate": 1.8283719454141544e-05, - "loss": 1.3018, - "step": 1512 - }, - { - "epoch": 0.47556785833480575, - "grad_norm": 0.7109375, - "learning_rate": 1.8281180577594415e-05, - "loss": 1.228, - "step": 1514 - }, - { - "epoch": 0.4761960853603471, - "grad_norm": 0.65625, - "learning_rate": 1.8278641701047287e-05, - "loss": 1.3985, - "step": 1516 - }, - { - "epoch": 0.47682431238588846, - "grad_norm": 0.67578125, - "learning_rate": 1.8276102824500158e-05, - "loss": 1.4065, - "step": 1518 - }, - { - "epoch": 0.4774525394114298, - "grad_norm": 0.7421875, - "learning_rate": 1.8273563947953033e-05, - "loss": 1.34, - "step": 1520 - }, - { - "epoch": 0.47808076643697117, - "grad_norm": 0.73046875, - "learning_rate": 1.8271025071405904e-05, - "loss": 1.3451, - "step": 1522 - }, - { - "epoch": 0.4787089934625125, - "grad_norm": 0.75, - "learning_rate": 1.8268486194858776e-05, - "loss": 1.3477, - "step": 1524 - }, - { - "epoch": 0.4793372204880539, - "grad_norm": 0.734375, - "learning_rate": 1.8265947318311647e-05, - "loss": 1.3247, - "step": 1526 - }, - { - "epoch": 0.4799654475135952, - "grad_norm": 0.73046875, - "learning_rate": 1.8263408441764522e-05, - "loss": 1.21, - "step": 1528 - }, - { - "epoch": 0.4805936745391366, - "grad_norm": 0.71875, - "learning_rate": 1.8260869565217393e-05, - "loss": 1.3398, - "step": 1530 - }, - { - "epoch": 0.4812219015646779, - "grad_norm": 0.734375, - "learning_rate": 1.8258330688670264e-05, - "loss": 1.3262, - "step": 1532 - }, - { - "epoch": 0.4818501285902193, - "grad_norm": 0.75, - "learning_rate": 1.825579181212314e-05, - "loss": 1.4908, - "step": 1534 - }, - { - "epoch": 0.48247835561576063, - "grad_norm": 0.7890625, - "learning_rate": 1.825325293557601e-05, - "loss": 1.3113, - "step": 1536 - }, - { - "epoch": 0.483106582641302, - "grad_norm": 0.6640625, - "learning_rate": 1.8250714059028882e-05, - "loss": 1.2718, - "step": 1538 - }, - { - "epoch": 0.48373480966684335, - "grad_norm": 0.7265625, - "learning_rate": 1.8248175182481753e-05, - "loss": 1.406, - "step": 1540 - }, - { - "epoch": 0.48436303669238473, - "grad_norm": 0.6953125, - "learning_rate": 1.8245636305934625e-05, - "loss": 1.3577, - "step": 1542 - }, - { - "epoch": 0.48499126371792606, - "grad_norm": 0.7421875, - "learning_rate": 1.8243097429387496e-05, - "loss": 1.3054, - "step": 1544 - }, - { - "epoch": 0.48561949074346744, - "grad_norm": 0.78515625, - "learning_rate": 1.824055855284037e-05, - "loss": 1.3842, - "step": 1546 - }, - { - "epoch": 0.48624771776900877, - "grad_norm": 0.76953125, - "learning_rate": 1.8238019676293242e-05, - "loss": 1.3789, - "step": 1548 - }, - { - "epoch": 0.48687594479455015, - "grad_norm": 0.65625, - "learning_rate": 1.8235480799746114e-05, - "loss": 1.3439, - "step": 1550 - }, - { - "epoch": 0.4875041718200915, - "grad_norm": 0.69921875, - "learning_rate": 1.8232941923198985e-05, - "loss": 1.3715, - "step": 1552 - }, - { - "epoch": 0.48813239884563286, - "grad_norm": 0.70703125, - "learning_rate": 1.823040304665186e-05, - "loss": 1.4506, - "step": 1554 - }, - { - "epoch": 0.4887606258711742, - "grad_norm": 0.69140625, - "learning_rate": 1.822786417010473e-05, - "loss": 1.4064, - "step": 1556 - }, - { - "epoch": 0.4893888528967156, - "grad_norm": 0.7578125, - "learning_rate": 1.8225325293557603e-05, - "loss": 1.3322, - "step": 1558 - }, - { - "epoch": 0.4900170799222569, - "grad_norm": 0.76953125, - "learning_rate": 1.8222786417010474e-05, - "loss": 1.3075, - "step": 1560 - }, - { - "epoch": 0.4906453069477983, - "grad_norm": 0.703125, - "learning_rate": 1.822024754046335e-05, - "loss": 1.4187, - "step": 1562 - }, - { - "epoch": 0.4912735339733396, - "grad_norm": 0.8984375, - "learning_rate": 1.8217708663916217e-05, - "loss": 1.3365, - "step": 1564 - }, - { - "epoch": 0.49190176099888094, - "grad_norm": 0.7578125, - "learning_rate": 1.821516978736909e-05, - "loss": 1.3593, - "step": 1566 - }, - { - "epoch": 0.4925299880244223, - "grad_norm": 0.7421875, - "learning_rate": 1.8212630910821963e-05, - "loss": 1.298, - "step": 1568 - }, - { - "epoch": 0.49315821504996366, - "grad_norm": 0.71875, - "learning_rate": 1.8210092034274834e-05, - "loss": 1.4256, - "step": 1570 - }, - { - "epoch": 0.49378644207550504, - "grad_norm": 0.7890625, - "learning_rate": 1.8207553157727706e-05, - "loss": 1.4808, - "step": 1572 - }, - { - "epoch": 0.49441466910104637, - "grad_norm": 0.875, - "learning_rate": 1.820501428118058e-05, - "loss": 1.4233, - "step": 1574 - }, - { - "epoch": 0.49504289612658775, - "grad_norm": 0.7890625, - "learning_rate": 1.8202475404633452e-05, - "loss": 1.3273, - "step": 1576 - }, - { - "epoch": 0.4956711231521291, - "grad_norm": 0.73828125, - "learning_rate": 1.8199936528086323e-05, - "loss": 1.3384, - "step": 1578 - }, - { - "epoch": 0.49629935017767046, - "grad_norm": 0.7421875, - "learning_rate": 1.8197397651539195e-05, - "loss": 1.4455, - "step": 1580 - }, - { - "epoch": 0.4969275772032118, - "grad_norm": 0.7421875, - "learning_rate": 1.819485877499207e-05, - "loss": 1.3235, - "step": 1582 - }, - { - "epoch": 0.4975558042287532, - "grad_norm": 0.73828125, - "learning_rate": 1.8192319898444937e-05, - "loss": 1.3602, - "step": 1584 - }, - { - "epoch": 0.4981840312542945, - "grad_norm": 0.890625, - "learning_rate": 1.8189781021897812e-05, - "loss": 1.2892, - "step": 1586 - }, - { - "epoch": 0.4988122582798359, - "grad_norm": 0.85546875, - "learning_rate": 1.8187242145350684e-05, - "loss": 1.2329, - "step": 1588 - }, - { - "epoch": 0.4994404853053772, - "grad_norm": 0.68359375, - "learning_rate": 1.8184703268803555e-05, - "loss": 1.2801, - "step": 1590 - }, - { - "epoch": 0.5000687123309185, - "grad_norm": 0.76171875, - "learning_rate": 1.8182164392256426e-05, - "loss": 1.3857, - "step": 1592 - }, - { - "epoch": 0.50069693935646, - "grad_norm": 0.8671875, - "learning_rate": 1.81796255157093e-05, - "loss": 1.3113, - "step": 1594 - }, - { - "epoch": 0.5013251663820013, - "grad_norm": 0.78125, - "learning_rate": 1.8177086639162172e-05, - "loss": 1.4523, - "step": 1596 - }, - { - "epoch": 0.5019533934075426, - "grad_norm": 0.77734375, - "learning_rate": 1.8174547762615044e-05, - "loss": 1.3314, - "step": 1598 - }, - { - "epoch": 0.502581620433084, - "grad_norm": 0.796875, - "learning_rate": 1.8172008886067915e-05, - "loss": 1.3618, - "step": 1600 - }, - { - "epoch": 0.5032098474586254, - "grad_norm": 0.8203125, - "learning_rate": 1.816947000952079e-05, - "loss": 1.3553, - "step": 1602 - }, - { - "epoch": 0.5038380744841667, - "grad_norm": 0.671875, - "learning_rate": 1.816693113297366e-05, - "loss": 1.4201, - "step": 1604 - }, - { - "epoch": 0.5044663015097081, - "grad_norm": 0.8125, - "learning_rate": 1.8164392256426533e-05, - "loss": 1.309, - "step": 1606 - }, - { - "epoch": 0.5050945285352494, - "grad_norm": 0.7734375, - "learning_rate": 1.8161853379879404e-05, - "loss": 1.3145, - "step": 1608 - }, - { - "epoch": 0.5057227555607908, - "grad_norm": 0.87890625, - "learning_rate": 1.8159314503332275e-05, - "loss": 1.3546, - "step": 1610 - }, - { - "epoch": 0.5063509825863322, - "grad_norm": 0.7109375, - "learning_rate": 1.8156775626785147e-05, - "loss": 1.2818, - "step": 1612 - }, - { - "epoch": 0.5069792096118735, - "grad_norm": 0.796875, - "learning_rate": 1.815423675023802e-05, - "loss": 1.4176, - "step": 1614 - }, - { - "epoch": 0.5076074366374148, - "grad_norm": 0.734375, - "learning_rate": 1.8151697873690893e-05, - "loss": 1.3501, - "step": 1616 - }, - { - "epoch": 0.5082356636629562, - "grad_norm": 0.74609375, - "learning_rate": 1.8149158997143764e-05, - "loss": 1.3265, - "step": 1618 - }, - { - "epoch": 0.5088638906884976, - "grad_norm": 0.79296875, - "learning_rate": 1.814662012059664e-05, - "loss": 1.333, - "step": 1620 - }, - { - "epoch": 0.5094921177140389, - "grad_norm": 0.7265625, - "learning_rate": 1.814408124404951e-05, - "loss": 1.2086, - "step": 1622 - }, - { - "epoch": 0.5101203447395802, - "grad_norm": 0.8046875, - "learning_rate": 1.8141542367502382e-05, - "loss": 1.2181, - "step": 1624 - }, - { - "epoch": 0.5107485717651217, - "grad_norm": 0.72265625, - "learning_rate": 1.8139003490955253e-05, - "loss": 1.3269, - "step": 1626 - }, - { - "epoch": 0.511376798790663, - "grad_norm": 0.67578125, - "learning_rate": 1.8136464614408128e-05, - "loss": 1.2733, - "step": 1628 - }, - { - "epoch": 0.5120050258162043, - "grad_norm": 0.69921875, - "learning_rate": 1.8133925737861e-05, - "loss": 1.253, - "step": 1630 - }, - { - "epoch": 0.5126332528417457, - "grad_norm": 0.71484375, - "learning_rate": 1.813138686131387e-05, - "loss": 1.511, - "step": 1632 - }, - { - "epoch": 0.5132614798672871, - "grad_norm": 0.671875, - "learning_rate": 1.8128847984766742e-05, - "loss": 1.2451, - "step": 1634 - }, - { - "epoch": 0.5138897068928284, - "grad_norm": 0.66015625, - "learning_rate": 1.8126309108219614e-05, - "loss": 1.2587, - "step": 1636 - }, - { - "epoch": 0.5145179339183698, - "grad_norm": 0.875, - "learning_rate": 1.8123770231672485e-05, - "loss": 1.301, - "step": 1638 - }, - { - "epoch": 0.5151461609439111, - "grad_norm": 0.8046875, - "learning_rate": 1.812123135512536e-05, - "loss": 1.4174, - "step": 1640 - }, - { - "epoch": 0.5157743879694524, - "grad_norm": 0.7265625, - "learning_rate": 1.811869247857823e-05, - "loss": 1.2725, - "step": 1642 - }, - { - "epoch": 0.5164026149949938, - "grad_norm": 0.81640625, - "learning_rate": 1.8116153602031103e-05, - "loss": 1.3744, - "step": 1644 - }, - { - "epoch": 0.5170308420205352, - "grad_norm": 0.734375, - "learning_rate": 1.8113614725483974e-05, - "loss": 1.2455, - "step": 1646 - }, - { - "epoch": 0.5176590690460765, - "grad_norm": 0.68359375, - "learning_rate": 1.811107584893685e-05, - "loss": 1.4318, - "step": 1648 - }, - { - "epoch": 0.5182872960716178, - "grad_norm": 0.80859375, - "learning_rate": 1.810853697238972e-05, - "loss": 1.3426, - "step": 1650 - }, - { - "epoch": 0.5189155230971593, - "grad_norm": 0.73046875, - "learning_rate": 1.810599809584259e-05, - "loss": 1.1767, - "step": 1652 - }, - { - "epoch": 0.5195437501227006, - "grad_norm": 0.73046875, - "learning_rate": 1.8103459219295463e-05, - "loss": 1.2447, - "step": 1654 - }, - { - "epoch": 0.5201719771482419, - "grad_norm": 0.87890625, - "learning_rate": 1.8100920342748338e-05, - "loss": 1.3263, - "step": 1656 - }, - { - "epoch": 0.5208002041737833, - "grad_norm": 0.74609375, - "learning_rate": 1.8098381466201206e-05, - "loss": 1.3857, - "step": 1658 - }, - { - "epoch": 0.5214284311993247, - "grad_norm": 0.7421875, - "learning_rate": 1.809584258965408e-05, - "loss": 1.4577, - "step": 1660 - }, - { - "epoch": 0.522056658224866, - "grad_norm": 0.77734375, - "learning_rate": 1.8093303713106952e-05, - "loss": 1.4192, - "step": 1662 - }, - { - "epoch": 0.5226848852504073, - "grad_norm": 0.72265625, - "learning_rate": 1.8090764836559823e-05, - "loss": 1.2375, - "step": 1664 - }, - { - "epoch": 0.5233131122759487, - "grad_norm": 0.7578125, - "learning_rate": 1.8088225960012695e-05, - "loss": 1.3199, - "step": 1666 - }, - { - "epoch": 0.5239413393014901, - "grad_norm": 0.859375, - "learning_rate": 1.808568708346557e-05, - "loss": 1.3239, - "step": 1668 - }, - { - "epoch": 0.5245695663270314, - "grad_norm": 0.63671875, - "learning_rate": 1.808314820691844e-05, - "loss": 1.4124, - "step": 1670 - }, - { - "epoch": 0.5251977933525728, - "grad_norm": 0.8046875, - "learning_rate": 1.8080609330371312e-05, - "loss": 1.3539, - "step": 1672 - }, - { - "epoch": 0.5258260203781141, - "grad_norm": 0.75, - "learning_rate": 1.8078070453824183e-05, - "loss": 1.2979, - "step": 1674 - }, - { - "epoch": 0.5264542474036555, - "grad_norm": 0.67578125, - "learning_rate": 1.8075531577277058e-05, - "loss": 1.3582, - "step": 1676 - }, - { - "epoch": 0.5270824744291969, - "grad_norm": 0.7890625, - "learning_rate": 1.8072992700729926e-05, - "loss": 1.2866, - "step": 1678 - }, - { - "epoch": 0.5277107014547382, - "grad_norm": 0.83984375, - "learning_rate": 1.80704538241828e-05, - "loss": 1.1929, - "step": 1680 - }, - { - "epoch": 0.5283389284802795, - "grad_norm": 0.6875, - "learning_rate": 1.8067914947635672e-05, - "loss": 1.2081, - "step": 1682 - }, - { - "epoch": 0.528967155505821, - "grad_norm": 0.67578125, - "learning_rate": 1.8065376071088544e-05, - "loss": 1.4058, - "step": 1684 - }, - { - "epoch": 0.5295953825313623, - "grad_norm": 0.69140625, - "learning_rate": 1.8062837194541415e-05, - "loss": 1.3689, - "step": 1686 - }, - { - "epoch": 0.5302236095569036, - "grad_norm": 0.7734375, - "learning_rate": 1.806029831799429e-05, - "loss": 1.2963, - "step": 1688 - }, - { - "epoch": 0.530851836582445, - "grad_norm": 0.76953125, - "learning_rate": 1.805775944144716e-05, - "loss": 1.3622, - "step": 1690 - }, - { - "epoch": 0.5314800636079864, - "grad_norm": 0.765625, - "learning_rate": 1.8055220564900033e-05, - "loss": 1.2601, - "step": 1692 - }, - { - "epoch": 0.5321082906335277, - "grad_norm": 0.78515625, - "learning_rate": 1.8052681688352904e-05, - "loss": 1.2963, - "step": 1694 - }, - { - "epoch": 0.532736517659069, - "grad_norm": 0.7109375, - "learning_rate": 1.805014281180578e-05, - "loss": 1.3702, - "step": 1696 - }, - { - "epoch": 0.5333647446846104, - "grad_norm": 0.734375, - "learning_rate": 1.804760393525865e-05, - "loss": 1.2802, - "step": 1698 - }, - { - "epoch": 0.5339929717101518, - "grad_norm": 1.0078125, - "learning_rate": 1.804506505871152e-05, - "loss": 1.2703, - "step": 1700 - }, - { - "epoch": 0.5346211987356931, - "grad_norm": 0.71875, - "learning_rate": 1.8042526182164393e-05, - "loss": 1.2858, - "step": 1702 - }, - { - "epoch": 0.5352494257612345, - "grad_norm": 1.03125, - "learning_rate": 1.8039987305617264e-05, - "loss": 1.2983, - "step": 1704 - }, - { - "epoch": 0.5358776527867758, - "grad_norm": 0.71484375, - "learning_rate": 1.803744842907014e-05, - "loss": 1.4192, - "step": 1706 - }, - { - "epoch": 0.5365058798123171, - "grad_norm": 0.71484375, - "learning_rate": 1.803490955252301e-05, - "loss": 1.4011, - "step": 1708 - }, - { - "epoch": 0.5371341068378586, - "grad_norm": 0.6796875, - "learning_rate": 1.8032370675975882e-05, - "loss": 1.3894, - "step": 1710 - }, - { - "epoch": 0.5377623338633999, - "grad_norm": 0.75390625, - "learning_rate": 1.8029831799428753e-05, - "loss": 1.5168, - "step": 1712 - }, - { - "epoch": 0.5383905608889412, - "grad_norm": 1.4609375, - "learning_rate": 1.8027292922881628e-05, - "loss": 1.3708, - "step": 1714 - }, - { - "epoch": 0.5390187879144825, - "grad_norm": 0.76953125, - "learning_rate": 1.80247540463345e-05, - "loss": 1.3817, - "step": 1716 - }, - { - "epoch": 0.539647014940024, - "grad_norm": 0.7578125, - "learning_rate": 1.802221516978737e-05, - "loss": 1.3174, - "step": 1718 - }, - { - "epoch": 0.5402752419655653, - "grad_norm": 0.73828125, - "learning_rate": 1.8019676293240242e-05, - "loss": 1.3609, - "step": 1720 - }, - { - "epoch": 0.5409034689911066, - "grad_norm": 0.734375, - "learning_rate": 1.8017137416693117e-05, - "loss": 1.4835, - "step": 1722 - }, - { - "epoch": 0.541531696016648, - "grad_norm": 0.69921875, - "learning_rate": 1.801459854014599e-05, - "loss": 1.5052, - "step": 1724 - }, - { - "epoch": 0.5421599230421894, - "grad_norm": 0.72265625, - "learning_rate": 1.801205966359886e-05, - "loss": 1.3482, - "step": 1726 - }, - { - "epoch": 0.5427881500677307, - "grad_norm": 0.79296875, - "learning_rate": 1.800952078705173e-05, - "loss": 1.21, - "step": 1728 - }, - { - "epoch": 0.5434163770932721, - "grad_norm": 0.75390625, - "learning_rate": 1.8006981910504602e-05, - "loss": 1.3702, - "step": 1730 - }, - { - "epoch": 0.5440446041188134, - "grad_norm": 0.7578125, - "learning_rate": 1.8004443033957474e-05, - "loss": 1.4266, - "step": 1732 - }, - { - "epoch": 0.5446728311443548, - "grad_norm": 0.671875, - "learning_rate": 1.800190415741035e-05, - "loss": 1.339, - "step": 1734 - }, - { - "epoch": 0.5453010581698962, - "grad_norm": 0.74609375, - "learning_rate": 1.799936528086322e-05, - "loss": 1.3851, - "step": 1736 - }, - { - "epoch": 0.5459292851954375, - "grad_norm": 0.69140625, - "learning_rate": 1.799682640431609e-05, - "loss": 1.4017, - "step": 1738 - }, - { - "epoch": 0.5465575122209788, - "grad_norm": 0.734375, - "learning_rate": 1.7994287527768963e-05, - "loss": 1.2933, - "step": 1740 - }, - { - "epoch": 0.5471857392465203, - "grad_norm": 0.74609375, - "learning_rate": 1.7991748651221838e-05, - "loss": 1.3104, - "step": 1742 - }, - { - "epoch": 0.5478139662720616, - "grad_norm": 0.65625, - "learning_rate": 1.798920977467471e-05, - "loss": 1.231, - "step": 1744 - }, - { - "epoch": 0.5484421932976029, - "grad_norm": 0.7578125, - "learning_rate": 1.798667089812758e-05, - "loss": 1.4584, - "step": 1746 - }, - { - "epoch": 0.5490704203231442, - "grad_norm": 0.75390625, - "learning_rate": 1.798413202158045e-05, - "loss": 1.2988, - "step": 1748 - }, - { - "epoch": 0.5496986473486857, - "grad_norm": 0.7578125, - "learning_rate": 1.7981593145033326e-05, - "loss": 1.2553, - "step": 1750 - }, - { - "epoch": 0.550326874374227, - "grad_norm": 0.75, - "learning_rate": 1.7979054268486194e-05, - "loss": 1.3824, - "step": 1752 - }, - { - "epoch": 0.5509551013997683, - "grad_norm": 0.765625, - "learning_rate": 1.797651539193907e-05, - "loss": 1.4831, - "step": 1754 - }, - { - "epoch": 0.5515833284253097, - "grad_norm": 0.8046875, - "learning_rate": 1.797397651539194e-05, - "loss": 1.3839, - "step": 1756 - }, - { - "epoch": 0.5522115554508511, - "grad_norm": 0.7578125, - "learning_rate": 1.7971437638844812e-05, - "loss": 1.4556, - "step": 1758 - }, - { - "epoch": 0.5528397824763924, - "grad_norm": 0.67578125, - "learning_rate": 1.7968898762297683e-05, - "loss": 1.3564, - "step": 1760 - }, - { - "epoch": 0.5534680095019338, - "grad_norm": 0.81640625, - "learning_rate": 1.7966359885750558e-05, - "loss": 1.4027, - "step": 1762 - }, - { - "epoch": 0.5540962365274751, - "grad_norm": 0.93359375, - "learning_rate": 1.796382100920343e-05, - "loss": 1.3738, - "step": 1764 - }, - { - "epoch": 0.5547244635530165, - "grad_norm": 0.8203125, - "learning_rate": 1.79612821326563e-05, - "loss": 1.4116, - "step": 1766 - }, - { - "epoch": 0.5553526905785579, - "grad_norm": 0.9140625, - "learning_rate": 1.7958743256109172e-05, - "loss": 1.4383, - "step": 1768 - }, - { - "epoch": 0.5559809176040992, - "grad_norm": 0.76171875, - "learning_rate": 1.7956204379562047e-05, - "loss": 1.2174, - "step": 1770 - }, - { - "epoch": 0.5566091446296405, - "grad_norm": 0.75390625, - "learning_rate": 1.7953665503014915e-05, - "loss": 1.2893, - "step": 1772 - }, - { - "epoch": 0.557237371655182, - "grad_norm": 0.796875, - "learning_rate": 1.795112662646779e-05, - "loss": 1.291, - "step": 1774 - }, - { - "epoch": 0.5578655986807233, - "grad_norm": 0.82421875, - "learning_rate": 1.794858774992066e-05, - "loss": 1.4798, - "step": 1776 - }, - { - "epoch": 0.5584938257062646, - "grad_norm": 0.9296875, - "learning_rate": 1.7946048873373533e-05, - "loss": 1.2961, - "step": 1778 - }, - { - "epoch": 0.5591220527318059, - "grad_norm": 2.1875, - "learning_rate": 1.7943509996826404e-05, - "loss": 1.3342, - "step": 1780 - }, - { - "epoch": 0.5597502797573473, - "grad_norm": 0.890625, - "learning_rate": 1.794097112027928e-05, - "loss": 1.2582, - "step": 1782 - }, - { - "epoch": 0.5603785067828887, - "grad_norm": 0.66796875, - "learning_rate": 1.793843224373215e-05, - "loss": 1.3106, - "step": 1784 - }, - { - "epoch": 0.56100673380843, - "grad_norm": 0.8125, - "learning_rate": 1.793589336718502e-05, - "loss": 1.3369, - "step": 1786 - }, - { - "epoch": 0.5616349608339714, - "grad_norm": 0.859375, - "learning_rate": 1.7933354490637893e-05, - "loss": 1.2346, - "step": 1788 - }, - { - "epoch": 0.5622631878595127, - "grad_norm": 0.76171875, - "learning_rate": 1.7930815614090768e-05, - "loss": 1.2644, - "step": 1790 - }, - { - "epoch": 0.5628914148850541, - "grad_norm": 0.8359375, - "learning_rate": 1.792827673754364e-05, - "loss": 1.3247, - "step": 1792 - }, - { - "epoch": 0.5635196419105954, - "grad_norm": 0.7734375, - "learning_rate": 1.792573786099651e-05, - "loss": 1.2764, - "step": 1794 - }, - { - "epoch": 0.5641478689361368, - "grad_norm": 0.71484375, - "learning_rate": 1.7923198984449385e-05, - "loss": 1.2428, - "step": 1796 - }, - { - "epoch": 0.5647760959616781, - "grad_norm": 0.80078125, - "learning_rate": 1.7920660107902253e-05, - "loss": 1.4744, - "step": 1798 - }, - { - "epoch": 0.5654043229872195, - "grad_norm": 0.7421875, - "learning_rate": 1.7918121231355128e-05, - "loss": 1.3754, - "step": 1800 - }, - { - "epoch": 0.5660325500127609, - "grad_norm": 0.8828125, - "learning_rate": 1.7915582354808e-05, - "loss": 1.3084, - "step": 1802 - }, - { - "epoch": 0.5666607770383022, - "grad_norm": 0.75, - "learning_rate": 1.791304347826087e-05, - "loss": 1.3269, - "step": 1804 - }, - { - "epoch": 0.5672890040638435, - "grad_norm": 0.7265625, - "learning_rate": 1.7910504601713742e-05, - "loss": 1.3141, - "step": 1806 - }, - { - "epoch": 0.567917231089385, - "grad_norm": 0.90234375, - "learning_rate": 1.7907965725166617e-05, - "loss": 1.1482, - "step": 1808 - }, - { - "epoch": 0.5685454581149263, - "grad_norm": 0.69921875, - "learning_rate": 1.7905426848619488e-05, - "loss": 1.3093, - "step": 1810 - }, - { - "epoch": 0.5691736851404676, - "grad_norm": 0.6640625, - "learning_rate": 1.790288797207236e-05, - "loss": 1.4742, - "step": 1812 - }, - { - "epoch": 0.569801912166009, - "grad_norm": 0.8203125, - "learning_rate": 1.790034909552523e-05, - "loss": 1.3429, - "step": 1814 - }, - { - "epoch": 0.5704301391915504, - "grad_norm": 0.77734375, - "learning_rate": 1.7897810218978106e-05, - "loss": 1.3247, - "step": 1816 - }, - { - "epoch": 0.5710583662170917, - "grad_norm": 0.6875, - "learning_rate": 1.7895271342430974e-05, - "loss": 1.386, - "step": 1818 - }, - { - "epoch": 0.571686593242633, - "grad_norm": 0.6796875, - "learning_rate": 1.789273246588385e-05, - "loss": 1.3501, - "step": 1820 - }, - { - "epoch": 0.5723148202681744, - "grad_norm": 0.73828125, - "learning_rate": 1.789019358933672e-05, - "loss": 1.2759, - "step": 1822 - }, - { - "epoch": 0.5729430472937158, - "grad_norm": 0.78515625, - "learning_rate": 1.788765471278959e-05, - "loss": 1.2834, - "step": 1824 - }, - { - "epoch": 0.5735712743192571, - "grad_norm": 0.765625, - "learning_rate": 1.7885115836242463e-05, - "loss": 1.3764, - "step": 1826 - }, - { - "epoch": 0.5741995013447985, - "grad_norm": 0.80859375, - "learning_rate": 1.7882576959695337e-05, - "loss": 1.2428, - "step": 1828 - }, - { - "epoch": 0.5748277283703398, - "grad_norm": 0.78125, - "learning_rate": 1.788003808314821e-05, - "loss": 1.3577, - "step": 1830 - }, - { - "epoch": 0.5754559553958812, - "grad_norm": 0.94921875, - "learning_rate": 1.787749920660108e-05, - "loss": 1.2091, - "step": 1832 - }, - { - "epoch": 0.5760841824214226, - "grad_norm": 0.83203125, - "learning_rate": 1.787496033005395e-05, - "loss": 1.4, - "step": 1834 - }, - { - "epoch": 0.5767124094469639, - "grad_norm": 0.7109375, - "learning_rate": 1.7872421453506826e-05, - "loss": 1.3621, - "step": 1836 - }, - { - "epoch": 0.5773406364725052, - "grad_norm": 0.828125, - "learning_rate": 1.7869882576959698e-05, - "loss": 1.3756, - "step": 1838 - }, - { - "epoch": 0.5779688634980467, - "grad_norm": 0.68359375, - "learning_rate": 1.786734370041257e-05, - "loss": 1.3658, - "step": 1840 - }, - { - "epoch": 0.578597090523588, - "grad_norm": 0.7109375, - "learning_rate": 1.786480482386544e-05, - "loss": 1.2812, - "step": 1842 - }, - { - "epoch": 0.5792253175491293, - "grad_norm": 0.73828125, - "learning_rate": 1.7862265947318312e-05, - "loss": 1.4921, - "step": 1844 - }, - { - "epoch": 0.5798535445746706, - "grad_norm": 0.77734375, - "learning_rate": 1.7859727070771183e-05, - "loss": 1.3042, - "step": 1846 - }, - { - "epoch": 0.5804817716002121, - "grad_norm": 1.203125, - "learning_rate": 1.7857188194224058e-05, - "loss": 1.1429, - "step": 1848 - }, - { - "epoch": 0.5811099986257534, - "grad_norm": 0.73046875, - "learning_rate": 1.785464931767693e-05, - "loss": 1.4471, - "step": 1850 - }, - { - "epoch": 0.5817382256512947, - "grad_norm": 0.6953125, - "learning_rate": 1.78521104411298e-05, - "loss": 1.3808, - "step": 1852 - }, - { - "epoch": 0.5823664526768361, - "grad_norm": 0.94140625, - "learning_rate": 1.7849571564582672e-05, - "loss": 1.3266, - "step": 1854 - }, - { - "epoch": 0.5829946797023774, - "grad_norm": 0.68359375, - "learning_rate": 1.7847032688035547e-05, - "loss": 1.4399, - "step": 1856 - }, - { - "epoch": 0.5836229067279188, - "grad_norm": 0.75, - "learning_rate": 1.784449381148842e-05, - "loss": 1.2884, - "step": 1858 - }, - { - "epoch": 0.5842511337534602, - "grad_norm": 0.6796875, - "learning_rate": 1.784195493494129e-05, - "loss": 1.3308, - "step": 1860 - }, - { - "epoch": 0.5848793607790015, - "grad_norm": 0.7890625, - "learning_rate": 1.783941605839416e-05, - "loss": 1.3215, - "step": 1862 - }, - { - "epoch": 0.5855075878045428, - "grad_norm": 0.8671875, - "learning_rate": 1.7836877181847036e-05, - "loss": 1.4684, - "step": 1864 - }, - { - "epoch": 0.5861358148300843, - "grad_norm": 0.6875, - "learning_rate": 1.7834338305299904e-05, - "loss": 1.293, - "step": 1866 - }, - { - "epoch": 0.5867640418556256, - "grad_norm": 0.7578125, - "learning_rate": 1.783179942875278e-05, - "loss": 1.2667, - "step": 1868 - }, - { - "epoch": 0.5873922688811669, - "grad_norm": 0.76953125, - "learning_rate": 1.782926055220565e-05, - "loss": 1.3243, - "step": 1870 - }, - { - "epoch": 0.5880204959067082, - "grad_norm": 0.79296875, - "learning_rate": 1.782672167565852e-05, - "loss": 1.2651, - "step": 1872 - }, - { - "epoch": 0.5886487229322497, - "grad_norm": 0.69921875, - "learning_rate": 1.7824182799111393e-05, - "loss": 1.2973, - "step": 1874 - }, - { - "epoch": 0.589276949957791, - "grad_norm": 0.73828125, - "learning_rate": 1.7821643922564268e-05, - "loss": 1.2823, - "step": 1876 - }, - { - "epoch": 0.5899051769833323, - "grad_norm": 0.94921875, - "learning_rate": 1.781910504601714e-05, - "loss": 1.415, - "step": 1878 - }, - { - "epoch": 0.5905334040088737, - "grad_norm": 0.76171875, - "learning_rate": 1.781656616947001e-05, - "loss": 1.2477, - "step": 1880 - }, - { - "epoch": 0.5911616310344151, - "grad_norm": 0.80078125, - "learning_rate": 1.7814027292922885e-05, - "loss": 1.2649, - "step": 1882 - }, - { - "epoch": 0.5917898580599564, - "grad_norm": 0.64453125, - "learning_rate": 1.7811488416375756e-05, - "loss": 1.3797, - "step": 1884 - }, - { - "epoch": 0.5924180850854978, - "grad_norm": 0.75390625, - "learning_rate": 1.7808949539828628e-05, - "loss": 1.3717, - "step": 1886 - }, - { - "epoch": 0.5930463121110391, - "grad_norm": 0.70703125, - "learning_rate": 1.78064106632815e-05, - "loss": 1.2677, - "step": 1888 - }, - { - "epoch": 0.5936745391365805, - "grad_norm": 0.78515625, - "learning_rate": 1.7803871786734374e-05, - "loss": 1.4157, - "step": 1890 - }, - { - "epoch": 0.5943027661621219, - "grad_norm": 0.6875, - "learning_rate": 1.7801332910187242e-05, - "loss": 1.2478, - "step": 1892 - }, - { - "epoch": 0.5949309931876632, - "grad_norm": 0.73046875, - "learning_rate": 1.7798794033640117e-05, - "loss": 1.3108, - "step": 1894 - }, - { - "epoch": 0.5955592202132045, - "grad_norm": 0.75, - "learning_rate": 1.7796255157092988e-05, - "loss": 1.3043, - "step": 1896 - }, - { - "epoch": 0.596187447238746, - "grad_norm": 0.703125, - "learning_rate": 1.779371628054586e-05, - "loss": 1.259, - "step": 1898 - }, - { - "epoch": 0.5968156742642873, - "grad_norm": 0.83203125, - "learning_rate": 1.779117740399873e-05, - "loss": 1.2671, - "step": 1900 - }, - { - "epoch": 0.5974439012898286, - "grad_norm": 0.7734375, - "learning_rate": 1.7788638527451606e-05, - "loss": 1.3808, - "step": 1902 - }, - { - "epoch": 0.5980721283153699, - "grad_norm": 0.8828125, - "learning_rate": 1.7786099650904477e-05, - "loss": 1.3359, - "step": 1904 - }, - { - "epoch": 0.5987003553409114, - "grad_norm": 0.8359375, - "learning_rate": 1.778356077435735e-05, - "loss": 1.3205, - "step": 1906 - }, - { - "epoch": 0.5993285823664527, - "grad_norm": 0.73828125, - "learning_rate": 1.778102189781022e-05, - "loss": 1.3357, - "step": 1908 - }, - { - "epoch": 0.599956809391994, - "grad_norm": 0.76953125, - "learning_rate": 1.7778483021263095e-05, - "loss": 1.3952, - "step": 1910 - }, - { - "epoch": 0.6005850364175354, - "grad_norm": 0.828125, - "learning_rate": 1.7775944144715963e-05, - "loss": 1.2332, - "step": 1912 - }, - { - "epoch": 0.6012132634430768, - "grad_norm": 0.828125, - "learning_rate": 1.7773405268168837e-05, - "loss": 1.3406, - "step": 1914 - }, - { - "epoch": 0.6018414904686181, - "grad_norm": 0.71875, - "learning_rate": 1.777086639162171e-05, - "loss": 1.3423, - "step": 1916 - }, - { - "epoch": 0.6024697174941595, - "grad_norm": 0.74609375, - "learning_rate": 1.776832751507458e-05, - "loss": 1.3578, - "step": 1918 - }, - { - "epoch": 0.6030979445197008, - "grad_norm": 0.65625, - "learning_rate": 1.776578863852745e-05, - "loss": 1.3513, - "step": 1920 - }, - { - "epoch": 0.6037261715452421, - "grad_norm": 0.8203125, - "learning_rate": 1.7763249761980326e-05, - "loss": 1.2171, - "step": 1922 - }, - { - "epoch": 0.6043543985707835, - "grad_norm": 0.72265625, - "learning_rate": 1.7760710885433198e-05, - "loss": 1.3335, - "step": 1924 - }, - { - "epoch": 0.6049826255963249, - "grad_norm": 0.83203125, - "learning_rate": 1.775817200888607e-05, - "loss": 1.3946, - "step": 1926 - }, - { - "epoch": 0.6056108526218662, - "grad_norm": 0.76953125, - "learning_rate": 1.775563313233894e-05, - "loss": 1.2521, - "step": 1928 - }, - { - "epoch": 0.6062390796474075, - "grad_norm": 0.78125, - "learning_rate": 1.7753094255791815e-05, - "loss": 1.4663, - "step": 1930 - }, - { - "epoch": 0.606867306672949, - "grad_norm": 0.71484375, - "learning_rate": 1.7750555379244687e-05, - "loss": 1.1201, - "step": 1932 - }, - { - "epoch": 0.6074955336984903, - "grad_norm": 0.78515625, - "learning_rate": 1.7748016502697558e-05, - "loss": 1.4028, - "step": 1934 - }, - { - "epoch": 0.6081237607240316, - "grad_norm": 0.7734375, - "learning_rate": 1.774547762615043e-05, - "loss": 1.2642, - "step": 1936 - }, - { - "epoch": 0.608751987749573, - "grad_norm": 0.76953125, - "learning_rate": 1.77429387496033e-05, - "loss": 1.2945, - "step": 1938 - }, - { - "epoch": 0.6093802147751144, - "grad_norm": 0.76953125, - "learning_rate": 1.7740399873056172e-05, - "loss": 1.265, - "step": 1940 - }, - { - "epoch": 0.6100084418006557, - "grad_norm": 0.859375, - "learning_rate": 1.7737860996509047e-05, - "loss": 1.4148, - "step": 1942 - }, - { - "epoch": 0.610636668826197, - "grad_norm": 0.66796875, - "learning_rate": 1.7735322119961918e-05, - "loss": 1.2506, - "step": 1944 - }, - { - "epoch": 0.6112648958517384, - "grad_norm": 0.90234375, - "learning_rate": 1.773278324341479e-05, - "loss": 1.3281, - "step": 1946 - }, - { - "epoch": 0.6118931228772798, - "grad_norm": 0.7109375, - "learning_rate": 1.773024436686766e-05, - "loss": 1.273, - "step": 1948 - }, - { - "epoch": 0.6125213499028211, - "grad_norm": 0.75, - "learning_rate": 1.7727705490320536e-05, - "loss": 1.2533, - "step": 1950 - }, - { - "epoch": 0.6131495769283625, - "grad_norm": 0.77734375, - "learning_rate": 1.7725166613773407e-05, - "loss": 1.2262, - "step": 1952 - }, - { - "epoch": 0.6137778039539038, - "grad_norm": 0.72265625, - "learning_rate": 1.772262773722628e-05, - "loss": 1.2834, - "step": 1954 - }, - { - "epoch": 0.6144060309794452, - "grad_norm": 0.6796875, - "learning_rate": 1.772008886067915e-05, - "loss": 1.286, - "step": 1956 - }, - { - "epoch": 0.6150342580049866, - "grad_norm": 0.7265625, - "learning_rate": 1.7717549984132025e-05, - "loss": 1.2474, - "step": 1958 - }, - { - "epoch": 0.6156624850305279, - "grad_norm": 0.71484375, - "learning_rate": 1.7715011107584893e-05, - "loss": 1.345, - "step": 1960 - }, - { - "epoch": 0.6162907120560692, - "grad_norm": 0.79296875, - "learning_rate": 1.7712472231037767e-05, - "loss": 1.2253, - "step": 1962 - }, - { - "epoch": 0.6169189390816107, - "grad_norm": 0.76953125, - "learning_rate": 1.770993335449064e-05, - "loss": 1.3628, - "step": 1964 - }, - { - "epoch": 0.617547166107152, - "grad_norm": 0.76953125, - "learning_rate": 1.770739447794351e-05, - "loss": 1.2676, - "step": 1966 - }, - { - "epoch": 0.6181753931326933, - "grad_norm": 0.72265625, - "learning_rate": 1.7704855601396385e-05, - "loss": 1.2463, - "step": 1968 - }, - { - "epoch": 0.6188036201582346, - "grad_norm": 0.7109375, - "learning_rate": 1.7702316724849256e-05, - "loss": 1.3617, - "step": 1970 - }, - { - "epoch": 0.6194318471837761, - "grad_norm": 0.83984375, - "learning_rate": 1.7699777848302128e-05, - "loss": 1.4785, - "step": 1972 - }, - { - "epoch": 0.6200600742093174, - "grad_norm": 0.76953125, - "learning_rate": 1.7697238971755e-05, - "loss": 1.2933, - "step": 1974 - }, - { - "epoch": 0.6206883012348587, - "grad_norm": 0.70703125, - "learning_rate": 1.7694700095207874e-05, - "loss": 1.4211, - "step": 1976 - }, - { - "epoch": 0.6213165282604001, - "grad_norm": 0.734375, - "learning_rate": 1.7692161218660745e-05, - "loss": 1.4411, - "step": 1978 - }, - { - "epoch": 0.6219447552859415, - "grad_norm": 0.70703125, - "learning_rate": 1.7689622342113617e-05, - "loss": 1.2598, - "step": 1980 - }, - { - "epoch": 0.6225729823114828, - "grad_norm": 0.71875, - "learning_rate": 1.7687083465566488e-05, - "loss": 1.2098, - "step": 1982 - }, - { - "epoch": 0.6232012093370242, - "grad_norm": 0.69921875, - "learning_rate": 1.7684544589019363e-05, - "loss": 1.3236, - "step": 1984 - }, - { - "epoch": 0.6238294363625655, - "grad_norm": 0.73046875, - "learning_rate": 1.768200571247223e-05, - "loss": 1.3541, - "step": 1986 - }, - { - "epoch": 0.6244576633881069, - "grad_norm": 0.84765625, - "learning_rate": 1.7679466835925106e-05, - "loss": 1.2746, - "step": 1988 - }, - { - "epoch": 0.6250858904136483, - "grad_norm": 0.86328125, - "learning_rate": 1.7676927959377977e-05, - "loss": 1.3703, - "step": 1990 - }, - { - "epoch": 0.6257141174391896, - "grad_norm": 0.80859375, - "learning_rate": 1.767438908283085e-05, - "loss": 1.2673, - "step": 1992 - }, - { - "epoch": 0.6263423444647309, - "grad_norm": 0.88671875, - "learning_rate": 1.767185020628372e-05, - "loss": 1.2734, - "step": 1994 - }, - { - "epoch": 0.6269705714902722, - "grad_norm": 0.8125, - "learning_rate": 1.7669311329736595e-05, - "loss": 1.2994, - "step": 1996 - }, - { - "epoch": 0.6275987985158137, - "grad_norm": 0.84765625, - "learning_rate": 1.7666772453189466e-05, - "loss": 1.2314, - "step": 1998 - }, - { - "epoch": 0.628227025541355, - "grad_norm": 0.71875, - "learning_rate": 1.7664233576642337e-05, - "loss": 1.3692, - "step": 2000 - }, - { - "epoch": 0.6288552525668963, - "grad_norm": 0.703125, - "learning_rate": 1.766169470009521e-05, - "loss": 1.1083, - "step": 2002 - }, - { - "epoch": 0.6294834795924377, - "grad_norm": 0.71875, - "learning_rate": 1.7659155823548083e-05, - "loss": 1.3513, - "step": 2004 - }, - { - "epoch": 0.6301117066179791, - "grad_norm": 0.71875, - "learning_rate": 1.765661694700095e-05, - "loss": 1.2768, - "step": 2006 - }, - { - "epoch": 0.6307399336435204, - "grad_norm": 0.77734375, - "learning_rate": 1.7654078070453826e-05, - "loss": 1.399, - "step": 2008 - }, - { - "epoch": 0.6313681606690618, - "grad_norm": 0.7734375, - "learning_rate": 1.7651539193906698e-05, - "loss": 1.3596, - "step": 2010 - }, - { - "epoch": 0.6319963876946031, - "grad_norm": 0.99609375, - "learning_rate": 1.764900031735957e-05, - "loss": 1.3298, - "step": 2012 - }, - { - "epoch": 0.6326246147201445, - "grad_norm": 0.81640625, - "learning_rate": 1.764646144081244e-05, - "loss": 1.3194, - "step": 2014 - }, - { - "epoch": 0.6332528417456859, - "grad_norm": 0.78125, - "learning_rate": 1.7643922564265315e-05, - "loss": 1.2478, - "step": 2016 - }, - { - "epoch": 0.6338810687712272, - "grad_norm": 0.78125, - "learning_rate": 1.7641383687718187e-05, - "loss": 1.285, - "step": 2018 - }, - { - "epoch": 0.6345092957967685, - "grad_norm": 0.75, - "learning_rate": 1.7638844811171058e-05, - "loss": 1.4251, - "step": 2020 - }, - { - "epoch": 0.63513752282231, - "grad_norm": 0.97265625, - "learning_rate": 1.763630593462393e-05, - "loss": 1.281, - "step": 2022 - }, - { - "epoch": 0.6357657498478513, - "grad_norm": 0.859375, - "learning_rate": 1.7633767058076804e-05, - "loss": 1.3546, - "step": 2024 - }, - { - "epoch": 0.6363939768733926, - "grad_norm": 0.6796875, - "learning_rate": 1.7631228181529672e-05, - "loss": 1.2134, - "step": 2026 - }, - { - "epoch": 0.6370222038989339, - "grad_norm": 0.7734375, - "learning_rate": 1.7628689304982547e-05, - "loss": 1.352, - "step": 2028 - }, - { - "epoch": 0.6376504309244754, - "grad_norm": 0.69140625, - "learning_rate": 1.7626150428435418e-05, - "loss": 1.3923, - "step": 2030 - }, - { - "epoch": 0.6382786579500167, - "grad_norm": 0.69140625, - "learning_rate": 1.762361155188829e-05, - "loss": 1.3658, - "step": 2032 - }, - { - "epoch": 0.638906884975558, - "grad_norm": 0.96875, - "learning_rate": 1.762107267534116e-05, - "loss": 1.2421, - "step": 2034 - }, - { - "epoch": 0.6395351120010994, - "grad_norm": 0.71875, - "learning_rate": 1.7618533798794036e-05, - "loss": 1.3427, - "step": 2036 - }, - { - "epoch": 0.6401633390266408, - "grad_norm": 0.8515625, - "learning_rate": 1.7615994922246907e-05, - "loss": 1.3342, - "step": 2038 - }, - { - "epoch": 0.6407915660521821, - "grad_norm": 0.7578125, - "learning_rate": 1.761345604569978e-05, - "loss": 1.4221, - "step": 2040 - }, - { - "epoch": 0.6414197930777235, - "grad_norm": 0.73046875, - "learning_rate": 1.761091716915265e-05, - "loss": 1.3898, - "step": 2042 - }, - { - "epoch": 0.6420480201032648, - "grad_norm": 0.703125, - "learning_rate": 1.7608378292605525e-05, - "loss": 1.5576, - "step": 2044 - }, - { - "epoch": 0.6426762471288062, - "grad_norm": 0.79296875, - "learning_rate": 1.7605839416058396e-05, - "loss": 1.3117, - "step": 2046 - }, - { - "epoch": 0.6433044741543475, - "grad_norm": 0.76171875, - "learning_rate": 1.7603300539511267e-05, - "loss": 1.2932, - "step": 2048 - }, - { - "epoch": 0.6439327011798889, - "grad_norm": 0.7734375, - "learning_rate": 1.7600761662964142e-05, - "loss": 1.2463, - "step": 2050 - }, - { - "epoch": 0.6445609282054302, - "grad_norm": 0.703125, - "learning_rate": 1.759822278641701e-05, - "loss": 1.3657, - "step": 2052 - }, - { - "epoch": 0.6451891552309716, - "grad_norm": 0.7734375, - "learning_rate": 1.7595683909869885e-05, - "loss": 1.4386, - "step": 2054 - }, - { - "epoch": 0.645817382256513, - "grad_norm": 0.80078125, - "learning_rate": 1.7593145033322756e-05, - "loss": 1.3022, - "step": 2056 - }, - { - "epoch": 0.6464456092820543, - "grad_norm": 1.0546875, - "learning_rate": 1.7590606156775628e-05, - "loss": 1.5185, - "step": 2058 - }, - { - "epoch": 0.6470738363075956, - "grad_norm": 0.890625, - "learning_rate": 1.75880672802285e-05, - "loss": 1.1322, - "step": 2060 - }, - { - "epoch": 0.647702063333137, - "grad_norm": 0.671875, - "learning_rate": 1.7585528403681374e-05, - "loss": 1.3653, - "step": 2062 - }, - { - "epoch": 0.6483302903586784, - "grad_norm": 0.81640625, - "learning_rate": 1.7582989527134245e-05, - "loss": 1.2649, - "step": 2064 - }, - { - "epoch": 0.6489585173842197, - "grad_norm": 0.7578125, - "learning_rate": 1.7580450650587117e-05, - "loss": 1.4218, - "step": 2066 - }, - { - "epoch": 0.649586744409761, - "grad_norm": 0.84375, - "learning_rate": 1.7577911774039988e-05, - "loss": 1.42, - "step": 2068 - }, - { - "epoch": 0.6502149714353024, - "grad_norm": 0.83984375, - "learning_rate": 1.7575372897492863e-05, - "loss": 1.2207, - "step": 2070 - }, - { - "epoch": 0.6508431984608438, - "grad_norm": 0.73046875, - "learning_rate": 1.7572834020945734e-05, - "loss": 1.3358, - "step": 2072 - }, - { - "epoch": 0.6514714254863851, - "grad_norm": 0.921875, - "learning_rate": 1.7570295144398606e-05, - "loss": 1.2739, - "step": 2074 - }, - { - "epoch": 0.6520996525119265, - "grad_norm": 0.8125, - "learning_rate": 1.7567756267851477e-05, - "loss": 1.3296, - "step": 2076 - }, - { - "epoch": 0.6527278795374678, - "grad_norm": 0.76171875, - "learning_rate": 1.756521739130435e-05, - "loss": 1.3499, - "step": 2078 - }, - { - "epoch": 0.6533561065630092, - "grad_norm": 0.78515625, - "learning_rate": 1.756267851475722e-05, - "loss": 1.3431, - "step": 2080 - }, - { - "epoch": 0.6539843335885506, - "grad_norm": 0.6796875, - "learning_rate": 1.7560139638210094e-05, - "loss": 1.4784, - "step": 2082 - }, - { - "epoch": 0.6546125606140919, - "grad_norm": 0.7265625, - "learning_rate": 1.7557600761662966e-05, - "loss": 1.2925, - "step": 2084 - }, - { - "epoch": 0.6552407876396332, - "grad_norm": 0.828125, - "learning_rate": 1.7555061885115837e-05, - "loss": 1.2877, - "step": 2086 - }, - { - "epoch": 0.6558690146651747, - "grad_norm": 0.74609375, - "learning_rate": 1.755252300856871e-05, - "loss": 1.3971, - "step": 2088 - }, - { - "epoch": 0.656497241690716, - "grad_norm": 0.69921875, - "learning_rate": 1.7549984132021583e-05, - "loss": 1.2842, - "step": 2090 - }, - { - "epoch": 0.6571254687162573, - "grad_norm": 0.734375, - "learning_rate": 1.7547445255474455e-05, - "loss": 1.1632, - "step": 2092 - }, - { - "epoch": 0.6577536957417986, - "grad_norm": 0.828125, - "learning_rate": 1.7544906378927326e-05, - "loss": 1.3903, - "step": 2094 - }, - { - "epoch": 0.6583819227673401, - "grad_norm": 0.7421875, - "learning_rate": 1.7542367502380198e-05, - "loss": 1.4009, - "step": 2096 - }, - { - "epoch": 0.6590101497928814, - "grad_norm": 0.75390625, - "learning_rate": 1.7539828625833072e-05, - "loss": 1.353, - "step": 2098 - }, - { - "epoch": 0.6596383768184227, - "grad_norm": 0.7421875, - "learning_rate": 1.753728974928594e-05, - "loss": 1.4069, - "step": 2100 - }, - { - "epoch": 0.6602666038439641, - "grad_norm": 0.828125, - "learning_rate": 1.7534750872738815e-05, - "loss": 1.2694, - "step": 2102 - }, - { - "epoch": 0.6608948308695055, - "grad_norm": 0.6796875, - "learning_rate": 1.7532211996191686e-05, - "loss": 1.2923, - "step": 2104 - }, - { - "epoch": 0.6615230578950468, - "grad_norm": 0.75390625, - "learning_rate": 1.7529673119644558e-05, - "loss": 1.2756, - "step": 2106 - }, - { - "epoch": 0.6621512849205882, - "grad_norm": 0.7421875, - "learning_rate": 1.752713424309743e-05, - "loss": 1.4151, - "step": 2108 - }, - { - "epoch": 0.6627795119461295, - "grad_norm": 0.71875, - "learning_rate": 1.7524595366550304e-05, - "loss": 1.3067, - "step": 2110 - }, - { - "epoch": 0.6634077389716709, - "grad_norm": 0.70703125, - "learning_rate": 1.7522056490003175e-05, - "loss": 1.3295, - "step": 2112 - }, - { - "epoch": 0.6640359659972123, - "grad_norm": 0.7421875, - "learning_rate": 1.7519517613456047e-05, - "loss": 1.3994, - "step": 2114 - }, - { - "epoch": 0.6646641930227536, - "grad_norm": 0.79296875, - "learning_rate": 1.7516978736908918e-05, - "loss": 1.3512, - "step": 2116 - }, - { - "epoch": 0.6652924200482949, - "grad_norm": 0.71484375, - "learning_rate": 1.7514439860361793e-05, - "loss": 1.2393, - "step": 2118 - }, - { - "epoch": 0.6659206470738364, - "grad_norm": 0.7734375, - "learning_rate": 1.751190098381466e-05, - "loss": 1.2977, - "step": 2120 - }, - { - "epoch": 0.6665488740993777, - "grad_norm": 0.73828125, - "learning_rate": 1.7509362107267536e-05, - "loss": 1.4039, - "step": 2122 - }, - { - "epoch": 0.667177101124919, - "grad_norm": 0.7265625, - "learning_rate": 1.7506823230720407e-05, - "loss": 1.3294, - "step": 2124 - }, - { - "epoch": 0.6678053281504603, - "grad_norm": 0.69921875, - "learning_rate": 1.750428435417328e-05, - "loss": 1.2816, - "step": 2126 - }, - { - "epoch": 0.6684335551760018, - "grad_norm": 0.75, - "learning_rate": 1.750174547762615e-05, - "loss": 1.3298, - "step": 2128 - }, - { - "epoch": 0.6690617822015431, - "grad_norm": 0.6796875, - "learning_rate": 1.7499206601079025e-05, - "loss": 1.3823, - "step": 2130 - }, - { - "epoch": 0.6696900092270844, - "grad_norm": 0.72265625, - "learning_rate": 1.7496667724531896e-05, - "loss": 1.2973, - "step": 2132 - }, - { - "epoch": 0.6703182362526258, - "grad_norm": 0.67578125, - "learning_rate": 1.7494128847984767e-05, - "loss": 1.3873, - "step": 2134 - }, - { - "epoch": 0.6709464632781671, - "grad_norm": 0.71484375, - "learning_rate": 1.7491589971437642e-05, - "loss": 1.3746, - "step": 2136 - }, - { - "epoch": 0.6715746903037085, - "grad_norm": 0.71875, - "learning_rate": 1.7489051094890514e-05, - "loss": 1.2803, - "step": 2138 - }, - { - "epoch": 0.6722029173292499, - "grad_norm": 0.78515625, - "learning_rate": 1.7486512218343385e-05, - "loss": 1.3632, - "step": 2140 - }, - { - "epoch": 0.6728311443547912, - "grad_norm": 0.75, - "learning_rate": 1.7483973341796256e-05, - "loss": 1.3377, - "step": 2142 - }, - { - "epoch": 0.6734593713803325, - "grad_norm": 0.69921875, - "learning_rate": 1.748143446524913e-05, - "loss": 1.2896, - "step": 2144 - }, - { - "epoch": 0.674087598405874, - "grad_norm": 0.890625, - "learning_rate": 1.7478895588702e-05, - "loss": 1.2543, - "step": 2146 - }, - { - "epoch": 0.6747158254314153, - "grad_norm": 0.87109375, - "learning_rate": 1.7476356712154874e-05, - "loss": 1.2882, - "step": 2148 - }, - { - "epoch": 0.6753440524569566, - "grad_norm": 0.86328125, - "learning_rate": 1.7473817835607745e-05, - "loss": 1.3234, - "step": 2150 - }, - { - "epoch": 0.6759722794824979, - "grad_norm": 0.75390625, - "learning_rate": 1.7471278959060617e-05, - "loss": 1.2965, - "step": 2152 - }, - { - "epoch": 0.6766005065080394, - "grad_norm": 0.67578125, - "learning_rate": 1.7468740082513488e-05, - "loss": 1.4172, - "step": 2154 - }, - { - "epoch": 0.6772287335335807, - "grad_norm": 0.73828125, - "learning_rate": 1.7466201205966363e-05, - "loss": 1.3369, - "step": 2156 - }, - { - "epoch": 0.677856960559122, - "grad_norm": 0.7265625, - "learning_rate": 1.7463662329419234e-05, - "loss": 1.3239, - "step": 2158 - }, - { - "epoch": 0.6784851875846634, - "grad_norm": 0.7265625, - "learning_rate": 1.7461123452872105e-05, - "loss": 1.2926, - "step": 2160 - }, - { - "epoch": 0.6791134146102048, - "grad_norm": 0.83203125, - "learning_rate": 1.7458584576324977e-05, - "loss": 1.3588, - "step": 2162 - }, - { - "epoch": 0.6797416416357461, - "grad_norm": 0.87109375, - "learning_rate": 1.745604569977785e-05, - "loss": 1.1972, - "step": 2164 - }, - { - "epoch": 0.6803698686612875, - "grad_norm": 0.71484375, - "learning_rate": 1.7453506823230723e-05, - "loss": 1.2391, - "step": 2166 - }, - { - "epoch": 0.6809980956868288, - "grad_norm": 0.82421875, - "learning_rate": 1.7450967946683594e-05, - "loss": 1.3438, - "step": 2168 - }, - { - "epoch": 0.6816263227123702, - "grad_norm": 0.72265625, - "learning_rate": 1.7448429070136466e-05, - "loss": 1.4117, - "step": 2170 - }, - { - "epoch": 0.6822545497379116, - "grad_norm": 0.79296875, - "learning_rate": 1.7445890193589337e-05, - "loss": 1.388, - "step": 2172 - }, - { - "epoch": 0.6828827767634529, - "grad_norm": 0.6875, - "learning_rate": 1.744335131704221e-05, - "loss": 1.3602, - "step": 2174 - }, - { - "epoch": 0.6835110037889942, - "grad_norm": 1.0546875, - "learning_rate": 1.7440812440495083e-05, - "loss": 1.2999, - "step": 2176 - }, - { - "epoch": 0.6841392308145356, - "grad_norm": 0.828125, - "learning_rate": 1.7438273563947955e-05, - "loss": 1.3296, - "step": 2178 - }, - { - "epoch": 0.684767457840077, - "grad_norm": 0.78125, - "learning_rate": 1.7435734687400826e-05, - "loss": 1.302, - "step": 2180 - }, - { - "epoch": 0.6853956848656183, - "grad_norm": 0.73046875, - "learning_rate": 1.7433195810853697e-05, - "loss": 1.321, - "step": 2182 - }, - { - "epoch": 0.6860239118911596, - "grad_norm": 0.78515625, - "learning_rate": 1.7430656934306572e-05, - "loss": 1.3628, - "step": 2184 - }, - { - "epoch": 0.6866521389167011, - "grad_norm": 0.8671875, - "learning_rate": 1.7428118057759444e-05, - "loss": 1.3183, - "step": 2186 - }, - { - "epoch": 0.6872803659422424, - "grad_norm": 0.921875, - "learning_rate": 1.7425579181212315e-05, - "loss": 1.4956, - "step": 2188 - }, - { - "epoch": 0.6879085929677837, - "grad_norm": 0.7265625, - "learning_rate": 1.7423040304665186e-05, - "loss": 1.4264, - "step": 2190 - }, - { - "epoch": 0.688536819993325, - "grad_norm": 0.765625, - "learning_rate": 1.742050142811806e-05, - "loss": 1.3176, - "step": 2192 - }, - { - "epoch": 0.6891650470188665, - "grad_norm": 0.78515625, - "learning_rate": 1.741796255157093e-05, - "loss": 1.3268, - "step": 2194 - }, - { - "epoch": 0.6897932740444078, - "grad_norm": 1.046875, - "learning_rate": 1.7415423675023804e-05, - "loss": 1.346, - "step": 2196 - }, - { - "epoch": 0.6904215010699492, - "grad_norm": 0.80078125, - "learning_rate": 1.7412884798476675e-05, - "loss": 1.3614, - "step": 2198 - }, - { - "epoch": 0.6910497280954905, - "grad_norm": 0.7265625, - "learning_rate": 1.7410345921929547e-05, - "loss": 1.2779, - "step": 2200 - }, - { - "epoch": 0.6916779551210319, - "grad_norm": 0.7265625, - "learning_rate": 1.7407807045382418e-05, - "loss": 1.2913, - "step": 2202 - }, - { - "epoch": 0.6923061821465732, - "grad_norm": 0.86328125, - "learning_rate": 1.7405268168835293e-05, - "loss": 1.3669, - "step": 2204 - }, - { - "epoch": 0.6929344091721146, - "grad_norm": 0.71484375, - "learning_rate": 1.7402729292288164e-05, - "loss": 1.2992, - "step": 2206 - }, - { - "epoch": 0.6935626361976559, - "grad_norm": 0.80078125, - "learning_rate": 1.7400190415741036e-05, - "loss": 1.3157, - "step": 2208 - }, - { - "epoch": 0.6941908632231972, - "grad_norm": 0.828125, - "learning_rate": 1.7397651539193907e-05, - "loss": 1.1712, - "step": 2210 - }, - { - "epoch": 0.6948190902487387, - "grad_norm": 0.78515625, - "learning_rate": 1.7395112662646782e-05, - "loss": 1.1813, - "step": 2212 - }, - { - "epoch": 0.69544731727428, - "grad_norm": 0.76171875, - "learning_rate": 1.739257378609965e-05, - "loss": 1.3688, - "step": 2214 - }, - { - "epoch": 0.6960755442998213, - "grad_norm": 0.9453125, - "learning_rate": 1.7390034909552525e-05, - "loss": 1.3554, - "step": 2216 - }, - { - "epoch": 0.6967037713253627, - "grad_norm": 0.87890625, - "learning_rate": 1.7387496033005396e-05, - "loss": 1.3605, - "step": 2218 - }, - { - "epoch": 0.6973319983509041, - "grad_norm": 0.890625, - "learning_rate": 1.7384957156458267e-05, - "loss": 1.2138, - "step": 2220 - }, - { - "epoch": 0.6979602253764454, - "grad_norm": 0.8515625, - "learning_rate": 1.7382418279911142e-05, - "loss": 1.309, - "step": 2222 - }, - { - "epoch": 0.6985884524019867, - "grad_norm": 0.78515625, - "learning_rate": 1.7379879403364013e-05, - "loss": 1.2578, - "step": 2224 - }, - { - "epoch": 0.6992166794275281, - "grad_norm": 0.78515625, - "learning_rate": 1.7377340526816885e-05, - "loss": 1.3457, - "step": 2226 - }, - { - "epoch": 0.6998449064530695, - "grad_norm": 0.7890625, - "learning_rate": 1.7374801650269756e-05, - "loss": 1.2938, - "step": 2228 - }, - { - "epoch": 0.7004731334786108, - "grad_norm": 0.8984375, - "learning_rate": 1.737226277372263e-05, - "loss": 1.284, - "step": 2230 - }, - { - "epoch": 0.7011013605041522, - "grad_norm": 0.72265625, - "learning_rate": 1.7369723897175502e-05, - "loss": 1.415, - "step": 2232 - }, - { - "epoch": 0.7017295875296935, - "grad_norm": 0.94921875, - "learning_rate": 1.7367185020628374e-05, - "loss": 1.2291, - "step": 2234 - }, - { - "epoch": 0.7023578145552349, - "grad_norm": 0.74609375, - "learning_rate": 1.7364646144081245e-05, - "loss": 1.374, - "step": 2236 - }, - { - "epoch": 0.7029860415807763, - "grad_norm": 0.6953125, - "learning_rate": 1.736210726753412e-05, - "loss": 1.4697, - "step": 2238 - }, - { - "epoch": 0.7036142686063176, - "grad_norm": 0.71484375, - "learning_rate": 1.7359568390986988e-05, - "loss": 1.2784, - "step": 2240 - }, - { - "epoch": 0.7042424956318589, - "grad_norm": 0.73828125, - "learning_rate": 1.7357029514439863e-05, - "loss": 1.2381, - "step": 2242 - }, - { - "epoch": 0.7048707226574004, - "grad_norm": 0.78125, - "learning_rate": 1.7354490637892734e-05, - "loss": 1.2173, - "step": 2244 - }, - { - "epoch": 0.7054989496829417, - "grad_norm": 0.77734375, - "learning_rate": 1.7351951761345605e-05, - "loss": 1.2839, - "step": 2246 - }, - { - "epoch": 0.706127176708483, - "grad_norm": 0.6953125, - "learning_rate": 1.7349412884798477e-05, - "loss": 1.3768, - "step": 2248 - }, - { - "epoch": 0.7067554037340243, - "grad_norm": 0.81640625, - "learning_rate": 1.734687400825135e-05, - "loss": 1.3607, - "step": 2250 - }, - { - "epoch": 0.7073836307595658, - "grad_norm": 0.703125, - "learning_rate": 1.7344335131704223e-05, - "loss": 1.3943, - "step": 2252 - }, - { - "epoch": 0.7080118577851071, - "grad_norm": 0.6875, - "learning_rate": 1.7341796255157094e-05, - "loss": 1.4092, - "step": 2254 - }, - { - "epoch": 0.7086400848106484, - "grad_norm": 0.75, - "learning_rate": 1.7339257378609966e-05, - "loss": 1.2429, - "step": 2256 - }, - { - "epoch": 0.7092683118361898, - "grad_norm": 0.7109375, - "learning_rate": 1.733671850206284e-05, - "loss": 1.458, - "step": 2258 - }, - { - "epoch": 0.7098965388617312, - "grad_norm": 0.67578125, - "learning_rate": 1.7334179625515712e-05, - "loss": 1.3227, - "step": 2260 - }, - { - "epoch": 0.7105247658872725, - "grad_norm": 0.79296875, - "learning_rate": 1.7331640748968583e-05, - "loss": 1.4453, - "step": 2262 - }, - { - "epoch": 0.7111529929128139, - "grad_norm": 0.74609375, - "learning_rate": 1.7329101872421455e-05, - "loss": 1.2725, - "step": 2264 - }, - { - "epoch": 0.7117812199383552, - "grad_norm": 0.74609375, - "learning_rate": 1.7326562995874326e-05, - "loss": 1.2165, - "step": 2266 - }, - { - "epoch": 0.7124094469638966, - "grad_norm": 0.76171875, - "learning_rate": 1.7324024119327197e-05, - "loss": 1.4287, - "step": 2268 - }, - { - "epoch": 0.713037673989438, - "grad_norm": 0.6484375, - "learning_rate": 1.7321485242780072e-05, - "loss": 1.2783, - "step": 2270 - }, - { - "epoch": 0.7136659010149793, - "grad_norm": 0.6875, - "learning_rate": 1.7318946366232944e-05, - "loss": 1.3641, - "step": 2272 - }, - { - "epoch": 0.7142941280405206, - "grad_norm": 0.7109375, - "learning_rate": 1.7316407489685815e-05, - "loss": 1.3313, - "step": 2274 - }, - { - "epoch": 0.7149223550660619, - "grad_norm": 0.68359375, - "learning_rate": 1.7313868613138686e-05, - "loss": 1.3461, - "step": 2276 - }, - { - "epoch": 0.7155505820916034, - "grad_norm": 0.6953125, - "learning_rate": 1.731132973659156e-05, - "loss": 1.3159, - "step": 2278 - }, - { - "epoch": 0.7161788091171447, - "grad_norm": 0.765625, - "learning_rate": 1.7308790860044432e-05, - "loss": 1.2575, - "step": 2280 - }, - { - "epoch": 0.716807036142686, - "grad_norm": 0.65625, - "learning_rate": 1.7306251983497304e-05, - "loss": 1.3816, - "step": 2282 - }, - { - "epoch": 0.7174352631682274, - "grad_norm": 0.71875, - "learning_rate": 1.7303713106950175e-05, - "loss": 1.4548, - "step": 2284 - }, - { - "epoch": 0.7180634901937688, - "grad_norm": 0.83984375, - "learning_rate": 1.730117423040305e-05, - "loss": 1.2777, - "step": 2286 - }, - { - "epoch": 0.7186917172193101, - "grad_norm": 0.7421875, - "learning_rate": 1.7298635353855918e-05, - "loss": 1.3142, - "step": 2288 - }, - { - "epoch": 0.7193199442448515, - "grad_norm": 0.7890625, - "learning_rate": 1.7296096477308793e-05, - "loss": 1.2618, - "step": 2290 - }, - { - "epoch": 0.7199481712703928, - "grad_norm": 0.70703125, - "learning_rate": 1.7293557600761664e-05, - "loss": 1.3586, - "step": 2292 - }, - { - "epoch": 0.7205763982959342, - "grad_norm": 0.77734375, - "learning_rate": 1.7291018724214536e-05, - "loss": 1.2284, - "step": 2294 - }, - { - "epoch": 0.7212046253214756, - "grad_norm": 0.76953125, - "learning_rate": 1.7288479847667407e-05, - "loss": 1.3143, - "step": 2296 - }, - { - "epoch": 0.7218328523470169, - "grad_norm": 0.73828125, - "learning_rate": 1.728594097112028e-05, - "loss": 1.2988, - "step": 2298 - }, - { - "epoch": 0.7224610793725582, - "grad_norm": 0.78125, - "learning_rate": 1.7283402094573153e-05, - "loss": 1.3754, - "step": 2300 - }, - { - "epoch": 0.7230893063980997, - "grad_norm": 0.69140625, - "learning_rate": 1.7280863218026024e-05, - "loss": 1.3633, - "step": 2302 - }, - { - "epoch": 0.723717533423641, - "grad_norm": 0.71875, - "learning_rate": 1.7278324341478896e-05, - "loss": 1.4773, - "step": 2304 - }, - { - "epoch": 0.7243457604491823, - "grad_norm": 0.77734375, - "learning_rate": 1.727578546493177e-05, - "loss": 1.351, - "step": 2306 - }, - { - "epoch": 0.7249739874747236, - "grad_norm": 0.73828125, - "learning_rate": 1.7273246588384642e-05, - "loss": 1.3577, - "step": 2308 - }, - { - "epoch": 0.7256022145002651, - "grad_norm": 0.81640625, - "learning_rate": 1.7270707711837513e-05, - "loss": 1.2883, - "step": 2310 - }, - { - "epoch": 0.7262304415258064, - "grad_norm": 0.67578125, - "learning_rate": 1.7268168835290388e-05, - "loss": 1.4049, - "step": 2312 - }, - { - "epoch": 0.7268586685513477, - "grad_norm": 0.6796875, - "learning_rate": 1.7265629958743256e-05, - "loss": 1.3443, - "step": 2314 - }, - { - "epoch": 0.7274868955768891, - "grad_norm": 0.90234375, - "learning_rate": 1.726309108219613e-05, - "loss": 1.2131, - "step": 2316 - }, - { - "epoch": 0.7281151226024305, - "grad_norm": 0.71875, - "learning_rate": 1.7260552205649002e-05, - "loss": 1.353, - "step": 2318 - }, - { - "epoch": 0.7287433496279718, - "grad_norm": 0.73828125, - "learning_rate": 1.7258013329101874e-05, - "loss": 1.2911, - "step": 2320 - }, - { - "epoch": 0.7293715766535132, - "grad_norm": 0.90625, - "learning_rate": 1.7255474452554745e-05, - "loss": 1.3567, - "step": 2322 - }, - { - "epoch": 0.7299998036790545, - "grad_norm": 0.70703125, - "learning_rate": 1.725293557600762e-05, - "loss": 1.3589, - "step": 2324 - }, - { - "epoch": 0.7306280307045959, - "grad_norm": 0.87109375, - "learning_rate": 1.725039669946049e-05, - "loss": 1.3734, - "step": 2326 - }, - { - "epoch": 0.7312562577301372, - "grad_norm": 0.6484375, - "learning_rate": 1.7247857822913363e-05, - "loss": 1.3007, - "step": 2328 - }, - { - "epoch": 0.7318844847556786, - "grad_norm": 0.80859375, - "learning_rate": 1.7245318946366234e-05, - "loss": 1.3652, - "step": 2330 - }, - { - "epoch": 0.7325127117812199, - "grad_norm": 0.72265625, - "learning_rate": 1.724278006981911e-05, - "loss": 1.3929, - "step": 2332 - }, - { - "epoch": 0.7331409388067613, - "grad_norm": 0.6640625, - "learning_rate": 1.7240241193271977e-05, - "loss": 1.2893, - "step": 2334 - }, - { - "epoch": 0.7337691658323027, - "grad_norm": 0.77734375, - "learning_rate": 1.723770231672485e-05, - "loss": 1.4986, - "step": 2336 - }, - { - "epoch": 0.734397392857844, - "grad_norm": 0.6875, - "learning_rate": 1.7235163440177723e-05, - "loss": 1.3224, - "step": 2338 - }, - { - "epoch": 0.7350256198833853, - "grad_norm": 0.77734375, - "learning_rate": 1.7232624563630594e-05, - "loss": 1.422, - "step": 2340 - }, - { - "epoch": 0.7356538469089268, - "grad_norm": 0.703125, - "learning_rate": 1.7230085687083466e-05, - "loss": 1.4021, - "step": 2342 - }, - { - "epoch": 0.7362820739344681, - "grad_norm": 0.67578125, - "learning_rate": 1.722754681053634e-05, - "loss": 1.3948, - "step": 2344 - }, - { - "epoch": 0.7369103009600094, - "grad_norm": 0.73046875, - "learning_rate": 1.7225007933989212e-05, - "loss": 1.2958, - "step": 2346 - }, - { - "epoch": 0.7375385279855508, - "grad_norm": 0.734375, - "learning_rate": 1.7222469057442083e-05, - "loss": 1.2972, - "step": 2348 - }, - { - "epoch": 0.7381667550110921, - "grad_norm": 0.68359375, - "learning_rate": 1.7219930180894955e-05, - "loss": 1.3356, - "step": 2350 - }, - { - "epoch": 0.7387949820366335, - "grad_norm": 0.82421875, - "learning_rate": 1.721739130434783e-05, - "loss": 1.2247, - "step": 2352 - }, - { - "epoch": 0.7394232090621748, - "grad_norm": 0.70703125, - "learning_rate": 1.7214852427800697e-05, - "loss": 1.3243, - "step": 2354 - }, - { - "epoch": 0.7400514360877162, - "grad_norm": 0.7265625, - "learning_rate": 1.7212313551253572e-05, - "loss": 1.4064, - "step": 2356 - }, - { - "epoch": 0.7406796631132575, - "grad_norm": 0.77734375, - "learning_rate": 1.7209774674706443e-05, - "loss": 1.4806, - "step": 2358 - }, - { - "epoch": 0.7413078901387989, - "grad_norm": 0.85546875, - "learning_rate": 1.7207235798159315e-05, - "loss": 1.3769, - "step": 2360 - }, - { - "epoch": 0.7419361171643403, - "grad_norm": 0.71875, - "learning_rate": 1.7204696921612186e-05, - "loss": 1.2256, - "step": 2362 - }, - { - "epoch": 0.7425643441898816, - "grad_norm": 0.78125, - "learning_rate": 1.720215804506506e-05, - "loss": 1.389, - "step": 2364 - }, - { - "epoch": 0.7431925712154229, - "grad_norm": 0.6796875, - "learning_rate": 1.7199619168517932e-05, - "loss": 1.4362, - "step": 2366 - }, - { - "epoch": 0.7438207982409644, - "grad_norm": 0.8984375, - "learning_rate": 1.7197080291970804e-05, - "loss": 1.4191, - "step": 2368 - }, - { - "epoch": 0.7444490252665057, - "grad_norm": 0.7265625, - "learning_rate": 1.7194541415423675e-05, - "loss": 1.3115, - "step": 2370 - }, - { - "epoch": 0.745077252292047, - "grad_norm": 0.7578125, - "learning_rate": 1.719200253887655e-05, - "loss": 1.4019, - "step": 2372 - }, - { - "epoch": 0.7457054793175883, - "grad_norm": 0.734375, - "learning_rate": 1.718946366232942e-05, - "loss": 1.3587, - "step": 2374 - }, - { - "epoch": 0.7463337063431298, - "grad_norm": 0.87109375, - "learning_rate": 1.7186924785782293e-05, - "loss": 1.3749, - "step": 2376 - }, - { - "epoch": 0.7469619333686711, - "grad_norm": 0.6875, - "learning_rate": 1.7184385909235164e-05, - "loss": 1.3042, - "step": 2378 - }, - { - "epoch": 0.7475901603942124, - "grad_norm": 0.73828125, - "learning_rate": 1.7181847032688035e-05, - "loss": 1.2356, - "step": 2380 - }, - { - "epoch": 0.7482183874197538, - "grad_norm": 0.7734375, - "learning_rate": 1.7179308156140907e-05, - "loss": 1.2864, - "step": 2382 - }, - { - "epoch": 0.7488466144452952, - "grad_norm": 0.69921875, - "learning_rate": 1.717676927959378e-05, - "loss": 1.3995, - "step": 2384 - }, - { - "epoch": 0.7494748414708365, - "grad_norm": 0.78125, - "learning_rate": 1.7174230403046653e-05, - "loss": 1.2924, - "step": 2386 - }, - { - "epoch": 0.7501030684963779, - "grad_norm": 0.81640625, - "learning_rate": 1.7171691526499524e-05, - "loss": 1.2801, - "step": 2388 - }, - { - "epoch": 0.7507312955219192, - "grad_norm": 0.7890625, - "learning_rate": 1.7169152649952396e-05, - "loss": 1.2726, - "step": 2390 - }, - { - "epoch": 0.7513595225474606, - "grad_norm": 0.734375, - "learning_rate": 1.716661377340527e-05, - "loss": 1.35, - "step": 2392 - }, - { - "epoch": 0.751987749573002, - "grad_norm": 0.796875, - "learning_rate": 1.7164074896858142e-05, - "loss": 1.2783, - "step": 2394 - }, - { - "epoch": 0.7526159765985433, - "grad_norm": 0.78515625, - "learning_rate": 1.7161536020311013e-05, - "loss": 1.3665, - "step": 2396 - }, - { - "epoch": 0.7532442036240846, - "grad_norm": 0.98046875, - "learning_rate": 1.7158997143763888e-05, - "loss": 1.3679, - "step": 2398 - }, - { - "epoch": 0.7538724306496261, - "grad_norm": 0.78515625, - "learning_rate": 1.715645826721676e-05, - "loss": 1.3874, - "step": 2400 - }, - { - "epoch": 0.7545006576751674, - "grad_norm": 0.75390625, - "learning_rate": 1.715391939066963e-05, - "loss": 1.2631, - "step": 2402 - }, - { - "epoch": 0.7551288847007087, - "grad_norm": 0.796875, - "learning_rate": 1.7151380514122502e-05, - "loss": 1.2159, - "step": 2404 - }, - { - "epoch": 0.75575711172625, - "grad_norm": 0.74609375, - "learning_rate": 1.7148841637575374e-05, - "loss": 1.3067, - "step": 2406 - }, - { - "epoch": 0.7563853387517915, - "grad_norm": 0.7109375, - "learning_rate": 1.7146302761028245e-05, - "loss": 1.3503, - "step": 2408 - }, - { - "epoch": 0.7570135657773328, - "grad_norm": 0.7421875, - "learning_rate": 1.714376388448112e-05, - "loss": 1.369, - "step": 2410 - }, - { - "epoch": 0.7576417928028741, - "grad_norm": 0.88671875, - "learning_rate": 1.714122500793399e-05, - "loss": 1.2634, - "step": 2412 - }, - { - "epoch": 0.7582700198284155, - "grad_norm": 0.75390625, - "learning_rate": 1.7138686131386862e-05, - "loss": 1.2631, - "step": 2414 - }, - { - "epoch": 0.7588982468539569, - "grad_norm": 0.72265625, - "learning_rate": 1.7136147254839734e-05, - "loss": 1.33, - "step": 2416 - }, - { - "epoch": 0.7595264738794982, - "grad_norm": 0.72265625, - "learning_rate": 1.713360837829261e-05, - "loss": 1.3229, - "step": 2418 - }, - { - "epoch": 0.7601547009050396, - "grad_norm": 1.1640625, - "learning_rate": 1.713106950174548e-05, - "loss": 1.2857, - "step": 2420 - }, - { - "epoch": 0.7607829279305809, - "grad_norm": 0.875, - "learning_rate": 1.712853062519835e-05, - "loss": 1.3812, - "step": 2422 - }, - { - "epoch": 0.7614111549561222, - "grad_norm": 0.6953125, - "learning_rate": 1.7125991748651223e-05, - "loss": 1.3809, - "step": 2424 - }, - { - "epoch": 0.7620393819816637, - "grad_norm": 0.7890625, - "learning_rate": 1.7123452872104098e-05, - "loss": 1.3366, - "step": 2426 - }, - { - "epoch": 0.762667609007205, - "grad_norm": 0.77734375, - "learning_rate": 1.7120913995556966e-05, - "loss": 1.3628, - "step": 2428 - }, - { - "epoch": 0.7632958360327463, - "grad_norm": 0.8046875, - "learning_rate": 1.711837511900984e-05, - "loss": 1.3394, - "step": 2430 - }, - { - "epoch": 0.7639240630582876, - "grad_norm": 0.7265625, - "learning_rate": 1.7115836242462712e-05, - "loss": 1.4378, - "step": 2432 - }, - { - "epoch": 0.7645522900838291, - "grad_norm": 0.7890625, - "learning_rate": 1.7113297365915583e-05, - "loss": 1.1978, - "step": 2434 - }, - { - "epoch": 0.7651805171093704, - "grad_norm": 0.75, - "learning_rate": 1.7110758489368454e-05, - "loss": 1.2939, - "step": 2436 - }, - { - "epoch": 0.7658087441349117, - "grad_norm": 0.7109375, - "learning_rate": 1.710821961282133e-05, - "loss": 1.3248, - "step": 2438 - }, - { - "epoch": 0.7664369711604531, - "grad_norm": 0.7578125, - "learning_rate": 1.71056807362742e-05, - "loss": 1.2087, - "step": 2440 - }, - { - "epoch": 0.7670651981859945, - "grad_norm": 0.81640625, - "learning_rate": 1.7103141859727072e-05, - "loss": 1.1633, - "step": 2442 - }, - { - "epoch": 0.7676934252115358, - "grad_norm": 1.078125, - "learning_rate": 1.7100602983179943e-05, - "loss": 1.2432, - "step": 2444 - }, - { - "epoch": 0.7683216522370772, - "grad_norm": 0.75390625, - "learning_rate": 1.7098064106632818e-05, - "loss": 1.3272, - "step": 2446 - }, - { - "epoch": 0.7689498792626185, - "grad_norm": 0.78125, - "learning_rate": 1.7095525230085686e-05, - "loss": 1.2589, - "step": 2448 - }, - { - "epoch": 0.7695781062881599, - "grad_norm": 0.71484375, - "learning_rate": 1.709298635353856e-05, - "loss": 1.375, - "step": 2450 - }, - { - "epoch": 0.7702063333137013, - "grad_norm": 0.72265625, - "learning_rate": 1.7090447476991432e-05, - "loss": 1.2817, - "step": 2452 - }, - { - "epoch": 0.7708345603392426, - "grad_norm": 0.75390625, - "learning_rate": 1.7087908600444304e-05, - "loss": 1.2879, - "step": 2454 - }, - { - "epoch": 0.7714627873647839, - "grad_norm": 1.234375, - "learning_rate": 1.7085369723897175e-05, - "loss": 1.2573, - "step": 2456 - }, - { - "epoch": 0.7720910143903253, - "grad_norm": 0.7890625, - "learning_rate": 1.708283084735005e-05, - "loss": 1.343, - "step": 2458 - }, - { - "epoch": 0.7727192414158667, - "grad_norm": 0.7109375, - "learning_rate": 1.708029197080292e-05, - "loss": 1.357, - "step": 2460 - }, - { - "epoch": 0.773347468441408, - "grad_norm": 0.74609375, - "learning_rate": 1.7077753094255793e-05, - "loss": 1.3493, - "step": 2462 - }, - { - "epoch": 0.7739756954669493, - "grad_norm": 0.78515625, - "learning_rate": 1.7075214217708664e-05, - "loss": 1.2568, - "step": 2464 - }, - { - "epoch": 0.7746039224924908, - "grad_norm": 0.73828125, - "learning_rate": 1.707267534116154e-05, - "loss": 1.3476, - "step": 2466 - }, - { - "epoch": 0.7752321495180321, - "grad_norm": 0.76171875, - "learning_rate": 1.707013646461441e-05, - "loss": 1.2797, - "step": 2468 - }, - { - "epoch": 0.7758603765435734, - "grad_norm": 0.75390625, - "learning_rate": 1.706759758806728e-05, - "loss": 1.3368, - "step": 2470 - }, - { - "epoch": 0.7764886035691148, - "grad_norm": 0.671875, - "learning_rate": 1.7065058711520153e-05, - "loss": 1.2807, - "step": 2472 - }, - { - "epoch": 0.7771168305946562, - "grad_norm": 0.71484375, - "learning_rate": 1.7062519834973024e-05, - "loss": 1.5012, - "step": 2474 - }, - { - "epoch": 0.7777450576201975, - "grad_norm": 0.734375, - "learning_rate": 1.7059980958425896e-05, - "loss": 1.3204, - "step": 2476 - }, - { - "epoch": 0.7783732846457388, - "grad_norm": 0.8046875, - "learning_rate": 1.705744208187877e-05, - "loss": 1.3475, - "step": 2478 - }, - { - "epoch": 0.7790015116712802, - "grad_norm": 0.796875, - "learning_rate": 1.7054903205331642e-05, - "loss": 1.2051, - "step": 2480 - }, - { - "epoch": 0.7796297386968216, - "grad_norm": 0.68359375, - "learning_rate": 1.7052364328784513e-05, - "loss": 1.3502, - "step": 2482 - }, - { - "epoch": 0.780257965722363, - "grad_norm": 0.9140625, - "learning_rate": 1.7049825452237388e-05, - "loss": 1.2337, - "step": 2484 - }, - { - "epoch": 0.7808861927479043, - "grad_norm": 0.77734375, - "learning_rate": 1.704728657569026e-05, - "loss": 1.3524, - "step": 2486 - }, - { - "epoch": 0.7815144197734456, - "grad_norm": 0.82421875, - "learning_rate": 1.704474769914313e-05, - "loss": 1.3843, - "step": 2488 - }, - { - "epoch": 0.7821426467989869, - "grad_norm": 0.6953125, - "learning_rate": 1.7042208822596002e-05, - "loss": 1.3905, - "step": 2490 - }, - { - "epoch": 0.7827708738245284, - "grad_norm": 0.69921875, - "learning_rate": 1.7039669946048877e-05, - "loss": 1.3168, - "step": 2492 - }, - { - "epoch": 0.7833991008500697, - "grad_norm": 0.79296875, - "learning_rate": 1.7037131069501748e-05, - "loss": 1.233, - "step": 2494 - }, - { - "epoch": 0.784027327875611, - "grad_norm": 0.77734375, - "learning_rate": 1.703459219295462e-05, - "loss": 1.3278, - "step": 2496 - }, - { - "epoch": 0.7846555549011524, - "grad_norm": 0.6953125, - "learning_rate": 1.703205331640749e-05, - "loss": 1.2751, - "step": 2498 - }, - { - "epoch": 0.7852837819266938, - "grad_norm": 0.796875, - "learning_rate": 1.7029514439860362e-05, - "loss": 1.3463, - "step": 2500 - }, - { - "epoch": 0.7859120089522351, - "grad_norm": 0.80859375, - "learning_rate": 1.7026975563313234e-05, - "loss": 1.2921, - "step": 2502 - }, - { - "epoch": 0.7865402359777764, - "grad_norm": 0.71484375, - "learning_rate": 1.702443668676611e-05, - "loss": 1.1402, - "step": 2504 - }, - { - "epoch": 0.7871684630033178, - "grad_norm": 1.125, - "learning_rate": 1.702189781021898e-05, - "loss": 1.2382, - "step": 2506 - }, - { - "epoch": 0.7877966900288592, - "grad_norm": 0.63671875, - "learning_rate": 1.701935893367185e-05, - "loss": 1.3848, - "step": 2508 - }, - { - "epoch": 0.7884249170544005, - "grad_norm": 0.7578125, - "learning_rate": 1.7016820057124723e-05, - "loss": 1.2577, - "step": 2510 - }, - { - "epoch": 0.7890531440799419, - "grad_norm": 0.74609375, - "learning_rate": 1.7014281180577597e-05, - "loss": 1.4976, - "step": 2512 - }, - { - "epoch": 0.7896813711054832, - "grad_norm": 0.65234375, - "learning_rate": 1.701174230403047e-05, - "loss": 1.3051, - "step": 2514 - }, - { - "epoch": 0.7903095981310246, - "grad_norm": 0.75, - "learning_rate": 1.700920342748334e-05, - "loss": 1.3637, - "step": 2516 - }, - { - "epoch": 0.790937825156566, - "grad_norm": 0.828125, - "learning_rate": 1.700666455093621e-05, - "loss": 1.2335, - "step": 2518 - }, - { - "epoch": 0.7915660521821073, - "grad_norm": 0.73828125, - "learning_rate": 1.7004125674389086e-05, - "loss": 1.2534, - "step": 2520 - }, - { - "epoch": 0.7921942792076486, - "grad_norm": 0.78515625, - "learning_rate": 1.7001586797841954e-05, - "loss": 1.4272, - "step": 2522 - }, - { - "epoch": 0.7928225062331901, - "grad_norm": 0.66796875, - "learning_rate": 1.699904792129483e-05, - "loss": 1.2296, - "step": 2524 - }, - { - "epoch": 0.7934507332587314, - "grad_norm": 0.765625, - "learning_rate": 1.69965090447477e-05, - "loss": 1.3799, - "step": 2526 - }, - { - "epoch": 0.7940789602842727, - "grad_norm": 0.625, - "learning_rate": 1.6993970168200572e-05, - "loss": 1.4241, - "step": 2528 - }, - { - "epoch": 0.794707187309814, - "grad_norm": 0.8125, - "learning_rate": 1.6991431291653443e-05, - "loss": 1.2411, - "step": 2530 - }, - { - "epoch": 0.7953354143353555, - "grad_norm": 1.078125, - "learning_rate": 1.6988892415106318e-05, - "loss": 1.3962, - "step": 2532 - }, - { - "epoch": 0.7959636413608968, - "grad_norm": 0.8828125, - "learning_rate": 1.698635353855919e-05, - "loss": 1.3154, - "step": 2534 - }, - { - "epoch": 0.7965918683864381, - "grad_norm": 0.62890625, - "learning_rate": 1.698381466201206e-05, - "loss": 1.3236, - "step": 2536 - }, - { - "epoch": 0.7972200954119795, - "grad_norm": 0.80859375, - "learning_rate": 1.6981275785464932e-05, - "loss": 1.2605, - "step": 2538 - }, - { - "epoch": 0.7978483224375209, - "grad_norm": 0.7578125, - "learning_rate": 1.6978736908917807e-05, - "loss": 1.2216, - "step": 2540 - }, - { - "epoch": 0.7984765494630622, - "grad_norm": 0.6875, - "learning_rate": 1.6976198032370675e-05, - "loss": 1.3394, - "step": 2542 - }, - { - "epoch": 0.7991047764886036, - "grad_norm": 0.73828125, - "learning_rate": 1.697365915582355e-05, - "loss": 1.331, - "step": 2544 - }, - { - "epoch": 0.7997330035141449, - "grad_norm": 0.72265625, - "learning_rate": 1.697112027927642e-05, - "loss": 1.3703, - "step": 2546 - }, - { - "epoch": 0.8003612305396863, - "grad_norm": 0.828125, - "learning_rate": 1.6968581402729293e-05, - "loss": 1.3128, - "step": 2548 - }, - { - "epoch": 0.8009894575652277, - "grad_norm": 0.8125, - "learning_rate": 1.6966042526182164e-05, - "loss": 1.278, - "step": 2550 - }, - { - "epoch": 0.801617684590769, - "grad_norm": 0.65625, - "learning_rate": 1.696350364963504e-05, - "loss": 1.3876, - "step": 2552 - }, - { - "epoch": 0.8022459116163103, - "grad_norm": 0.71484375, - "learning_rate": 1.696096477308791e-05, - "loss": 1.2858, - "step": 2554 - }, - { - "epoch": 0.8028741386418518, - "grad_norm": 0.6953125, - "learning_rate": 1.695842589654078e-05, - "loss": 1.412, - "step": 2556 - }, - { - "epoch": 0.8035023656673931, - "grad_norm": 0.7109375, - "learning_rate": 1.6955887019993653e-05, - "loss": 1.4499, - "step": 2558 - }, - { - "epoch": 0.8041305926929344, - "grad_norm": 0.9140625, - "learning_rate": 1.6953348143446528e-05, - "loss": 1.291, - "step": 2560 - }, - { - "epoch": 0.8047588197184757, - "grad_norm": 0.90625, - "learning_rate": 1.69508092668994e-05, - "loss": 1.4154, - "step": 2562 - }, - { - "epoch": 0.8053870467440171, - "grad_norm": 0.82421875, - "learning_rate": 1.694827039035227e-05, - "loss": 1.4474, - "step": 2564 - }, - { - "epoch": 0.8060152737695585, - "grad_norm": 0.79296875, - "learning_rate": 1.6945731513805145e-05, - "loss": 1.3263, - "step": 2566 - }, - { - "epoch": 0.8066435007950998, - "grad_norm": 0.84375, - "learning_rate": 1.6943192637258013e-05, - "loss": 1.3238, - "step": 2568 - }, - { - "epoch": 0.8072717278206412, - "grad_norm": 0.83984375, - "learning_rate": 1.6940653760710888e-05, - "loss": 1.4225, - "step": 2570 - }, - { - "epoch": 0.8078999548461825, - "grad_norm": 0.70703125, - "learning_rate": 1.693811488416376e-05, - "loss": 1.2038, - "step": 2572 - }, - { - "epoch": 0.8085281818717239, - "grad_norm": 0.8359375, - "learning_rate": 1.693557600761663e-05, - "loss": 1.1913, - "step": 2574 - }, - { - "epoch": 0.8091564088972653, - "grad_norm": 0.76953125, - "learning_rate": 1.6933037131069502e-05, - "loss": 1.3431, - "step": 2576 - }, - { - "epoch": 0.8097846359228066, - "grad_norm": 0.87890625, - "learning_rate": 1.6930498254522377e-05, - "loss": 1.3336, - "step": 2578 - }, - { - "epoch": 0.8104128629483479, - "grad_norm": 0.87890625, - "learning_rate": 1.6927959377975248e-05, - "loss": 1.2205, - "step": 2580 - }, - { - "epoch": 0.8110410899738894, - "grad_norm": 0.69921875, - "learning_rate": 1.692542050142812e-05, - "loss": 1.3004, - "step": 2582 - }, - { - "epoch": 0.8116693169994307, - "grad_norm": 0.75390625, - "learning_rate": 1.692288162488099e-05, - "loss": 1.3125, - "step": 2584 - }, - { - "epoch": 0.812297544024972, - "grad_norm": 0.6953125, - "learning_rate": 1.6920342748333866e-05, - "loss": 1.4572, - "step": 2586 - }, - { - "epoch": 0.8129257710505133, - "grad_norm": 0.74609375, - "learning_rate": 1.6917803871786737e-05, - "loss": 1.2809, - "step": 2588 - }, - { - "epoch": 0.8135539980760548, - "grad_norm": 0.66796875, - "learning_rate": 1.691526499523961e-05, - "loss": 1.2979, - "step": 2590 - }, - { - "epoch": 0.8141822251015961, - "grad_norm": 0.890625, - "learning_rate": 1.691272611869248e-05, - "loss": 1.3751, - "step": 2592 - }, - { - "epoch": 0.8148104521271374, - "grad_norm": 0.8125, - "learning_rate": 1.691018724214535e-05, - "loss": 1.3556, - "step": 2594 - }, - { - "epoch": 0.8154386791526788, - "grad_norm": 0.734375, - "learning_rate": 1.6907648365598223e-05, - "loss": 1.2648, - "step": 2596 - }, - { - "epoch": 0.8160669061782202, - "grad_norm": 0.77734375, - "learning_rate": 1.6905109489051097e-05, - "loss": 1.3499, - "step": 2598 - }, - { - "epoch": 0.8166951332037615, - "grad_norm": 0.8359375, - "learning_rate": 1.690257061250397e-05, - "loss": 1.3424, - "step": 2600 - }, - { - "epoch": 0.8173233602293029, - "grad_norm": 0.72265625, - "learning_rate": 1.690003173595684e-05, - "loss": 1.3746, - "step": 2602 - }, - { - "epoch": 0.8179515872548442, - "grad_norm": 0.78515625, - "learning_rate": 1.689749285940971e-05, - "loss": 1.3152, - "step": 2604 - }, - { - "epoch": 0.8185798142803856, - "grad_norm": 0.7109375, - "learning_rate": 1.6894953982862586e-05, - "loss": 1.3755, - "step": 2606 - }, - { - "epoch": 0.819208041305927, - "grad_norm": 0.84765625, - "learning_rate": 1.6892415106315458e-05, - "loss": 1.2247, - "step": 2608 - }, - { - "epoch": 0.8198362683314683, - "grad_norm": 0.69921875, - "learning_rate": 1.688987622976833e-05, - "loss": 1.4328, - "step": 2610 - }, - { - "epoch": 0.8204644953570096, - "grad_norm": 0.6796875, - "learning_rate": 1.68873373532212e-05, - "loss": 1.2965, - "step": 2612 - }, - { - "epoch": 0.821092722382551, - "grad_norm": 0.91015625, - "learning_rate": 1.6884798476674075e-05, - "loss": 1.2175, - "step": 2614 - }, - { - "epoch": 0.8217209494080924, - "grad_norm": 0.8828125, - "learning_rate": 1.6882259600126943e-05, - "loss": 1.1868, - "step": 2616 - }, - { - "epoch": 0.8223491764336337, - "grad_norm": 0.9296875, - "learning_rate": 1.6879720723579818e-05, - "loss": 1.331, - "step": 2618 - }, - { - "epoch": 0.822977403459175, - "grad_norm": 0.69140625, - "learning_rate": 1.687718184703269e-05, - "loss": 1.3342, - "step": 2620 - }, - { - "epoch": 0.8236056304847165, - "grad_norm": 0.68359375, - "learning_rate": 1.687464297048556e-05, - "loss": 1.3036, - "step": 2622 - }, - { - "epoch": 0.8242338575102578, - "grad_norm": 0.75390625, - "learning_rate": 1.6872104093938432e-05, - "loss": 1.2481, - "step": 2624 - }, - { - "epoch": 0.8248620845357991, - "grad_norm": 0.703125, - "learning_rate": 1.6869565217391307e-05, - "loss": 1.3175, - "step": 2626 - }, - { - "epoch": 0.8254903115613405, - "grad_norm": 0.97265625, - "learning_rate": 1.686702634084418e-05, - "loss": 1.3181, - "step": 2628 - }, - { - "epoch": 0.8261185385868819, - "grad_norm": 0.7421875, - "learning_rate": 1.686448746429705e-05, - "loss": 1.3106, - "step": 2630 - }, - { - "epoch": 0.8267467656124232, - "grad_norm": 0.82421875, - "learning_rate": 1.686194858774992e-05, - "loss": 1.3216, - "step": 2632 - }, - { - "epoch": 0.8273749926379645, - "grad_norm": 0.85546875, - "learning_rate": 1.6859409711202796e-05, - "loss": 1.3221, - "step": 2634 - }, - { - "epoch": 0.8280032196635059, - "grad_norm": 0.7734375, - "learning_rate": 1.6856870834655664e-05, - "loss": 1.3614, - "step": 2636 - }, - { - "epoch": 0.8286314466890472, - "grad_norm": 0.7890625, - "learning_rate": 1.685433195810854e-05, - "loss": 1.3956, - "step": 2638 - }, - { - "epoch": 0.8292596737145886, - "grad_norm": 0.6875, - "learning_rate": 1.685179308156141e-05, - "loss": 1.1662, - "step": 2640 - }, - { - "epoch": 0.82988790074013, - "grad_norm": 0.76953125, - "learning_rate": 1.684925420501428e-05, - "loss": 1.2505, - "step": 2642 - }, - { - "epoch": 0.8305161277656713, - "grad_norm": 0.86328125, - "learning_rate": 1.6846715328467153e-05, - "loss": 1.251, - "step": 2644 - }, - { - "epoch": 0.8311443547912126, - "grad_norm": 0.78515625, - "learning_rate": 1.6844176451920028e-05, - "loss": 1.398, - "step": 2646 - }, - { - "epoch": 0.8317725818167541, - "grad_norm": 0.79296875, - "learning_rate": 1.68416375753729e-05, - "loss": 1.2618, - "step": 2648 - }, - { - "epoch": 0.8324008088422954, - "grad_norm": 0.66796875, - "learning_rate": 1.683909869882577e-05, - "loss": 1.3516, - "step": 2650 - }, - { - "epoch": 0.8330290358678367, - "grad_norm": 0.74609375, - "learning_rate": 1.6836559822278645e-05, - "loss": 1.4359, - "step": 2652 - }, - { - "epoch": 0.833657262893378, - "grad_norm": 0.703125, - "learning_rate": 1.6834020945731516e-05, - "loss": 1.3158, - "step": 2654 - }, - { - "epoch": 0.8342854899189195, - "grad_norm": 0.81640625, - "learning_rate": 1.6831482069184388e-05, - "loss": 1.2849, - "step": 2656 - }, - { - "epoch": 0.8349137169444608, - "grad_norm": 0.734375, - "learning_rate": 1.682894319263726e-05, - "loss": 1.4921, - "step": 2658 - }, - { - "epoch": 0.8355419439700021, - "grad_norm": 0.94921875, - "learning_rate": 1.6826404316090134e-05, - "loss": 1.2774, - "step": 2660 - }, - { - "epoch": 0.8361701709955435, - "grad_norm": 0.78125, - "learning_rate": 1.6823865439543002e-05, - "loss": 1.3282, - "step": 2662 - }, - { - "epoch": 0.8367983980210849, - "grad_norm": 0.75, - "learning_rate": 1.6821326562995877e-05, - "loss": 1.2604, - "step": 2664 - }, - { - "epoch": 0.8374266250466262, - "grad_norm": 0.75390625, - "learning_rate": 1.6818787686448748e-05, - "loss": 1.2322, - "step": 2666 - }, - { - "epoch": 0.8380548520721676, - "grad_norm": 0.75, - "learning_rate": 1.681624880990162e-05, - "loss": 1.3847, - "step": 2668 - }, - { - "epoch": 0.8386830790977089, - "grad_norm": 1.0, - "learning_rate": 1.681370993335449e-05, - "loss": 1.2521, - "step": 2670 - }, - { - "epoch": 0.8393113061232503, - "grad_norm": 0.73046875, - "learning_rate": 1.6811171056807366e-05, - "loss": 1.4187, - "step": 2672 - }, - { - "epoch": 0.8399395331487917, - "grad_norm": 0.7109375, - "learning_rate": 1.6808632180260237e-05, - "loss": 1.195, - "step": 2674 - }, - { - "epoch": 0.840567760174333, - "grad_norm": 1.015625, - "learning_rate": 1.680609330371311e-05, - "loss": 1.3454, - "step": 2676 - }, - { - "epoch": 0.8411959871998743, - "grad_norm": 0.78515625, - "learning_rate": 1.680355442716598e-05, - "loss": 1.453, - "step": 2678 - }, - { - "epoch": 0.8418242142254158, - "grad_norm": 0.68359375, - "learning_rate": 1.6801015550618855e-05, - "loss": 1.4218, - "step": 2680 - }, - { - "epoch": 0.8424524412509571, - "grad_norm": 0.85546875, - "learning_rate": 1.6798476674071723e-05, - "loss": 1.4194, - "step": 2682 - }, - { - "epoch": 0.8430806682764984, - "grad_norm": 0.80859375, - "learning_rate": 1.6795937797524597e-05, - "loss": 1.3225, - "step": 2684 - }, - { - "epoch": 0.8437088953020397, - "grad_norm": 0.7578125, - "learning_rate": 1.679339892097747e-05, - "loss": 1.205, - "step": 2686 - }, - { - "epoch": 0.8443371223275812, - "grad_norm": 1.046875, - "learning_rate": 1.679086004443034e-05, - "loss": 1.425, - "step": 2688 - }, - { - "epoch": 0.8449653493531225, - "grad_norm": 0.6875, - "learning_rate": 1.678832116788321e-05, - "loss": 1.3743, - "step": 2690 - }, - { - "epoch": 0.8455935763786638, - "grad_norm": 0.83203125, - "learning_rate": 1.6785782291336086e-05, - "loss": 1.2462, - "step": 2692 - }, - { - "epoch": 0.8462218034042052, - "grad_norm": 0.671875, - "learning_rate": 1.6783243414788958e-05, - "loss": 1.3989, - "step": 2694 - }, - { - "epoch": 0.8468500304297466, - "grad_norm": 0.76953125, - "learning_rate": 1.678070453824183e-05, - "loss": 1.4101, - "step": 2696 - }, - { - "epoch": 0.8474782574552879, - "grad_norm": 0.71484375, - "learning_rate": 1.67781656616947e-05, - "loss": 1.2639, - "step": 2698 - }, - { - "epoch": 0.8481064844808293, - "grad_norm": 0.79296875, - "learning_rate": 1.6775626785147575e-05, - "loss": 1.3388, - "step": 2700 - }, - { - "epoch": 0.8487347115063706, - "grad_norm": 0.78515625, - "learning_rate": 1.6773087908600447e-05, - "loss": 1.363, - "step": 2702 - }, - { - "epoch": 0.8493629385319119, - "grad_norm": 0.828125, - "learning_rate": 1.6770549032053318e-05, - "loss": 1.2831, - "step": 2704 - }, - { - "epoch": 0.8499911655574534, - "grad_norm": 0.7109375, - "learning_rate": 1.676801015550619e-05, - "loss": 1.2638, - "step": 2706 - }, - { - "epoch": 0.8506193925829947, - "grad_norm": 0.6875, - "learning_rate": 1.676547127895906e-05, - "loss": 1.3733, - "step": 2708 - }, - { - "epoch": 0.851247619608536, - "grad_norm": 0.6796875, - "learning_rate": 1.6762932402411932e-05, - "loss": 1.3726, - "step": 2710 - }, - { - "epoch": 0.8518758466340773, - "grad_norm": 0.73828125, - "learning_rate": 1.6760393525864807e-05, - "loss": 1.3406, - "step": 2712 - }, - { - "epoch": 0.8525040736596188, - "grad_norm": 0.69921875, - "learning_rate": 1.6757854649317678e-05, - "loss": 1.4331, - "step": 2714 - }, - { - "epoch": 0.8531323006851601, - "grad_norm": 0.7890625, - "learning_rate": 1.675531577277055e-05, - "loss": 1.302, - "step": 2716 - }, - { - "epoch": 0.8537605277107014, - "grad_norm": 0.79296875, - "learning_rate": 1.675277689622342e-05, - "loss": 1.3428, - "step": 2718 - }, - { - "epoch": 0.8543887547362428, - "grad_norm": 0.765625, - "learning_rate": 1.6750238019676296e-05, - "loss": 1.2827, - "step": 2720 - }, - { - "epoch": 0.8550169817617842, - "grad_norm": 0.67578125, - "learning_rate": 1.6747699143129167e-05, - "loss": 1.2744, - "step": 2722 - }, - { - "epoch": 0.8556452087873255, - "grad_norm": 0.75, - "learning_rate": 1.674516026658204e-05, - "loss": 1.2999, - "step": 2724 - }, - { - "epoch": 0.8562734358128669, - "grad_norm": 0.9765625, - "learning_rate": 1.674262139003491e-05, - "loss": 1.2288, - "step": 2726 - }, - { - "epoch": 0.8569016628384082, - "grad_norm": 0.7109375, - "learning_rate": 1.6740082513487785e-05, - "loss": 1.3101, - "step": 2728 - }, - { - "epoch": 0.8575298898639496, - "grad_norm": 0.71484375, - "learning_rate": 1.6737543636940653e-05, - "loss": 1.309, - "step": 2730 - }, - { - "epoch": 0.858158116889491, - "grad_norm": 0.69921875, - "learning_rate": 1.6735004760393527e-05, - "loss": 1.3683, - "step": 2732 - }, - { - "epoch": 0.8587863439150323, - "grad_norm": 1.015625, - "learning_rate": 1.67324658838464e-05, - "loss": 1.2708, - "step": 2734 - }, - { - "epoch": 0.8594145709405736, - "grad_norm": 0.7578125, - "learning_rate": 1.672992700729927e-05, - "loss": 1.5443, - "step": 2736 - }, - { - "epoch": 0.860042797966115, - "grad_norm": 0.73046875, - "learning_rate": 1.6727388130752145e-05, - "loss": 1.3305, - "step": 2738 - }, - { - "epoch": 0.8606710249916564, - "grad_norm": 0.86328125, - "learning_rate": 1.6724849254205016e-05, - "loss": 1.3512, - "step": 2740 - }, - { - "epoch": 0.8612992520171977, - "grad_norm": 0.73828125, - "learning_rate": 1.6722310377657888e-05, - "loss": 1.3854, - "step": 2742 - }, - { - "epoch": 0.861927479042739, - "grad_norm": 0.75390625, - "learning_rate": 1.671977150111076e-05, - "loss": 1.2901, - "step": 2744 - }, - { - "epoch": 0.8625557060682805, - "grad_norm": 0.68359375, - "learning_rate": 1.6717232624563634e-05, - "loss": 1.3502, - "step": 2746 - }, - { - "epoch": 0.8631839330938218, - "grad_norm": 0.7578125, - "learning_rate": 1.6714693748016505e-05, - "loss": 1.1293, - "step": 2748 - }, - { - "epoch": 0.8638121601193631, - "grad_norm": 0.74609375, - "learning_rate": 1.6712154871469377e-05, - "loss": 1.3325, - "step": 2750 - }, - { - "epoch": 0.8644403871449045, - "grad_norm": 0.7890625, - "learning_rate": 1.6709615994922248e-05, - "loss": 1.4138, - "step": 2752 - }, - { - "epoch": 0.8650686141704459, - "grad_norm": 0.69140625, - "learning_rate": 1.6707077118375123e-05, - "loss": 1.2818, - "step": 2754 - }, - { - "epoch": 0.8656968411959872, - "grad_norm": 0.73046875, - "learning_rate": 1.670453824182799e-05, - "loss": 1.2926, - "step": 2756 - }, - { - "epoch": 0.8663250682215285, - "grad_norm": 0.6953125, - "learning_rate": 1.6701999365280866e-05, - "loss": 1.3686, - "step": 2758 - }, - { - "epoch": 0.8669532952470699, - "grad_norm": 0.8359375, - "learning_rate": 1.6699460488733737e-05, - "loss": 1.2924, - "step": 2760 - }, - { - "epoch": 0.8675815222726113, - "grad_norm": 0.78515625, - "learning_rate": 1.669692161218661e-05, - "loss": 1.4022, - "step": 2762 - }, - { - "epoch": 0.8682097492981526, - "grad_norm": 0.8359375, - "learning_rate": 1.669438273563948e-05, - "loss": 1.429, - "step": 2764 - }, - { - "epoch": 0.868837976323694, - "grad_norm": 0.7890625, - "learning_rate": 1.6691843859092355e-05, - "loss": 1.2911, - "step": 2766 - }, - { - "epoch": 0.8694662033492353, - "grad_norm": 0.73046875, - "learning_rate": 1.6689304982545226e-05, - "loss": 1.4, - "step": 2768 - }, - { - "epoch": 0.8700944303747767, - "grad_norm": 0.88671875, - "learning_rate": 1.6686766105998097e-05, - "loss": 1.3409, - "step": 2770 - }, - { - "epoch": 0.8707226574003181, - "grad_norm": 1.0390625, - "learning_rate": 1.668422722945097e-05, - "loss": 1.2781, - "step": 2772 - }, - { - "epoch": 0.8713508844258594, - "grad_norm": 0.8359375, - "learning_rate": 1.6681688352903843e-05, - "loss": 1.3083, - "step": 2774 - }, - { - "epoch": 0.8719791114514007, - "grad_norm": 0.73046875, - "learning_rate": 1.667914947635671e-05, - "loss": 1.2491, - "step": 2776 - }, - { - "epoch": 0.872607338476942, - "grad_norm": 0.67578125, - "learning_rate": 1.6676610599809586e-05, - "loss": 1.3156, - "step": 2778 - }, - { - "epoch": 0.8732355655024835, - "grad_norm": 0.8515625, - "learning_rate": 1.6674071723262458e-05, - "loss": 1.2403, - "step": 2780 - }, - { - "epoch": 0.8738637925280248, - "grad_norm": 0.74609375, - "learning_rate": 1.667153284671533e-05, - "loss": 1.4226, - "step": 2782 - }, - { - "epoch": 0.8744920195535661, - "grad_norm": 0.84765625, - "learning_rate": 1.66689939701682e-05, - "loss": 1.2981, - "step": 2784 - }, - { - "epoch": 0.8751202465791075, - "grad_norm": 0.6953125, - "learning_rate": 1.6666455093621075e-05, - "loss": 1.256, - "step": 2786 - }, - { - "epoch": 0.8757484736046489, - "grad_norm": 0.734375, - "learning_rate": 1.6663916217073946e-05, - "loss": 1.255, - "step": 2788 - }, - { - "epoch": 0.8763767006301902, - "grad_norm": 0.7265625, - "learning_rate": 1.6661377340526818e-05, - "loss": 1.2185, - "step": 2790 - }, - { - "epoch": 0.8770049276557316, - "grad_norm": 0.6640625, - "learning_rate": 1.665883846397969e-05, - "loss": 1.4315, - "step": 2792 - }, - { - "epoch": 0.8776331546812729, - "grad_norm": 0.703125, - "learning_rate": 1.6656299587432564e-05, - "loss": 1.4531, - "step": 2794 - }, - { - "epoch": 0.8782613817068143, - "grad_norm": 0.8828125, - "learning_rate": 1.6653760710885435e-05, - "loss": 1.2937, - "step": 2796 - }, - { - "epoch": 0.8788896087323557, - "grad_norm": 0.9375, - "learning_rate": 1.6651221834338307e-05, - "loss": 1.2382, - "step": 2798 - }, - { - "epoch": 0.879517835757897, - "grad_norm": 0.8515625, - "learning_rate": 1.6648682957791178e-05, - "loss": 1.2398, - "step": 2800 - }, - { - "epoch": 0.8801460627834383, - "grad_norm": 0.7890625, - "learning_rate": 1.664614408124405e-05, - "loss": 1.3117, - "step": 2802 - }, - { - "epoch": 0.8807742898089798, - "grad_norm": 0.88671875, - "learning_rate": 1.664360520469692e-05, - "loss": 1.35, - "step": 2804 - }, - { - "epoch": 0.8814025168345211, - "grad_norm": 0.8125, - "learning_rate": 1.6641066328149796e-05, - "loss": 1.4186, - "step": 2806 - }, - { - "epoch": 0.8820307438600624, - "grad_norm": 0.67578125, - "learning_rate": 1.6638527451602667e-05, - "loss": 1.2733, - "step": 2808 - }, - { - "epoch": 0.8826589708856037, - "grad_norm": 0.734375, - "learning_rate": 1.663598857505554e-05, - "loss": 1.312, - "step": 2810 - }, - { - "epoch": 0.8832871979111452, - "grad_norm": 0.76171875, - "learning_rate": 1.663344969850841e-05, - "loss": 1.2711, - "step": 2812 - }, - { - "epoch": 0.8839154249366865, - "grad_norm": 0.7265625, - "learning_rate": 1.6630910821961285e-05, - "loss": 1.3649, - "step": 2814 - }, - { - "epoch": 0.8845436519622278, - "grad_norm": 0.734375, - "learning_rate": 1.6628371945414156e-05, - "loss": 1.5247, - "step": 2816 - }, - { - "epoch": 0.8851718789877692, - "grad_norm": 0.76171875, - "learning_rate": 1.6625833068867027e-05, - "loss": 1.2794, - "step": 2818 - }, - { - "epoch": 0.8858001060133106, - "grad_norm": 0.68359375, - "learning_rate": 1.66232941923199e-05, - "loss": 1.2475, - "step": 2820 - }, - { - "epoch": 0.8864283330388519, - "grad_norm": 0.8359375, - "learning_rate": 1.6620755315772774e-05, - "loss": 1.1927, - "step": 2822 - }, - { - "epoch": 0.8870565600643933, - "grad_norm": 0.8828125, - "learning_rate": 1.6618216439225645e-05, - "loss": 1.2817, - "step": 2824 - }, - { - "epoch": 0.8876847870899346, - "grad_norm": 0.72265625, - "learning_rate": 1.6615677562678516e-05, - "loss": 1.3958, - "step": 2826 - }, - { - "epoch": 0.888313014115476, - "grad_norm": 0.69140625, - "learning_rate": 1.6613138686131388e-05, - "loss": 1.4729, - "step": 2828 - }, - { - "epoch": 0.8889412411410174, - "grad_norm": 0.87109375, - "learning_rate": 1.661059980958426e-05, - "loss": 1.2705, - "step": 2830 - }, - { - "epoch": 0.8895694681665587, - "grad_norm": 0.73828125, - "learning_rate": 1.6608060933037134e-05, - "loss": 1.4104, - "step": 2832 - }, - { - "epoch": 0.8901976951921, - "grad_norm": 0.78125, - "learning_rate": 1.6605522056490005e-05, - "loss": 1.3625, - "step": 2834 - }, - { - "epoch": 0.8908259222176415, - "grad_norm": 0.8046875, - "learning_rate": 1.6602983179942877e-05, - "loss": 1.3754, - "step": 2836 - }, - { - "epoch": 0.8914541492431828, - "grad_norm": 0.7578125, - "learning_rate": 1.6600444303395748e-05, - "loss": 1.2763, - "step": 2838 - }, - { - "epoch": 0.8920823762687241, - "grad_norm": 0.6953125, - "learning_rate": 1.6597905426848623e-05, - "loss": 1.1745, - "step": 2840 - }, - { - "epoch": 0.8927106032942654, - "grad_norm": 0.75390625, - "learning_rate": 1.6595366550301494e-05, - "loss": 1.2782, - "step": 2842 - }, - { - "epoch": 0.8933388303198068, - "grad_norm": 0.796875, - "learning_rate": 1.6592827673754366e-05, - "loss": 1.3032, - "step": 2844 - }, - { - "epoch": 0.8939670573453482, - "grad_norm": 0.7265625, - "learning_rate": 1.6590288797207237e-05, - "loss": 1.4176, - "step": 2846 - }, - { - "epoch": 0.8945952843708895, - "grad_norm": 0.71875, - "learning_rate": 1.658774992066011e-05, - "loss": 1.3804, - "step": 2848 - }, - { - "epoch": 0.8952235113964309, - "grad_norm": 0.6796875, - "learning_rate": 1.658521104411298e-05, - "loss": 1.3173, - "step": 2850 - }, - { - "epoch": 0.8958517384219722, - "grad_norm": 0.86328125, - "learning_rate": 1.6582672167565854e-05, - "loss": 1.1673, - "step": 2852 - }, - { - "epoch": 0.8964799654475136, - "grad_norm": 0.69921875, - "learning_rate": 1.6580133291018726e-05, - "loss": 1.3201, - "step": 2854 - }, - { - "epoch": 0.897108192473055, - "grad_norm": 0.6953125, - "learning_rate": 1.6577594414471597e-05, - "loss": 1.3234, - "step": 2856 - }, - { - "epoch": 0.8977364194985963, - "grad_norm": 2.359375, - "learning_rate": 1.657505553792447e-05, - "loss": 1.4672, - "step": 2858 - }, - { - "epoch": 0.8983646465241376, - "grad_norm": 0.78515625, - "learning_rate": 1.6572516661377343e-05, - "loss": 1.3377, - "step": 2860 - }, - { - "epoch": 0.898992873549679, - "grad_norm": 0.71484375, - "learning_rate": 1.6569977784830215e-05, - "loss": 1.2545, - "step": 2862 - }, - { - "epoch": 0.8996211005752204, - "grad_norm": 0.8984375, - "learning_rate": 1.6567438908283086e-05, - "loss": 1.2684, - "step": 2864 - }, - { - "epoch": 0.9002493276007617, - "grad_norm": 0.7421875, - "learning_rate": 1.6564900031735957e-05, - "loss": 1.2329, - "step": 2866 - }, - { - "epoch": 0.900877554626303, - "grad_norm": 0.6796875, - "learning_rate": 1.6562361155188832e-05, - "loss": 1.4101, - "step": 2868 - }, - { - "epoch": 0.9015057816518445, - "grad_norm": 0.7578125, - "learning_rate": 1.65598222786417e-05, - "loss": 1.2965, - "step": 2870 - }, - { - "epoch": 0.9021340086773858, - "grad_norm": 0.90234375, - "learning_rate": 1.6557283402094575e-05, - "loss": 1.331, - "step": 2872 - }, - { - "epoch": 0.9027622357029271, - "grad_norm": 0.765625, - "learning_rate": 1.6554744525547446e-05, - "loss": 1.4061, - "step": 2874 - }, - { - "epoch": 0.9033904627284685, - "grad_norm": 0.76953125, - "learning_rate": 1.6552205649000318e-05, - "loss": 1.3482, - "step": 2876 - }, - { - "epoch": 0.9040186897540099, - "grad_norm": 0.68359375, - "learning_rate": 1.654966677245319e-05, - "loss": 1.3822, - "step": 2878 - }, - { - "epoch": 0.9046469167795512, - "grad_norm": 0.79296875, - "learning_rate": 1.6547127895906064e-05, - "loss": 1.2013, - "step": 2880 - }, - { - "epoch": 0.9052751438050926, - "grad_norm": 0.75390625, - "learning_rate": 1.6544589019358935e-05, - "loss": 1.2415, - "step": 2882 - }, - { - "epoch": 0.9059033708306339, - "grad_norm": 0.8984375, - "learning_rate": 1.6542050142811807e-05, - "loss": 1.3142, - "step": 2884 - }, - { - "epoch": 0.9065315978561753, - "grad_norm": 0.7734375, - "learning_rate": 1.6539511266264678e-05, - "loss": 1.3292, - "step": 2886 - }, - { - "epoch": 0.9071598248817166, - "grad_norm": 0.7421875, - "learning_rate": 1.6536972389717553e-05, - "loss": 1.3243, - "step": 2888 - }, - { - "epoch": 0.907788051907258, - "grad_norm": 0.75, - "learning_rate": 1.6534433513170424e-05, - "loss": 1.2548, - "step": 2890 - }, - { - "epoch": 0.9084162789327993, - "grad_norm": 0.78515625, - "learning_rate": 1.6531894636623296e-05, - "loss": 1.3526, - "step": 2892 - }, - { - "epoch": 0.9090445059583407, - "grad_norm": 0.7890625, - "learning_rate": 1.6529355760076167e-05, - "loss": 1.3198, - "step": 2894 - }, - { - "epoch": 0.9096727329838821, - "grad_norm": 0.6875, - "learning_rate": 1.652681688352904e-05, - "loss": 1.0987, - "step": 2896 - }, - { - "epoch": 0.9103009600094234, - "grad_norm": 0.7890625, - "learning_rate": 1.652427800698191e-05, - "loss": 1.2387, - "step": 2898 - }, - { - "epoch": 0.9109291870349647, - "grad_norm": 0.71484375, - "learning_rate": 1.6521739130434785e-05, - "loss": 1.1774, - "step": 2900 - }, - { - "epoch": 0.9115574140605062, - "grad_norm": 0.78515625, - "learning_rate": 1.6519200253887656e-05, - "loss": 1.2341, - "step": 2902 - }, - { - "epoch": 0.9121856410860475, - "grad_norm": 0.796875, - "learning_rate": 1.6516661377340527e-05, - "loss": 1.2046, - "step": 2904 - }, - { - "epoch": 0.9128138681115888, - "grad_norm": 0.7890625, - "learning_rate": 1.65141225007934e-05, - "loss": 1.477, - "step": 2906 - }, - { - "epoch": 0.9134420951371302, - "grad_norm": 0.7109375, - "learning_rate": 1.6511583624246273e-05, - "loss": 1.4045, - "step": 2908 - }, - { - "epoch": 0.9140703221626716, - "grad_norm": 0.7734375, - "learning_rate": 1.6509044747699145e-05, - "loss": 1.2798, - "step": 2910 - }, - { - "epoch": 0.9146985491882129, - "grad_norm": 0.703125, - "learning_rate": 1.6506505871152016e-05, - "loss": 1.3729, - "step": 2912 - }, - { - "epoch": 0.9153267762137542, - "grad_norm": 0.84375, - "learning_rate": 1.650396699460489e-05, - "loss": 1.3434, - "step": 2914 - }, - { - "epoch": 0.9159550032392956, - "grad_norm": 0.73046875, - "learning_rate": 1.650142811805776e-05, - "loss": 1.3866, - "step": 2916 - }, - { - "epoch": 0.9165832302648369, - "grad_norm": 0.71875, - "learning_rate": 1.6498889241510634e-05, - "loss": 1.2233, - "step": 2918 - }, - { - "epoch": 0.9172114572903783, - "grad_norm": 0.95703125, - "learning_rate": 1.6496350364963505e-05, - "loss": 1.2631, - "step": 2920 - }, - { - "epoch": 0.9178396843159197, - "grad_norm": 0.6640625, - "learning_rate": 1.6493811488416377e-05, - "loss": 1.3768, - "step": 2922 - }, - { - "epoch": 0.918467911341461, - "grad_norm": 0.84375, - "learning_rate": 1.6491272611869248e-05, - "loss": 1.2722, - "step": 2924 - }, - { - "epoch": 0.9190961383670023, - "grad_norm": 0.83203125, - "learning_rate": 1.6488733735322123e-05, - "loss": 1.2799, - "step": 2926 - }, - { - "epoch": 0.9197243653925438, - "grad_norm": 0.859375, - "learning_rate": 1.6486194858774994e-05, - "loss": 1.2571, - "step": 2928 - }, - { - "epoch": 0.9203525924180851, - "grad_norm": 0.71875, - "learning_rate": 1.6483655982227865e-05, - "loss": 1.2148, - "step": 2930 - }, - { - "epoch": 0.9209808194436264, - "grad_norm": 0.74609375, - "learning_rate": 1.6481117105680737e-05, - "loss": 1.3129, - "step": 2932 - }, - { - "epoch": 0.9216090464691677, - "grad_norm": 0.71484375, - "learning_rate": 1.647857822913361e-05, - "loss": 1.2683, - "step": 2934 - }, - { - "epoch": 0.9222372734947092, - "grad_norm": 0.703125, - "learning_rate": 1.6476039352586483e-05, - "loss": 1.356, - "step": 2936 - }, - { - "epoch": 0.9228655005202505, - "grad_norm": 0.74609375, - "learning_rate": 1.6473500476039354e-05, - "loss": 1.2901, - "step": 2938 - }, - { - "epoch": 0.9234937275457918, - "grad_norm": 0.68359375, - "learning_rate": 1.6470961599492226e-05, - "loss": 1.4158, - "step": 2940 - }, - { - "epoch": 0.9241219545713332, - "grad_norm": 0.796875, - "learning_rate": 1.6468422722945097e-05, - "loss": 1.2391, - "step": 2942 - }, - { - "epoch": 0.9247501815968746, - "grad_norm": 0.74609375, - "learning_rate": 1.646588384639797e-05, - "loss": 1.3964, - "step": 2944 - }, - { - "epoch": 0.9253784086224159, - "grad_norm": 0.71875, - "learning_rate": 1.6463344969850843e-05, - "loss": 1.3187, - "step": 2946 - }, - { - "epoch": 0.9260066356479573, - "grad_norm": 0.6640625, - "learning_rate": 1.6460806093303715e-05, - "loss": 1.3794, - "step": 2948 - }, - { - "epoch": 0.9266348626734986, - "grad_norm": 0.69140625, - "learning_rate": 1.6458267216756586e-05, - "loss": 1.3897, - "step": 2950 - }, - { - "epoch": 0.92726308969904, - "grad_norm": 0.73046875, - "learning_rate": 1.6455728340209457e-05, - "loss": 1.2514, - "step": 2952 - }, - { - "epoch": 0.9278913167245814, - "grad_norm": 0.7265625, - "learning_rate": 1.6453189463662332e-05, - "loss": 1.2275, - "step": 2954 - }, - { - "epoch": 0.9285195437501227, - "grad_norm": 0.8828125, - "learning_rate": 1.6450650587115204e-05, - "loss": 1.3661, - "step": 2956 - }, - { - "epoch": 0.929147770775664, - "grad_norm": 0.703125, - "learning_rate": 1.6448111710568075e-05, - "loss": 1.3095, - "step": 2958 - }, - { - "epoch": 0.9297759978012055, - "grad_norm": 0.80859375, - "learning_rate": 1.6445572834020946e-05, - "loss": 1.4244, - "step": 2960 - }, - { - "epoch": 0.9304042248267468, - "grad_norm": 0.69921875, - "learning_rate": 1.644303395747382e-05, - "loss": 1.3683, - "step": 2962 - }, - { - "epoch": 0.9310324518522881, - "grad_norm": 0.71875, - "learning_rate": 1.644049508092669e-05, - "loss": 1.512, - "step": 2964 - }, - { - "epoch": 0.9316606788778294, - "grad_norm": 0.80859375, - "learning_rate": 1.6437956204379564e-05, - "loss": 1.4732, - "step": 2966 - }, - { - "epoch": 0.9322889059033709, - "grad_norm": 0.734375, - "learning_rate": 1.6435417327832435e-05, - "loss": 1.34, - "step": 2968 - }, - { - "epoch": 0.9329171329289122, - "grad_norm": 0.77734375, - "learning_rate": 1.6432878451285307e-05, - "loss": 1.2436, - "step": 2970 - }, - { - "epoch": 0.9335453599544535, - "grad_norm": 0.7421875, - "learning_rate": 1.6430339574738178e-05, - "loss": 1.3719, - "step": 2972 - }, - { - "epoch": 0.9341735869799949, - "grad_norm": 0.79296875, - "learning_rate": 1.6427800698191053e-05, - "loss": 1.3081, - "step": 2974 - }, - { - "epoch": 0.9348018140055363, - "grad_norm": 0.74609375, - "learning_rate": 1.6425261821643924e-05, - "loss": 1.3141, - "step": 2976 - }, - { - "epoch": 0.9354300410310776, - "grad_norm": 0.73828125, - "learning_rate": 1.6422722945096796e-05, - "loss": 1.3385, - "step": 2978 - }, - { - "epoch": 0.936058268056619, - "grad_norm": 0.73046875, - "learning_rate": 1.6420184068549667e-05, - "loss": 1.3452, - "step": 2980 - }, - { - "epoch": 0.9366864950821603, - "grad_norm": 0.80078125, - "learning_rate": 1.6417645192002542e-05, - "loss": 1.2058, - "step": 2982 - }, - { - "epoch": 0.9373147221077017, - "grad_norm": 0.75390625, - "learning_rate": 1.641510631545541e-05, - "loss": 1.3226, - "step": 2984 - }, - { - "epoch": 0.937942949133243, - "grad_norm": 0.7109375, - "learning_rate": 1.6412567438908284e-05, - "loss": 1.362, - "step": 2986 - }, - { - "epoch": 0.9385711761587844, - "grad_norm": 0.734375, - "learning_rate": 1.6410028562361156e-05, - "loss": 1.2904, - "step": 2988 - }, - { - "epoch": 0.9391994031843257, - "grad_norm": 0.875, - "learning_rate": 1.6407489685814027e-05, - "loss": 1.3076, - "step": 2990 - }, - { - "epoch": 0.939827630209867, - "grad_norm": 1.2890625, - "learning_rate": 1.64049508092669e-05, - "loss": 1.2634, - "step": 2992 - }, - { - "epoch": 0.9404558572354085, - "grad_norm": 0.77734375, - "learning_rate": 1.6402411932719773e-05, - "loss": 1.3545, - "step": 2994 - }, - { - "epoch": 0.9410840842609498, - "grad_norm": 0.73046875, - "learning_rate": 1.6399873056172645e-05, - "loss": 1.3319, - "step": 2996 - }, - { - "epoch": 0.9417123112864911, - "grad_norm": 0.75, - "learning_rate": 1.6397334179625516e-05, - "loss": 1.3534, - "step": 2998 - }, - { - "epoch": 0.9423405383120325, - "grad_norm": 0.68359375, - "learning_rate": 1.639479530307839e-05, - "loss": 1.2353, - "step": 3000 - }, - { - "epoch": 0.9429687653375739, - "grad_norm": 0.68359375, - "learning_rate": 1.6392256426531262e-05, - "loss": 1.2108, - "step": 3002 - }, - { - "epoch": 0.9435969923631152, - "grad_norm": 0.71875, - "learning_rate": 1.6389717549984134e-05, - "loss": 1.2961, - "step": 3004 - }, - { - "epoch": 0.9442252193886566, - "grad_norm": 0.734375, - "learning_rate": 1.6387178673437005e-05, - "loss": 1.2746, - "step": 3006 - }, - { - "epoch": 0.9448534464141979, - "grad_norm": 0.72265625, - "learning_rate": 1.638463979688988e-05, - "loss": 1.2231, - "step": 3008 - }, - { - "epoch": 0.9454816734397393, - "grad_norm": 0.85546875, - "learning_rate": 1.6382100920342748e-05, - "loss": 1.4304, - "step": 3010 - }, - { - "epoch": 0.9461099004652807, - "grad_norm": 0.87890625, - "learning_rate": 1.6379562043795623e-05, - "loss": 1.336, - "step": 3012 - }, - { - "epoch": 0.946738127490822, - "grad_norm": 0.78125, - "learning_rate": 1.6377023167248494e-05, - "loss": 1.2532, - "step": 3014 - }, - { - "epoch": 0.9473663545163633, - "grad_norm": 0.73046875, - "learning_rate": 1.6374484290701365e-05, - "loss": 1.3438, - "step": 3016 - }, - { - "epoch": 0.9479945815419047, - "grad_norm": 0.765625, - "learning_rate": 1.6371945414154237e-05, - "loss": 1.3412, - "step": 3018 - }, - { - "epoch": 0.9486228085674461, - "grad_norm": 0.7578125, - "learning_rate": 1.636940653760711e-05, - "loss": 1.2943, - "step": 3020 - }, - { - "epoch": 0.9492510355929874, - "grad_norm": 0.71484375, - "learning_rate": 1.6366867661059983e-05, - "loss": 1.3679, - "step": 3022 - }, - { - "epoch": 0.9498792626185287, - "grad_norm": 0.6953125, - "learning_rate": 1.6364328784512854e-05, - "loss": 1.2899, - "step": 3024 - }, - { - "epoch": 0.9505074896440702, - "grad_norm": 0.87109375, - "learning_rate": 1.6361789907965726e-05, - "loss": 1.4329, - "step": 3026 - }, - { - "epoch": 0.9511357166696115, - "grad_norm": 0.6875, - "learning_rate": 1.63592510314186e-05, - "loss": 1.2883, - "step": 3028 - }, - { - "epoch": 0.9517639436951528, - "grad_norm": 0.66796875, - "learning_rate": 1.6356712154871472e-05, - "loss": 1.2225, - "step": 3030 - }, - { - "epoch": 0.9523921707206942, - "grad_norm": 0.796875, - "learning_rate": 1.6354173278324343e-05, - "loss": 1.3128, - "step": 3032 - }, - { - "epoch": 0.9530203977462356, - "grad_norm": 0.8828125, - "learning_rate": 1.6351634401777215e-05, - "loss": 1.3125, - "step": 3034 - }, - { - "epoch": 0.9536486247717769, - "grad_norm": 0.7265625, - "learning_rate": 1.6349095525230086e-05, - "loss": 1.3294, - "step": 3036 - }, - { - "epoch": 0.9542768517973182, - "grad_norm": 0.79296875, - "learning_rate": 1.6346556648682957e-05, - "loss": 1.2799, - "step": 3038 - }, - { - "epoch": 0.9549050788228596, - "grad_norm": 0.703125, - "learning_rate": 1.6344017772135832e-05, - "loss": 1.3259, - "step": 3040 - }, - { - "epoch": 0.955533305848401, - "grad_norm": 0.66796875, - "learning_rate": 1.6341478895588704e-05, - "loss": 1.2925, - "step": 3042 - }, - { - "epoch": 0.9561615328739423, - "grad_norm": 0.81640625, - "learning_rate": 1.6338940019041575e-05, - "loss": 1.4347, - "step": 3044 - }, - { - "epoch": 0.9567897598994837, - "grad_norm": 0.71875, - "learning_rate": 1.6336401142494446e-05, - "loss": 1.1746, - "step": 3046 - }, - { - "epoch": 0.957417986925025, - "grad_norm": 0.8671875, - "learning_rate": 1.633386226594732e-05, - "loss": 1.3328, - "step": 3048 - }, - { - "epoch": 0.9580462139505664, - "grad_norm": 0.6953125, - "learning_rate": 1.6331323389400192e-05, - "loss": 1.3283, - "step": 3050 - }, - { - "epoch": 0.9586744409761078, - "grad_norm": 0.67578125, - "learning_rate": 1.6328784512853064e-05, - "loss": 1.4036, - "step": 3052 - }, - { - "epoch": 0.9593026680016491, - "grad_norm": 0.83984375, - "learning_rate": 1.6326245636305935e-05, - "loss": 1.2255, - "step": 3054 - }, - { - "epoch": 0.9599308950271904, - "grad_norm": 0.7265625, - "learning_rate": 1.632370675975881e-05, - "loss": 1.2382, - "step": 3056 - }, - { - "epoch": 0.9605591220527318, - "grad_norm": 0.72265625, - "learning_rate": 1.6321167883211678e-05, - "loss": 1.311, - "step": 3058 - }, - { - "epoch": 0.9611873490782732, - "grad_norm": 0.7109375, - "learning_rate": 1.6318629006664553e-05, - "loss": 1.2916, - "step": 3060 - }, - { - "epoch": 0.9618155761038145, - "grad_norm": 0.734375, - "learning_rate": 1.6316090130117424e-05, - "loss": 1.2996, - "step": 3062 - }, - { - "epoch": 0.9624438031293558, - "grad_norm": 0.6796875, - "learning_rate": 1.6313551253570295e-05, - "loss": 1.3253, - "step": 3064 - }, - { - "epoch": 0.9630720301548972, - "grad_norm": 0.66796875, - "learning_rate": 1.6311012377023167e-05, - "loss": 1.3582, - "step": 3066 - }, - { - "epoch": 0.9637002571804386, - "grad_norm": 0.671875, - "learning_rate": 1.630847350047604e-05, - "loss": 1.3637, - "step": 3068 - }, - { - "epoch": 0.9643284842059799, - "grad_norm": 0.80859375, - "learning_rate": 1.6305934623928913e-05, - "loss": 1.2882, - "step": 3070 - }, - { - "epoch": 0.9649567112315213, - "grad_norm": 1.1171875, - "learning_rate": 1.6303395747381784e-05, - "loss": 1.2281, - "step": 3072 - }, - { - "epoch": 0.9655849382570626, - "grad_norm": 0.80078125, - "learning_rate": 1.6300856870834656e-05, - "loss": 1.3915, - "step": 3074 - }, - { - "epoch": 0.966213165282604, - "grad_norm": 0.75, - "learning_rate": 1.629831799428753e-05, - "loss": 1.401, - "step": 3076 - }, - { - "epoch": 0.9668413923081454, - "grad_norm": 0.73828125, - "learning_rate": 1.62957791177404e-05, - "loss": 1.2698, - "step": 3078 - }, - { - "epoch": 0.9674696193336867, - "grad_norm": 0.75, - "learning_rate": 1.6293240241193273e-05, - "loss": 1.3833, - "step": 3080 - }, - { - "epoch": 0.968097846359228, - "grad_norm": 0.84375, - "learning_rate": 1.6290701364646148e-05, - "loss": 1.2167, - "step": 3082 - }, - { - "epoch": 0.9687260733847695, - "grad_norm": 0.6875, - "learning_rate": 1.6288162488099016e-05, - "loss": 1.3872, - "step": 3084 - }, - { - "epoch": 0.9693543004103108, - "grad_norm": 0.65625, - "learning_rate": 1.628562361155189e-05, - "loss": 1.1942, - "step": 3086 - }, - { - "epoch": 0.9699825274358521, - "grad_norm": 0.91015625, - "learning_rate": 1.6283084735004762e-05, - "loss": 1.3442, - "step": 3088 - }, - { - "epoch": 0.9706107544613934, - "grad_norm": 0.68359375, - "learning_rate": 1.6280545858457634e-05, - "loss": 1.1895, - "step": 3090 - }, - { - "epoch": 0.9712389814869349, - "grad_norm": 0.765625, - "learning_rate": 1.6278006981910505e-05, - "loss": 1.2817, - "step": 3092 - }, - { - "epoch": 0.9718672085124762, - "grad_norm": 0.85546875, - "learning_rate": 1.627546810536338e-05, - "loss": 1.3469, - "step": 3094 - }, - { - "epoch": 0.9724954355380175, - "grad_norm": 0.76171875, - "learning_rate": 1.627292922881625e-05, - "loss": 1.3717, - "step": 3096 - }, - { - "epoch": 0.9731236625635589, - "grad_norm": 0.6953125, - "learning_rate": 1.6270390352269123e-05, - "loss": 1.3015, - "step": 3098 - }, - { - "epoch": 0.9737518895891003, - "grad_norm": 0.703125, - "learning_rate": 1.6267851475721994e-05, - "loss": 1.4265, - "step": 3100 - }, - { - "epoch": 0.9743801166146416, - "grad_norm": 0.80859375, - "learning_rate": 1.626531259917487e-05, - "loss": 1.2424, - "step": 3102 - }, - { - "epoch": 0.975008343640183, - "grad_norm": 0.8125, - "learning_rate": 1.6262773722627737e-05, - "loss": 1.3252, - "step": 3104 - }, - { - "epoch": 0.9756365706657243, - "grad_norm": 0.87890625, - "learning_rate": 1.626023484608061e-05, - "loss": 1.3464, - "step": 3106 - }, - { - "epoch": 0.9762647976912657, - "grad_norm": 0.81640625, - "learning_rate": 1.6257695969533483e-05, - "loss": 1.3007, - "step": 3108 - }, - { - "epoch": 0.9768930247168071, - "grad_norm": 0.90234375, - "learning_rate": 1.6255157092986354e-05, - "loss": 1.2549, - "step": 3110 - }, - { - "epoch": 0.9775212517423484, - "grad_norm": 0.76171875, - "learning_rate": 1.6252618216439226e-05, - "loss": 1.3299, - "step": 3112 - }, - { - "epoch": 0.9781494787678897, - "grad_norm": 0.77734375, - "learning_rate": 1.62500793398921e-05, - "loss": 1.3118, - "step": 3114 - }, - { - "epoch": 0.9787777057934312, - "grad_norm": 0.765625, - "learning_rate": 1.6247540463344972e-05, - "loss": 1.3114, - "step": 3116 - }, - { - "epoch": 0.9794059328189725, - "grad_norm": 0.66796875, - "learning_rate": 1.6245001586797843e-05, - "loss": 1.2729, - "step": 3118 - }, - { - "epoch": 0.9800341598445138, - "grad_norm": 0.69921875, - "learning_rate": 1.6242462710250715e-05, - "loss": 1.3769, - "step": 3120 - }, - { - "epoch": 0.9806623868700551, - "grad_norm": 0.65234375, - "learning_rate": 1.623992383370359e-05, - "loss": 1.2581, - "step": 3122 - }, - { - "epoch": 0.9812906138955966, - "grad_norm": 0.73046875, - "learning_rate": 1.623738495715646e-05, - "loss": 1.3578, - "step": 3124 - }, - { - "epoch": 0.9819188409211379, - "grad_norm": 0.67578125, - "learning_rate": 1.6234846080609332e-05, - "loss": 1.3055, - "step": 3126 - }, - { - "epoch": 0.9825470679466792, - "grad_norm": 1.3125, - "learning_rate": 1.6232307204062203e-05, - "loss": 1.2103, - "step": 3128 - }, - { - "epoch": 0.9831752949722206, - "grad_norm": 0.79296875, - "learning_rate": 1.6229768327515075e-05, - "loss": 1.3818, - "step": 3130 - }, - { - "epoch": 0.9838035219977619, - "grad_norm": 0.703125, - "learning_rate": 1.6227229450967946e-05, - "loss": 1.3531, - "step": 3132 - }, - { - "epoch": 0.9844317490233033, - "grad_norm": 0.73046875, - "learning_rate": 1.622469057442082e-05, - "loss": 1.2813, - "step": 3134 - }, - { - "epoch": 0.9850599760488447, - "grad_norm": 0.7578125, - "learning_rate": 1.6222151697873692e-05, - "loss": 1.3331, - "step": 3136 - }, - { - "epoch": 0.985688203074386, - "grad_norm": 0.72265625, - "learning_rate": 1.6219612821326564e-05, - "loss": 1.3681, - "step": 3138 - }, - { - "epoch": 0.9863164300999273, - "grad_norm": 0.7421875, - "learning_rate": 1.6217073944779435e-05, - "loss": 1.3545, - "step": 3140 - }, - { - "epoch": 0.9869446571254687, - "grad_norm": 0.703125, - "learning_rate": 1.621453506823231e-05, - "loss": 1.392, - "step": 3142 - }, - { - "epoch": 0.9875728841510101, - "grad_norm": 0.79296875, - "learning_rate": 1.621199619168518e-05, - "loss": 1.3232, - "step": 3144 - }, - { - "epoch": 0.9882011111765514, - "grad_norm": 0.7578125, - "learning_rate": 1.6209457315138053e-05, - "loss": 1.2144, - "step": 3146 - }, - { - "epoch": 0.9888293382020927, - "grad_norm": 0.6796875, - "learning_rate": 1.6206918438590924e-05, - "loss": 1.3129, - "step": 3148 - }, - { - "epoch": 0.9894575652276342, - "grad_norm": 0.74609375, - "learning_rate": 1.62043795620438e-05, - "loss": 1.3398, - "step": 3150 - }, - { - "epoch": 0.9900857922531755, - "grad_norm": 0.79296875, - "learning_rate": 1.6201840685496667e-05, - "loss": 1.3094, - "step": 3152 - }, - { - "epoch": 0.9907140192787168, - "grad_norm": 0.66796875, - "learning_rate": 1.619930180894954e-05, - "loss": 1.436, - "step": 3154 - }, - { - "epoch": 0.9913422463042582, - "grad_norm": 0.828125, - "learning_rate": 1.6196762932402413e-05, - "loss": 1.4225, - "step": 3156 - }, - { - "epoch": 0.9919704733297996, - "grad_norm": 0.76953125, - "learning_rate": 1.6194224055855284e-05, - "loss": 1.2521, - "step": 3158 - }, - { - "epoch": 0.9925987003553409, - "grad_norm": 0.7265625, - "learning_rate": 1.6191685179308156e-05, - "loss": 1.2926, - "step": 3160 - }, - { - "epoch": 0.9932269273808823, - "grad_norm": 0.66796875, - "learning_rate": 1.618914630276103e-05, - "loss": 1.222, - "step": 3162 - }, - { - "epoch": 0.9938551544064236, - "grad_norm": 0.87890625, - "learning_rate": 1.6186607426213902e-05, - "loss": 1.3083, - "step": 3164 - }, - { - "epoch": 0.994483381431965, - "grad_norm": 0.66015625, - "learning_rate": 1.6184068549666773e-05, - "loss": 1.3349, - "step": 3166 - }, - { - "epoch": 0.9951116084575063, - "grad_norm": 0.66015625, - "learning_rate": 1.6181529673119648e-05, - "loss": 1.306, - "step": 3168 - }, - { - "epoch": 0.9957398354830477, - "grad_norm": 0.796875, - "learning_rate": 1.617899079657252e-05, - "loss": 1.2053, - "step": 3170 - }, - { - "epoch": 0.996368062508589, - "grad_norm": 0.91015625, - "learning_rate": 1.617645192002539e-05, - "loss": 1.2241, - "step": 3172 - }, - { - "epoch": 0.9969962895341304, - "grad_norm": 0.78125, - "learning_rate": 1.6173913043478262e-05, - "loss": 1.1861, - "step": 3174 - }, - { - "epoch": 0.9976245165596718, - "grad_norm": 0.6953125, - "learning_rate": 1.6171374166931137e-05, - "loss": 1.3181, - "step": 3176 - }, - { - "epoch": 0.9982527435852131, - "grad_norm": 0.7265625, - "learning_rate": 1.6168835290384005e-05, - "loss": 1.2654, - "step": 3178 - }, - { - "epoch": 0.9988809706107544, - "grad_norm": 0.77734375, - "learning_rate": 1.616629641383688e-05, - "loss": 1.435, - "step": 3180 - }, - { - "epoch": 0.9995091976362959, - "grad_norm": 0.75, - "learning_rate": 1.616375753728975e-05, - "loss": 1.4596, - "step": 3182 - }, - { - "epoch": 1.000137424661837, - "grad_norm": 0.68359375, - "learning_rate": 1.6161218660742622e-05, - "loss": 1.3658, - "step": 3184 - }, - { - "epoch": 1.0007656516873786, - "grad_norm": 0.67578125, - "learning_rate": 1.6158679784195494e-05, - "loss": 1.278, - "step": 3186 - }, - { - "epoch": 1.00139387871292, - "grad_norm": 0.6640625, - "learning_rate": 1.615614090764837e-05, - "loss": 1.3165, - "step": 3188 - }, - { - "epoch": 1.0020221057384613, - "grad_norm": 0.68359375, - "learning_rate": 1.615360203110124e-05, - "loss": 1.292, - "step": 3190 - }, - { - "epoch": 1.0026503327640026, - "grad_norm": 0.7109375, - "learning_rate": 1.615106315455411e-05, - "loss": 1.2257, - "step": 3192 - }, - { - "epoch": 1.003278559789544, - "grad_norm": 0.78125, - "learning_rate": 1.6148524278006983e-05, - "loss": 1.256, - "step": 3194 - }, - { - "epoch": 1.0039067868150853, - "grad_norm": 0.83984375, - "learning_rate": 1.6145985401459858e-05, - "loss": 1.2126, - "step": 3196 - }, - { - "epoch": 1.0045350138406266, - "grad_norm": 0.75390625, - "learning_rate": 1.6143446524912726e-05, - "loss": 1.2712, - "step": 3198 - }, - { - "epoch": 1.005163240866168, - "grad_norm": 0.76953125, - "learning_rate": 1.61409076483656e-05, - "loss": 1.1845, - "step": 3200 - } - ], - "logging_steps": 2, - "max_steps": 15915, - "num_input_tokens_seen": 0, - "num_train_epochs": 5, - "save_steps": 100, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 8.666866260954317e+18, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}