diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20498 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 11696, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.3728413581848145, + "learning_rate": 4.998290013679891e-05, + "loss": 10.0117, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 2.599367380142212, + "learning_rate": 4.996580027359781e-05, + "loss": 9.1745, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 2.4951720237731934, + "learning_rate": 4.994870041039672e-05, + "loss": 8.7624, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 2.4932913780212402, + "learning_rate": 4.9931600547195625e-05, + "loss": 8.5709, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 2.3983781337738037, + "learning_rate": 4.991450068399453e-05, + "loss": 8.4411, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 2.2340972423553467, + "learning_rate": 4.989740082079344e-05, + "loss": 8.1957, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 2.1964590549468994, + "learning_rate": 4.988030095759234e-05, + "loss": 8.0598, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 1.9686617851257324, + "learning_rate": 4.986320109439125e-05, + "loss": 7.8679, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 1.8235087394714355, + "learning_rate": 4.984610123119015e-05, + "loss": 7.7621, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 1.7598885297775269, + "learning_rate": 4.9829001367989056e-05, + "loss": 7.5417, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 1.94789719581604, + "learning_rate": 4.981190150478797e-05, + "loss": 7.3331, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 1.9366331100463867, + "learning_rate": 4.979480164158687e-05, + "loss": 7.1039, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 2.1228675842285156, + "learning_rate": 4.977770177838578e-05, + "loss": 6.9965, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 1.9855167865753174, + "learning_rate": 4.976060191518468e-05, + "loss": 6.8865, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 1.8035985231399536, + "learning_rate": 4.9743502051983585e-05, + "loss": 6.6322, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 1.8589977025985718, + "learning_rate": 4.9726402188782486e-05, + "loss": 6.5616, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.745139241218567, + "learning_rate": 4.97093023255814e-05, + "loss": 6.3479, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.266196608543396, + "learning_rate": 4.969220246238031e-05, + "loss": 6.2343, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 2.005223035812378, + "learning_rate": 4.967510259917921e-05, + "loss": 6.1866, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.5017377138137817, + "learning_rate": 4.9658002735978115e-05, + "loss": 5.972, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 1.6136974096298218, + "learning_rate": 4.9640902872777016e-05, + "loss": 6.1335, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.3865970373153687, + "learning_rate": 4.962380300957592e-05, + "loss": 5.944, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.283933162689209, + "learning_rate": 4.960670314637483e-05, + "loss": 5.8713, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.1588549613952637, + "learning_rate": 4.958960328317374e-05, + "loss": 5.7731, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 1.3081687688827515, + "learning_rate": 4.9572503419972645e-05, + "loss": 5.7429, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.6212745904922485, + "learning_rate": 4.9555403556771546e-05, + "loss": 5.481, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.709843397140503, + "learning_rate": 4.953830369357045e-05, + "loss": 5.4855, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.2814812660217285, + "learning_rate": 4.952120383036936e-05, + "loss": 5.4367, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 1.5539968013763428, + "learning_rate": 4.950410396716826e-05, + "loss": 5.2652, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.500375509262085, + "learning_rate": 4.948700410396717e-05, + "loss": 5.5137, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 1.7938899993896484, + "learning_rate": 4.9469904240766076e-05, + "loss": 5.4663, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.6169573068618774, + "learning_rate": 4.945280437756498e-05, + "loss": 5.3137, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.1904114484786987, + "learning_rate": 4.943570451436389e-05, + "loss": 5.1871, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 2.5564723014831543, + "learning_rate": 4.941860465116279e-05, + "loss": 5.4058, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.6187268495559692, + "learning_rate": 4.94015047879617e-05, + "loss": 5.1919, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.2222367525100708, + "learning_rate": 4.93844049247606e-05, + "loss": 5.1876, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 1.5898586511611938, + "learning_rate": 4.936730506155951e-05, + "loss": 5.1217, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.446902871131897, + "learning_rate": 4.935020519835842e-05, + "loss": 5.0582, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 1.862309217453003, + "learning_rate": 4.933310533515732e-05, + "loss": 4.9028, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.5455704927444458, + "learning_rate": 4.931600547195623e-05, + "loss": 5.0352, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.772558569908142, + "learning_rate": 4.929890560875513e-05, + "loss": 4.9707, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 1.8154480457305908, + "learning_rate": 4.9281805745554036e-05, + "loss": 5.0588, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 1.4536504745483398, + "learning_rate": 4.9264705882352944e-05, + "loss": 4.9997, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 1.6166527271270752, + "learning_rate": 4.924760601915185e-05, + "loss": 4.9472, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 2.1260979175567627, + "learning_rate": 4.923050615595076e-05, + "loss": 4.944, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 1.4778213500976562, + "learning_rate": 4.921340629274966e-05, + "loss": 4.8657, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 1.255488395690918, + "learning_rate": 4.9196306429548566e-05, + "loss": 4.7728, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 1.5698314905166626, + "learning_rate": 4.917920656634747e-05, + "loss": 4.9433, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 1.342402696609497, + "learning_rate": 4.9162106703146374e-05, + "loss": 4.6336, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 1.4427978992462158, + "learning_rate": 4.914500683994528e-05, + "loss": 4.7112, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.6528425216674805, + "learning_rate": 4.912790697674419e-05, + "loss": 4.876, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 1.706386923789978, + "learning_rate": 4.9110807113543096e-05, + "loss": 4.5971, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 1.7773737907409668, + "learning_rate": 4.9093707250342e-05, + "loss": 4.7944, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 1.7884222269058228, + "learning_rate": 4.9076607387140904e-05, + "loss": 4.5912, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 1.5389587879180908, + "learning_rate": 4.905950752393981e-05, + "loss": 4.6656, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 1.7006316184997559, + "learning_rate": 4.904240766073871e-05, + "loss": 4.6589, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 1.6314700841903687, + "learning_rate": 4.9025307797537626e-05, + "loss": 4.5036, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 1.7512930631637573, + "learning_rate": 4.900820793433653e-05, + "loss": 4.5105, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 1.9109033346176147, + "learning_rate": 4.8991108071135434e-05, + "loss": 4.4449, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 1.547317385673523, + "learning_rate": 4.897400820793434e-05, + "loss": 4.4906, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 1.5143060684204102, + "learning_rate": 4.895690834473324e-05, + "loss": 4.3545, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 1.6648136377334595, + "learning_rate": 4.893980848153215e-05, + "loss": 4.4093, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 1.5027951002120972, + "learning_rate": 4.892270861833106e-05, + "loss": 4.3444, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 1.9429397583007812, + "learning_rate": 4.8905608755129964e-05, + "loss": 4.4937, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 1.4009640216827393, + "learning_rate": 4.888850889192887e-05, + "loss": 4.3625, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 1.7445279359817505, + "learning_rate": 4.887140902872777e-05, + "loss": 4.2759, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 1.4381403923034668, + "learning_rate": 4.885430916552668e-05, + "loss": 4.574, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 1.5582979917526245, + "learning_rate": 4.883720930232558e-05, + "loss": 4.3831, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 1.461166501045227, + "learning_rate": 4.882010943912449e-05, + "loss": 4.5049, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 1.756549596786499, + "learning_rate": 4.8803009575923394e-05, + "loss": 4.1334, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 1.9006497859954834, + "learning_rate": 4.87859097127223e-05, + "loss": 4.3406, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 1.4551759958267212, + "learning_rate": 4.876880984952121e-05, + "loss": 4.4688, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 1.5335747003555298, + "learning_rate": 4.875170998632011e-05, + "loss": 4.2096, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 1.4802557229995728, + "learning_rate": 4.873461012311902e-05, + "loss": 4.0918, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 1.9993939399719238, + "learning_rate": 4.8717510259917924e-05, + "loss": 4.3636, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 1.706895351409912, + "learning_rate": 4.8700410396716825e-05, + "loss": 4.0891, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 1.4630484580993652, + "learning_rate": 4.868331053351574e-05, + "loss": 4.3254, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 1.4353913068771362, + "learning_rate": 4.866621067031464e-05, + "loss": 4.1935, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 1.5042275190353394, + "learning_rate": 4.864911080711355e-05, + "loss": 4.0913, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 1.792472004890442, + "learning_rate": 4.863201094391245e-05, + "loss": 4.2159, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 1.4059948921203613, + "learning_rate": 4.8614911080711355e-05, + "loss": 4.2934, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 1.8408161401748657, + "learning_rate": 4.859781121751026e-05, + "loss": 4.1482, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 1.716046690940857, + "learning_rate": 4.858071135430917e-05, + "loss": 4.0273, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 1.8415429592132568, + "learning_rate": 4.856361149110808e-05, + "loss": 4.0837, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 1.4880731105804443, + "learning_rate": 4.854651162790698e-05, + "loss": 4.1957, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 1.8150016069412231, + "learning_rate": 4.8529411764705885e-05, + "loss": 4.225, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 1.7828190326690674, + "learning_rate": 4.851231190150479e-05, + "loss": 4.1053, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 1.8549565076828003, + "learning_rate": 4.849521203830369e-05, + "loss": 3.8068, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 1.4295508861541748, + "learning_rate": 4.84781121751026e-05, + "loss": 3.9358, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 1.4277971982955933, + "learning_rate": 4.846101231190151e-05, + "loss": 4.1512, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 1.4937466382980347, + "learning_rate": 4.8443912448700415e-05, + "loss": 4.0162, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 1.3076913356781006, + "learning_rate": 4.842681258549932e-05, + "loss": 4.0336, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 1.673946738243103, + "learning_rate": 4.840971272229822e-05, + "loss": 4.0915, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 1.5978224277496338, + "learning_rate": 4.839261285909713e-05, + "loss": 4.137, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 2.214573383331299, + "learning_rate": 4.837551299589603e-05, + "loss": 3.8231, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 1.4263286590576172, + "learning_rate": 4.835841313269494e-05, + "loss": 3.9326, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 1.694931149482727, + "learning_rate": 4.8341313269493845e-05, + "loss": 4.0673, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 1.558498740196228, + "learning_rate": 4.832421340629275e-05, + "loss": 3.9454, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 1.5231866836547852, + "learning_rate": 4.830711354309166e-05, + "loss": 4.0214, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 1.6748324632644653, + "learning_rate": 4.829001367989056e-05, + "loss": 3.9983, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 1.3733506202697754, + "learning_rate": 4.827291381668947e-05, + "loss": 3.8526, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 1.6695655584335327, + "learning_rate": 4.8255813953488375e-05, + "loss": 3.8948, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 1.3915668725967407, + "learning_rate": 4.8238714090287276e-05, + "loss": 3.8511, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 1.5736514329910278, + "learning_rate": 4.822161422708619e-05, + "loss": 3.9446, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 1.562687635421753, + "learning_rate": 4.820451436388509e-05, + "loss": 3.6682, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 1.4215232133865356, + "learning_rate": 4.8187414500684e-05, + "loss": 3.6618, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 1.8750267028808594, + "learning_rate": 4.8170314637482905e-05, + "loss": 3.8246, + "step": 428 + }, + { + "epoch": 0.04, + "grad_norm": 1.5843207836151123, + "learning_rate": 4.8153214774281806e-05, + "loss": 3.807, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 1.6384755373001099, + "learning_rate": 4.813611491108071e-05, + "loss": 3.8329, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 1.646612286567688, + "learning_rate": 4.811901504787962e-05, + "loss": 3.9736, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 1.592463493347168, + "learning_rate": 4.810191518467853e-05, + "loss": 3.6107, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 1.6230803728103638, + "learning_rate": 4.808481532147743e-05, + "loss": 3.8141, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 1.7098625898361206, + "learning_rate": 4.8067715458276336e-05, + "loss": 3.8751, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 1.441146731376648, + "learning_rate": 4.805061559507524e-05, + "loss": 3.8461, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 1.4387036561965942, + "learning_rate": 4.8033515731874144e-05, + "loss": 3.8432, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 1.6620376110076904, + "learning_rate": 4.801641586867305e-05, + "loss": 3.7844, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 1.5403114557266235, + "learning_rate": 4.799931600547196e-05, + "loss": 3.6556, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 1.5642478466033936, + "learning_rate": 4.7982216142270866e-05, + "loss": 3.7526, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 1.5027506351470947, + "learning_rate": 4.796511627906977e-05, + "loss": 3.7862, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 1.5570485591888428, + "learning_rate": 4.7948016415868674e-05, + "loss": 3.7656, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 2.2708029747009277, + "learning_rate": 4.793091655266758e-05, + "loss": 3.8742, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 1.5446516275405884, + "learning_rate": 4.791381668946648e-05, + "loss": 3.6401, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 1.5480107069015503, + "learning_rate": 4.789671682626539e-05, + "loss": 3.7441, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 1.5571659803390503, + "learning_rate": 4.78796169630643e-05, + "loss": 3.6849, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 1.9021155834197998, + "learning_rate": 4.7862517099863204e-05, + "loss": 3.7989, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 1.5292832851409912, + "learning_rate": 4.784541723666211e-05, + "loss": 3.8129, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 1.5750986337661743, + "learning_rate": 4.782831737346101e-05, + "loss": 3.648, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 1.6995611190795898, + "learning_rate": 4.781121751025992e-05, + "loss": 3.7686, + "step": 512 + }, + { + "epoch": 0.04, + "grad_norm": 1.7573764324188232, + "learning_rate": 4.7794117647058826e-05, + "loss": 3.6636, + "step": 516 + }, + { + "epoch": 0.04, + "grad_norm": 1.5131741762161255, + "learning_rate": 4.7777017783857733e-05, + "loss": 3.5793, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 1.6838548183441162, + "learning_rate": 4.775991792065664e-05, + "loss": 3.6336, + "step": 524 + }, + { + "epoch": 0.05, + "grad_norm": 1.9402954578399658, + "learning_rate": 4.774281805745554e-05, + "loss": 3.6733, + "step": 528 + }, + { + "epoch": 0.05, + "grad_norm": 1.7752444744110107, + "learning_rate": 4.772571819425445e-05, + "loss": 3.8021, + "step": 532 + }, + { + "epoch": 0.05, + "grad_norm": 1.4470579624176025, + "learning_rate": 4.7708618331053356e-05, + "loss": 3.6526, + "step": 536 + }, + { + "epoch": 0.05, + "grad_norm": 1.5560115575790405, + "learning_rate": 4.769151846785226e-05, + "loss": 3.7236, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 1.6849740743637085, + "learning_rate": 4.7674418604651164e-05, + "loss": 3.7586, + "step": 544 + }, + { + "epoch": 0.05, + "grad_norm": 1.4917711019515991, + "learning_rate": 4.765731874145007e-05, + "loss": 3.7791, + "step": 548 + }, + { + "epoch": 0.05, + "grad_norm": 1.791801929473877, + "learning_rate": 4.764021887824898e-05, + "loss": 3.5659, + "step": 552 + }, + { + "epoch": 0.05, + "grad_norm": 1.531817078590393, + "learning_rate": 4.7623119015047886e-05, + "loss": 3.5811, + "step": 556 + }, + { + "epoch": 0.05, + "grad_norm": 1.8725793361663818, + "learning_rate": 4.7606019151846787e-05, + "loss": 3.7337, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 1.5851105451583862, + "learning_rate": 4.7588919288645694e-05, + "loss": 3.4899, + "step": 564 + }, + { + "epoch": 0.05, + "grad_norm": 1.4937329292297363, + "learning_rate": 4.7571819425444594e-05, + "loss": 3.5653, + "step": 568 + }, + { + "epoch": 0.05, + "grad_norm": 1.8102850914001465, + "learning_rate": 4.75547195622435e-05, + "loss": 3.517, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 1.4572982788085938, + "learning_rate": 4.753761969904241e-05, + "loss": 3.723, + "step": 576 + }, + { + "epoch": 0.05, + "grad_norm": 1.5815645456314087, + "learning_rate": 4.7520519835841317e-05, + "loss": 3.6615, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 1.6406457424163818, + "learning_rate": 4.7503419972640224e-05, + "loss": 3.6796, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 1.8512486219406128, + "learning_rate": 4.7486320109439124e-05, + "loss": 3.5586, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 1.6738507747650146, + "learning_rate": 4.746922024623803e-05, + "loss": 3.4724, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 1.766518235206604, + "learning_rate": 4.745212038303693e-05, + "loss": 3.7213, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 1.681229591369629, + "learning_rate": 4.7435020519835846e-05, + "loss": 3.6538, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 1.7900785207748413, + "learning_rate": 4.7417920656634754e-05, + "loss": 3.7952, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 1.5610637664794922, + "learning_rate": 4.7400820793433654e-05, + "loss": 3.5255, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 1.3365826606750488, + "learning_rate": 4.738372093023256e-05, + "loss": 3.5621, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 1.8946609497070312, + "learning_rate": 4.736662106703146e-05, + "loss": 3.5664, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 1.728330135345459, + "learning_rate": 4.734952120383037e-05, + "loss": 3.7577, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 1.7583847045898438, + "learning_rate": 4.733242134062928e-05, + "loss": 3.5722, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 1.9061710834503174, + "learning_rate": 4.7315321477428184e-05, + "loss": 3.3436, + "step": 628 + }, + { + "epoch": 0.05, + "grad_norm": 1.7074024677276611, + "learning_rate": 4.729822161422709e-05, + "loss": 3.5426, + "step": 632 + }, + { + "epoch": 0.05, + "grad_norm": 1.649793028831482, + "learning_rate": 4.728112175102599e-05, + "loss": 3.5826, + "step": 636 + }, + { + "epoch": 0.05, + "grad_norm": 1.7260462045669556, + "learning_rate": 4.72640218878249e-05, + "loss": 3.5353, + "step": 640 + }, + { + "epoch": 0.06, + "grad_norm": 1.745391607284546, + "learning_rate": 4.724692202462381e-05, + "loss": 3.3534, + "step": 644 + }, + { + "epoch": 0.06, + "grad_norm": 1.374226450920105, + "learning_rate": 4.722982216142271e-05, + "loss": 3.386, + "step": 648 + }, + { + "epoch": 0.06, + "grad_norm": 1.8732798099517822, + "learning_rate": 4.7212722298221615e-05, + "loss": 3.6073, + "step": 652 + }, + { + "epoch": 0.06, + "grad_norm": 2.4075376987457275, + "learning_rate": 4.719562243502052e-05, + "loss": 3.5393, + "step": 656 + }, + { + "epoch": 0.06, + "grad_norm": 1.54508638381958, + "learning_rate": 4.717852257181943e-05, + "loss": 3.475, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 1.5737498998641968, + "learning_rate": 4.716142270861834e-05, + "loss": 3.5497, + "step": 664 + }, + { + "epoch": 0.06, + "grad_norm": 1.8623074293136597, + "learning_rate": 4.714432284541724e-05, + "loss": 3.4603, + "step": 668 + }, + { + "epoch": 0.06, + "grad_norm": 1.7199251651763916, + "learning_rate": 4.7127222982216145e-05, + "loss": 3.5801, + "step": 672 + }, + { + "epoch": 0.06, + "grad_norm": 1.5866843461990356, + "learning_rate": 4.7110123119015045e-05, + "loss": 3.3454, + "step": 676 + }, + { + "epoch": 0.06, + "grad_norm": 1.9907779693603516, + "learning_rate": 4.709302325581396e-05, + "loss": 3.6247, + "step": 680 + }, + { + "epoch": 0.06, + "grad_norm": 1.3720687627792358, + "learning_rate": 4.707592339261287e-05, + "loss": 3.3614, + "step": 684 + }, + { + "epoch": 0.06, + "grad_norm": 1.739660620689392, + "learning_rate": 4.705882352941177e-05, + "loss": 3.4211, + "step": 688 + }, + { + "epoch": 0.06, + "grad_norm": 1.6425236463546753, + "learning_rate": 4.7041723666210675e-05, + "loss": 3.423, + "step": 692 + }, + { + "epoch": 0.06, + "grad_norm": 1.5457091331481934, + "learning_rate": 4.7024623803009575e-05, + "loss": 3.3258, + "step": 696 + }, + { + "epoch": 0.06, + "grad_norm": 1.5979949235916138, + "learning_rate": 4.700752393980848e-05, + "loss": 3.2838, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 1.6761040687561035, + "learning_rate": 4.699042407660739e-05, + "loss": 3.334, + "step": 704 + }, + { + "epoch": 0.06, + "grad_norm": 1.552573323249817, + "learning_rate": 4.69733242134063e-05, + "loss": 3.5355, + "step": 708 + }, + { + "epoch": 0.06, + "grad_norm": 1.6743354797363281, + "learning_rate": 4.6956224350205205e-05, + "loss": 3.4076, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 4.555662155151367, + "learning_rate": 4.6939124487004105e-05, + "loss": 3.7446, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 1.5942860841751099, + "learning_rate": 4.692202462380301e-05, + "loss": 3.553, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 1.9089189767837524, + "learning_rate": 4.690492476060191e-05, + "loss": 3.3584, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 2.0768373012542725, + "learning_rate": 4.688782489740082e-05, + "loss": 3.4295, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 3.3060362339019775, + "learning_rate": 4.687072503419973e-05, + "loss": 3.415, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 1.9307421445846558, + "learning_rate": 4.6853625170998635e-05, + "loss": 3.4901, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 1.7142446041107178, + "learning_rate": 4.683652530779754e-05, + "loss": 3.6096, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 1.7756657600402832, + "learning_rate": 4.681942544459644e-05, + "loss": 3.4534, + "step": 744 + }, + { + "epoch": 0.06, + "grad_norm": 1.5459442138671875, + "learning_rate": 4.680232558139535e-05, + "loss": 3.3813, + "step": 748 + }, + { + "epoch": 0.06, + "grad_norm": 2.1961963176727295, + "learning_rate": 4.678522571819426e-05, + "loss": 3.3467, + "step": 752 + }, + { + "epoch": 0.06, + "grad_norm": 1.7320613861083984, + "learning_rate": 4.676812585499316e-05, + "loss": 3.3927, + "step": 756 + }, + { + "epoch": 0.06, + "grad_norm": 1.6255303621292114, + "learning_rate": 4.6751025991792066e-05, + "loss": 3.4644, + "step": 760 + }, + { + "epoch": 0.07, + "grad_norm": 1.5142388343811035, + "learning_rate": 4.673392612859097e-05, + "loss": 3.4657, + "step": 764 + }, + { + "epoch": 0.07, + "grad_norm": 1.5093744993209839, + "learning_rate": 4.671682626538988e-05, + "loss": 3.3144, + "step": 768 + }, + { + "epoch": 0.07, + "grad_norm": 1.4730446338653564, + "learning_rate": 4.669972640218879e-05, + "loss": 3.3382, + "step": 772 + }, + { + "epoch": 0.07, + "grad_norm": 1.7650116682052612, + "learning_rate": 4.668262653898769e-05, + "loss": 3.4519, + "step": 776 + }, + { + "epoch": 0.07, + "grad_norm": 1.637071132659912, + "learning_rate": 4.6665526675786596e-05, + "loss": 3.3723, + "step": 780 + }, + { + "epoch": 0.07, + "grad_norm": 1.4582788944244385, + "learning_rate": 4.6648426812585496e-05, + "loss": 3.4839, + "step": 784 + }, + { + "epoch": 0.07, + "grad_norm": 1.4654922485351562, + "learning_rate": 4.663132694938441e-05, + "loss": 3.1257, + "step": 788 + }, + { + "epoch": 0.07, + "grad_norm": 1.4668978452682495, + "learning_rate": 4.661422708618332e-05, + "loss": 3.2506, + "step": 792 + }, + { + "epoch": 0.07, + "grad_norm": 1.5190536975860596, + "learning_rate": 4.659712722298222e-05, + "loss": 3.4055, + "step": 796 + }, + { + "epoch": 0.07, + "grad_norm": 1.9057687520980835, + "learning_rate": 4.6580027359781126e-05, + "loss": 3.4804, + "step": 800 + }, + { + "epoch": 0.07, + "grad_norm": 1.4573813676834106, + "learning_rate": 4.6562927496580026e-05, + "loss": 3.3792, + "step": 804 + }, + { + "epoch": 0.07, + "grad_norm": 2.3757171630859375, + "learning_rate": 4.6545827633378933e-05, + "loss": 3.4649, + "step": 808 + }, + { + "epoch": 0.07, + "grad_norm": 1.7024015188217163, + "learning_rate": 4.652872777017784e-05, + "loss": 3.3149, + "step": 812 + }, + { + "epoch": 0.07, + "grad_norm": 1.5424153804779053, + "learning_rate": 4.651162790697675e-05, + "loss": 3.3, + "step": 816 + }, + { + "epoch": 0.07, + "grad_norm": 1.4667370319366455, + "learning_rate": 4.6494528043775655e-05, + "loss": 3.2845, + "step": 820 + }, + { + "epoch": 0.07, + "grad_norm": 1.5216193199157715, + "learning_rate": 4.6477428180574556e-05, + "loss": 3.3062, + "step": 824 + }, + { + "epoch": 0.07, + "grad_norm": 1.5499827861785889, + "learning_rate": 4.6460328317373463e-05, + "loss": 3.263, + "step": 828 + }, + { + "epoch": 0.07, + "grad_norm": 1.644148588180542, + "learning_rate": 4.6443228454172364e-05, + "loss": 3.4374, + "step": 832 + }, + { + "epoch": 0.07, + "grad_norm": 1.6420782804489136, + "learning_rate": 4.642612859097127e-05, + "loss": 3.4049, + "step": 836 + }, + { + "epoch": 0.07, + "grad_norm": 1.4325385093688965, + "learning_rate": 4.640902872777018e-05, + "loss": 3.3499, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 1.6447879076004028, + "learning_rate": 4.6391928864569086e-05, + "loss": 3.4098, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 1.4675137996673584, + "learning_rate": 4.637482900136799e-05, + "loss": 3.2778, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 1.5847750902175903, + "learning_rate": 4.6357729138166894e-05, + "loss": 3.1695, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 1.514772653579712, + "learning_rate": 4.63406292749658e-05, + "loss": 3.1556, + "step": 856 + }, + { + "epoch": 0.07, + "grad_norm": 1.511367917060852, + "learning_rate": 4.632352941176471e-05, + "loss": 3.3307, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 1.7640196084976196, + "learning_rate": 4.630642954856361e-05, + "loss": 3.2985, + "step": 864 + }, + { + "epoch": 0.07, + "grad_norm": 1.7074265480041504, + "learning_rate": 4.628932968536252e-05, + "loss": 3.2598, + "step": 868 + }, + { + "epoch": 0.07, + "grad_norm": 1.52577805519104, + "learning_rate": 4.6272229822161424e-05, + "loss": 3.0872, + "step": 872 + }, + { + "epoch": 0.07, + "grad_norm": 1.6908501386642456, + "learning_rate": 4.625512995896033e-05, + "loss": 3.1931, + "step": 876 + }, + { + "epoch": 0.08, + "grad_norm": 1.6583012342453003, + "learning_rate": 4.623803009575924e-05, + "loss": 3.1849, + "step": 880 + }, + { + "epoch": 0.08, + "grad_norm": 1.4067027568817139, + "learning_rate": 4.622093023255814e-05, + "loss": 3.1912, + "step": 884 + }, + { + "epoch": 0.08, + "grad_norm": 1.388487458229065, + "learning_rate": 4.6203830369357046e-05, + "loss": 3.2845, + "step": 888 + }, + { + "epoch": 0.08, + "grad_norm": 1.7910948991775513, + "learning_rate": 4.6186730506155954e-05, + "loss": 3.1146, + "step": 892 + }, + { + "epoch": 0.08, + "grad_norm": 1.4454721212387085, + "learning_rate": 4.616963064295486e-05, + "loss": 3.2114, + "step": 896 + }, + { + "epoch": 0.08, + "grad_norm": 1.4809476137161255, + "learning_rate": 4.615253077975377e-05, + "loss": 3.3329, + "step": 900 + }, + { + "epoch": 0.08, + "grad_norm": 1.6668082475662231, + "learning_rate": 4.613543091655267e-05, + "loss": 3.2517, + "step": 904 + }, + { + "epoch": 0.08, + "grad_norm": 1.72063410282135, + "learning_rate": 4.6118331053351576e-05, + "loss": 3.0895, + "step": 908 + }, + { + "epoch": 0.08, + "grad_norm": 1.5995526313781738, + "learning_rate": 4.610123119015048e-05, + "loss": 3.2703, + "step": 912 + }, + { + "epoch": 0.08, + "grad_norm": 1.453747034072876, + "learning_rate": 4.6084131326949384e-05, + "loss": 3.3619, + "step": 916 + }, + { + "epoch": 0.08, + "grad_norm": 1.7982878684997559, + "learning_rate": 4.606703146374829e-05, + "loss": 3.2657, + "step": 920 + }, + { + "epoch": 0.08, + "grad_norm": 1.3937926292419434, + "learning_rate": 4.60499316005472e-05, + "loss": 3.1727, + "step": 924 + }, + { + "epoch": 0.08, + "grad_norm": 1.5705368518829346, + "learning_rate": 4.6032831737346106e-05, + "loss": 3.0657, + "step": 928 + }, + { + "epoch": 0.08, + "grad_norm": 1.6821500062942505, + "learning_rate": 4.601573187414501e-05, + "loss": 3.3191, + "step": 932 + }, + { + "epoch": 0.08, + "grad_norm": 1.5288244485855103, + "learning_rate": 4.5998632010943914e-05, + "loss": 3.0876, + "step": 936 + }, + { + "epoch": 0.08, + "grad_norm": 1.7748781442642212, + "learning_rate": 4.598153214774282e-05, + "loss": 3.18, + "step": 940 + }, + { + "epoch": 0.08, + "grad_norm": 1.7024669647216797, + "learning_rate": 4.596443228454172e-05, + "loss": 3.3601, + "step": 944 + }, + { + "epoch": 0.08, + "grad_norm": 1.7017521858215332, + "learning_rate": 4.5947332421340636e-05, + "loss": 2.9751, + "step": 948 + }, + { + "epoch": 0.08, + "grad_norm": 1.7405025959014893, + "learning_rate": 4.593023255813954e-05, + "loss": 3.2073, + "step": 952 + }, + { + "epoch": 0.08, + "grad_norm": 1.6610547304153442, + "learning_rate": 4.5913132694938444e-05, + "loss": 3.2363, + "step": 956 + }, + { + "epoch": 0.08, + "grad_norm": 1.4785616397857666, + "learning_rate": 4.5896032831737345e-05, + "loss": 3.3254, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 3.453533411026001, + "learning_rate": 4.587893296853625e-05, + "loss": 3.21, + "step": 964 + }, + { + "epoch": 0.08, + "grad_norm": 1.4303959608078003, + "learning_rate": 4.586183310533516e-05, + "loss": 3.3226, + "step": 968 + }, + { + "epoch": 0.08, + "grad_norm": 1.6272943019866943, + "learning_rate": 4.584473324213407e-05, + "loss": 3.5944, + "step": 972 + }, + { + "epoch": 0.08, + "grad_norm": 1.823716402053833, + "learning_rate": 4.5827633378932974e-05, + "loss": 3.234, + "step": 976 + }, + { + "epoch": 0.08, + "grad_norm": 1.7006231546401978, + "learning_rate": 4.5810533515731875e-05, + "loss": 3.0123, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 1.3645117282867432, + "learning_rate": 4.579343365253078e-05, + "loss": 3.2243, + "step": 984 + }, + { + "epoch": 0.08, + "grad_norm": 1.7724872827529907, + "learning_rate": 4.577633378932969e-05, + "loss": 3.1705, + "step": 988 + }, + { + "epoch": 0.08, + "grad_norm": 1.6377599239349365, + "learning_rate": 4.575923392612859e-05, + "loss": 3.2647, + "step": 992 + }, + { + "epoch": 0.09, + "grad_norm": 1.4828346967697144, + "learning_rate": 4.57421340629275e-05, + "loss": 3.0613, + "step": 996 + }, + { + "epoch": 0.09, + "grad_norm": 1.515553593635559, + "learning_rate": 4.5725034199726405e-05, + "loss": 3.0368, + "step": 1000 + }, + { + "epoch": 0.09, + "grad_norm": 1.498844861984253, + "learning_rate": 4.570793433652531e-05, + "loss": 3.102, + "step": 1004 + }, + { + "epoch": 0.09, + "grad_norm": 1.4339860677719116, + "learning_rate": 4.569083447332422e-05, + "loss": 3.1874, + "step": 1008 + }, + { + "epoch": 0.09, + "grad_norm": 1.5611178874969482, + "learning_rate": 4.567373461012312e-05, + "loss": 3.075, + "step": 1012 + }, + { + "epoch": 0.09, + "grad_norm": 1.6089402437210083, + "learning_rate": 4.565663474692203e-05, + "loss": 3.1544, + "step": 1016 + }, + { + "epoch": 0.09, + "grad_norm": 1.8874709606170654, + "learning_rate": 4.563953488372093e-05, + "loss": 3.2806, + "step": 1020 + }, + { + "epoch": 0.09, + "grad_norm": 1.5624805688858032, + "learning_rate": 4.5622435020519835e-05, + "loss": 3.1442, + "step": 1024 + }, + { + "epoch": 0.09, + "grad_norm": 1.968634009361267, + "learning_rate": 4.560533515731875e-05, + "loss": 3.2743, + "step": 1028 + }, + { + "epoch": 0.09, + "grad_norm": 1.4064279794692993, + "learning_rate": 4.558823529411765e-05, + "loss": 3.0726, + "step": 1032 + }, + { + "epoch": 0.09, + "grad_norm": 1.577048897743225, + "learning_rate": 4.557113543091656e-05, + "loss": 3.1129, + "step": 1036 + }, + { + "epoch": 0.09, + "grad_norm": 1.548323392868042, + "learning_rate": 4.555403556771546e-05, + "loss": 3.144, + "step": 1040 + }, + { + "epoch": 0.09, + "grad_norm": 1.5542961359024048, + "learning_rate": 4.5536935704514365e-05, + "loss": 3.182, + "step": 1044 + }, + { + "epoch": 0.09, + "grad_norm": 1.5506356954574585, + "learning_rate": 4.551983584131327e-05, + "loss": 3.2186, + "step": 1048 + }, + { + "epoch": 0.09, + "grad_norm": 1.6962274312973022, + "learning_rate": 4.550273597811218e-05, + "loss": 3.0385, + "step": 1052 + }, + { + "epoch": 0.09, + "grad_norm": 1.4382833242416382, + "learning_rate": 4.548563611491109e-05, + "loss": 3.1268, + "step": 1056 + }, + { + "epoch": 0.09, + "grad_norm": 2.269392728805542, + "learning_rate": 4.546853625170999e-05, + "loss": 3.2579, + "step": 1060 + }, + { + "epoch": 0.09, + "grad_norm": 1.5414201021194458, + "learning_rate": 4.5451436388508895e-05, + "loss": 3.1501, + "step": 1064 + }, + { + "epoch": 0.09, + "grad_norm": 1.936946153640747, + "learning_rate": 4.54343365253078e-05, + "loss": 3.1711, + "step": 1068 + }, + { + "epoch": 0.09, + "grad_norm": 1.7430529594421387, + "learning_rate": 4.54172366621067e-05, + "loss": 3.1299, + "step": 1072 + }, + { + "epoch": 0.09, + "grad_norm": 1.3500404357910156, + "learning_rate": 4.540013679890561e-05, + "loss": 3.0935, + "step": 1076 + }, + { + "epoch": 0.09, + "grad_norm": 1.5768132209777832, + "learning_rate": 4.538303693570452e-05, + "loss": 3.1091, + "step": 1080 + }, + { + "epoch": 0.09, + "grad_norm": 1.4829493761062622, + "learning_rate": 4.5365937072503425e-05, + "loss": 3.0862, + "step": 1084 + }, + { + "epoch": 0.09, + "grad_norm": 1.5560483932495117, + "learning_rate": 4.5348837209302326e-05, + "loss": 3.1024, + "step": 1088 + }, + { + "epoch": 0.09, + "grad_norm": 1.6295199394226074, + "learning_rate": 4.533173734610123e-05, + "loss": 2.9539, + "step": 1092 + }, + { + "epoch": 0.09, + "grad_norm": 2.1724135875701904, + "learning_rate": 4.531463748290014e-05, + "loss": 3.1936, + "step": 1096 + }, + { + "epoch": 0.09, + "grad_norm": 1.7400479316711426, + "learning_rate": 4.529753761969904e-05, + "loss": 3.1873, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 1.554962396621704, + "learning_rate": 4.528043775649795e-05, + "loss": 3.0643, + "step": 1104 + }, + { + "epoch": 0.09, + "grad_norm": 1.3722119331359863, + "learning_rate": 4.5263337893296855e-05, + "loss": 3.0088, + "step": 1108 + }, + { + "epoch": 0.1, + "grad_norm": 1.395577311515808, + "learning_rate": 4.524623803009576e-05, + "loss": 3.0584, + "step": 1112 + }, + { + "epoch": 0.1, + "grad_norm": 1.509710431098938, + "learning_rate": 4.522913816689467e-05, + "loss": 3.2276, + "step": 1116 + }, + { + "epoch": 0.1, + "grad_norm": 1.5213786363601685, + "learning_rate": 4.521203830369357e-05, + "loss": 2.9433, + "step": 1120 + }, + { + "epoch": 0.1, + "grad_norm": 1.51616632938385, + "learning_rate": 4.519493844049248e-05, + "loss": 2.9902, + "step": 1124 + }, + { + "epoch": 0.1, + "grad_norm": 1.5429805517196655, + "learning_rate": 4.517783857729138e-05, + "loss": 3.0884, + "step": 1128 + }, + { + "epoch": 0.1, + "grad_norm": 1.4714343547821045, + "learning_rate": 4.5160738714090286e-05, + "loss": 3.0322, + "step": 1132 + }, + { + "epoch": 0.1, + "grad_norm": 1.4637647867202759, + "learning_rate": 4.51436388508892e-05, + "loss": 3.0352, + "step": 1136 + }, + { + "epoch": 0.1, + "grad_norm": 1.3424347639083862, + "learning_rate": 4.51265389876881e-05, + "loss": 3.0866, + "step": 1140 + }, + { + "epoch": 0.1, + "grad_norm": 1.689968228340149, + "learning_rate": 4.510943912448701e-05, + "loss": 3.0002, + "step": 1144 + }, + { + "epoch": 0.1, + "grad_norm": 1.9935293197631836, + "learning_rate": 4.509233926128591e-05, + "loss": 2.9518, + "step": 1148 + }, + { + "epoch": 0.1, + "grad_norm": 1.5461673736572266, + "learning_rate": 4.5075239398084816e-05, + "loss": 3.0394, + "step": 1152 + }, + { + "epoch": 0.1, + "grad_norm": 1.5787737369537354, + "learning_rate": 4.505813953488372e-05, + "loss": 3.0934, + "step": 1156 + }, + { + "epoch": 0.1, + "grad_norm": 1.4527121782302856, + "learning_rate": 4.504103967168263e-05, + "loss": 2.9734, + "step": 1160 + }, + { + "epoch": 0.1, + "grad_norm": 1.6236604452133179, + "learning_rate": 4.502393980848154e-05, + "loss": 3.021, + "step": 1164 + }, + { + "epoch": 0.1, + "grad_norm": 1.5616329908370972, + "learning_rate": 4.500683994528044e-05, + "loss": 3.0602, + "step": 1168 + }, + { + "epoch": 0.1, + "grad_norm": 1.6745967864990234, + "learning_rate": 4.4989740082079346e-05, + "loss": 3.2712, + "step": 1172 + }, + { + "epoch": 0.1, + "grad_norm": 1.5813934803009033, + "learning_rate": 4.497264021887825e-05, + "loss": 3.2255, + "step": 1176 + }, + { + "epoch": 0.1, + "grad_norm": 1.5584092140197754, + "learning_rate": 4.4955540355677154e-05, + "loss": 3.082, + "step": 1180 + }, + { + "epoch": 0.1, + "grad_norm": 1.5441848039627075, + "learning_rate": 4.493844049247606e-05, + "loss": 2.978, + "step": 1184 + }, + { + "epoch": 0.1, + "grad_norm": 1.6514015197753906, + "learning_rate": 4.492134062927497e-05, + "loss": 3.2604, + "step": 1188 + }, + { + "epoch": 0.1, + "grad_norm": 1.4784066677093506, + "learning_rate": 4.4904240766073876e-05, + "loss": 3.0761, + "step": 1192 + }, + { + "epoch": 0.1, + "grad_norm": 1.493974208831787, + "learning_rate": 4.488714090287278e-05, + "loss": 3.1323, + "step": 1196 + }, + { + "epoch": 0.1, + "grad_norm": 1.4246760606765747, + "learning_rate": 4.4870041039671684e-05, + "loss": 2.7518, + "step": 1200 + }, + { + "epoch": 0.1, + "grad_norm": 1.6689587831497192, + "learning_rate": 4.485294117647059e-05, + "loss": 3.2381, + "step": 1204 + }, + { + "epoch": 0.1, + "grad_norm": 1.615942120552063, + "learning_rate": 4.483584131326949e-05, + "loss": 3.0886, + "step": 1208 + }, + { + "epoch": 0.1, + "grad_norm": 1.3292858600616455, + "learning_rate": 4.48187414500684e-05, + "loss": 3.0968, + "step": 1212 + }, + { + "epoch": 0.1, + "grad_norm": 1.7597836256027222, + "learning_rate": 4.4801641586867306e-05, + "loss": 3.0037, + "step": 1216 + }, + { + "epoch": 0.1, + "grad_norm": 1.7249919176101685, + "learning_rate": 4.4784541723666214e-05, + "loss": 2.9228, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 1.630422830581665, + "learning_rate": 4.476744186046512e-05, + "loss": 3.0722, + "step": 1224 + }, + { + "epoch": 0.1, + "grad_norm": 1.4283276796340942, + "learning_rate": 4.475034199726402e-05, + "loss": 2.8726, + "step": 1228 + }, + { + "epoch": 0.11, + "grad_norm": 1.786851167678833, + "learning_rate": 4.473324213406293e-05, + "loss": 3.1425, + "step": 1232 + }, + { + "epoch": 0.11, + "grad_norm": 1.669625997543335, + "learning_rate": 4.471614227086183e-05, + "loss": 3.013, + "step": 1236 + }, + { + "epoch": 0.11, + "grad_norm": 1.4979221820831299, + "learning_rate": 4.4699042407660744e-05, + "loss": 3.106, + "step": 1240 + }, + { + "epoch": 0.11, + "grad_norm": 1.67727792263031, + "learning_rate": 4.468194254445965e-05, + "loss": 2.9748, + "step": 1244 + }, + { + "epoch": 0.11, + "grad_norm": 1.3726154565811157, + "learning_rate": 4.466484268125855e-05, + "loss": 3.0766, + "step": 1248 + }, + { + "epoch": 0.11, + "grad_norm": 1.4735255241394043, + "learning_rate": 4.464774281805746e-05, + "loss": 2.9678, + "step": 1252 + }, + { + "epoch": 0.11, + "grad_norm": 1.751224160194397, + "learning_rate": 4.463064295485636e-05, + "loss": 2.923, + "step": 1256 + }, + { + "epoch": 0.11, + "grad_norm": 1.4682483673095703, + "learning_rate": 4.461354309165527e-05, + "loss": 2.9822, + "step": 1260 + }, + { + "epoch": 0.11, + "grad_norm": 1.7135084867477417, + "learning_rate": 4.4596443228454174e-05, + "loss": 2.9087, + "step": 1264 + }, + { + "epoch": 0.11, + "grad_norm": 1.6292390823364258, + "learning_rate": 4.457934336525308e-05, + "loss": 2.9948, + "step": 1268 + }, + { + "epoch": 0.11, + "grad_norm": 1.3494658470153809, + "learning_rate": 4.456224350205199e-05, + "loss": 2.8939, + "step": 1272 + }, + { + "epoch": 0.11, + "grad_norm": 1.584025263786316, + "learning_rate": 4.454514363885089e-05, + "loss": 2.9289, + "step": 1276 + }, + { + "epoch": 0.11, + "grad_norm": 1.7876780033111572, + "learning_rate": 4.45280437756498e-05, + "loss": 3.0326, + "step": 1280 + }, + { + "epoch": 0.11, + "grad_norm": 1.6452478170394897, + "learning_rate": 4.4510943912448704e-05, + "loss": 3.2085, + "step": 1284 + }, + { + "epoch": 0.11, + "grad_norm": 1.733777642250061, + "learning_rate": 4.4493844049247605e-05, + "loss": 2.8625, + "step": 1288 + }, + { + "epoch": 0.11, + "grad_norm": 1.5891138315200806, + "learning_rate": 4.447674418604651e-05, + "loss": 2.8191, + "step": 1292 + }, + { + "epoch": 0.11, + "grad_norm": 1.5481853485107422, + "learning_rate": 4.445964432284542e-05, + "loss": 2.9586, + "step": 1296 + }, + { + "epoch": 0.11, + "grad_norm": 1.4756824970245361, + "learning_rate": 4.444254445964433e-05, + "loss": 2.9225, + "step": 1300 + }, + { + "epoch": 0.11, + "grad_norm": 1.4950698614120483, + "learning_rate": 4.4425444596443234e-05, + "loss": 2.8898, + "step": 1304 + }, + { + "epoch": 0.11, + "grad_norm": 1.938930869102478, + "learning_rate": 4.4408344733242135e-05, + "loss": 3.0966, + "step": 1308 + }, + { + "epoch": 0.11, + "grad_norm": 1.578041672706604, + "learning_rate": 4.439124487004104e-05, + "loss": 2.9547, + "step": 1312 + }, + { + "epoch": 0.11, + "grad_norm": 1.5932279825210571, + "learning_rate": 4.437414500683994e-05, + "loss": 3.0289, + "step": 1316 + }, + { + "epoch": 0.11, + "grad_norm": 1.5020122528076172, + "learning_rate": 4.435704514363886e-05, + "loss": 3.0404, + "step": 1320 + }, + { + "epoch": 0.11, + "grad_norm": 1.6642231941223145, + "learning_rate": 4.4339945280437764e-05, + "loss": 3.0883, + "step": 1324 + }, + { + "epoch": 0.11, + "grad_norm": 1.6281254291534424, + "learning_rate": 4.4322845417236665e-05, + "loss": 2.8135, + "step": 1328 + }, + { + "epoch": 0.11, + "grad_norm": 1.4702434539794922, + "learning_rate": 4.430574555403557e-05, + "loss": 2.8406, + "step": 1332 + }, + { + "epoch": 0.11, + "grad_norm": 2.073446750640869, + "learning_rate": 4.428864569083447e-05, + "loss": 3.0211, + "step": 1336 + }, + { + "epoch": 0.11, + "grad_norm": 1.543094515800476, + "learning_rate": 4.427154582763338e-05, + "loss": 2.8459, + "step": 1340 + }, + { + "epoch": 0.11, + "grad_norm": 1.5158188343048096, + "learning_rate": 4.425444596443229e-05, + "loss": 2.9384, + "step": 1344 + }, + { + "epoch": 0.12, + "grad_norm": 1.6903549432754517, + "learning_rate": 4.4237346101231194e-05, + "loss": 3.0647, + "step": 1348 + }, + { + "epoch": 0.12, + "grad_norm": 1.6733980178833008, + "learning_rate": 4.42202462380301e-05, + "loss": 2.8504, + "step": 1352 + }, + { + "epoch": 0.12, + "grad_norm": 1.7396080493927002, + "learning_rate": 4.4203146374829e-05, + "loss": 3.1324, + "step": 1356 + }, + { + "epoch": 0.12, + "grad_norm": 1.6228291988372803, + "learning_rate": 4.418604651162791e-05, + "loss": 2.9918, + "step": 1360 + }, + { + "epoch": 0.12, + "grad_norm": 1.5741888284683228, + "learning_rate": 4.416894664842681e-05, + "loss": 3.0242, + "step": 1364 + }, + { + "epoch": 0.12, + "grad_norm": 1.4788377285003662, + "learning_rate": 4.415184678522572e-05, + "loss": 2.7948, + "step": 1368 + }, + { + "epoch": 0.12, + "grad_norm": 1.4203468561172485, + "learning_rate": 4.4134746922024625e-05, + "loss": 3.0096, + "step": 1372 + }, + { + "epoch": 0.12, + "grad_norm": 1.4492088556289673, + "learning_rate": 4.411764705882353e-05, + "loss": 2.9375, + "step": 1376 + }, + { + "epoch": 0.12, + "grad_norm": 1.4139641523361206, + "learning_rate": 4.410054719562244e-05, + "loss": 2.7041, + "step": 1380 + }, + { + "epoch": 0.12, + "grad_norm": 1.5051127672195435, + "learning_rate": 4.408344733242134e-05, + "loss": 2.9992, + "step": 1384 + }, + { + "epoch": 0.12, + "grad_norm": 1.6225379705429077, + "learning_rate": 4.406634746922025e-05, + "loss": 2.9139, + "step": 1388 + }, + { + "epoch": 0.12, + "grad_norm": 2.4177663326263428, + "learning_rate": 4.4049247606019155e-05, + "loss": 3.1328, + "step": 1392 + }, + { + "epoch": 0.12, + "grad_norm": 1.342938780784607, + "learning_rate": 4.4032147742818055e-05, + "loss": 2.7651, + "step": 1396 + }, + { + "epoch": 0.12, + "grad_norm": 1.5941474437713623, + "learning_rate": 4.401504787961697e-05, + "loss": 3.0623, + "step": 1400 + }, + { + "epoch": 0.12, + "grad_norm": 1.9736019372940063, + "learning_rate": 4.399794801641587e-05, + "loss": 2.9231, + "step": 1404 + }, + { + "epoch": 0.12, + "grad_norm": 1.7911643981933594, + "learning_rate": 4.398084815321478e-05, + "loss": 2.9161, + "step": 1408 + }, + { + "epoch": 0.12, + "grad_norm": 1.585992693901062, + "learning_rate": 4.3963748290013685e-05, + "loss": 3.0079, + "step": 1412 + }, + { + "epoch": 0.12, + "grad_norm": 1.537345290184021, + "learning_rate": 4.3946648426812585e-05, + "loss": 2.9238, + "step": 1416 + }, + { + "epoch": 0.12, + "grad_norm": 1.5920599699020386, + "learning_rate": 4.392954856361149e-05, + "loss": 2.8771, + "step": 1420 + }, + { + "epoch": 0.12, + "grad_norm": 1.606899619102478, + "learning_rate": 4.39124487004104e-05, + "loss": 2.7501, + "step": 1424 + }, + { + "epoch": 0.12, + "grad_norm": 1.5318756103515625, + "learning_rate": 4.389534883720931e-05, + "loss": 2.8197, + "step": 1428 + }, + { + "epoch": 0.12, + "grad_norm": 1.987911581993103, + "learning_rate": 4.3878248974008215e-05, + "loss": 2.971, + "step": 1432 + }, + { + "epoch": 0.12, + "grad_norm": 1.5010288953781128, + "learning_rate": 4.3861149110807115e-05, + "loss": 2.9632, + "step": 1436 + }, + { + "epoch": 0.12, + "grad_norm": 1.488486647605896, + "learning_rate": 4.384404924760602e-05, + "loss": 3.0563, + "step": 1440 + }, + { + "epoch": 0.12, + "grad_norm": 1.7354810237884521, + "learning_rate": 4.382694938440492e-05, + "loss": 2.866, + "step": 1444 + }, + { + "epoch": 0.12, + "grad_norm": 1.509134292602539, + "learning_rate": 4.380984952120383e-05, + "loss": 2.8723, + "step": 1448 + }, + { + "epoch": 0.12, + "grad_norm": 1.687049150466919, + "learning_rate": 4.379274965800274e-05, + "loss": 2.9496, + "step": 1452 + }, + { + "epoch": 0.12, + "grad_norm": 1.5449968576431274, + "learning_rate": 4.3775649794801645e-05, + "loss": 2.884, + "step": 1456 + }, + { + "epoch": 0.12, + "grad_norm": 1.7197214365005493, + "learning_rate": 4.375854993160055e-05, + "loss": 2.8686, + "step": 1460 + }, + { + "epoch": 0.13, + "grad_norm": 1.3042534589767456, + "learning_rate": 4.374145006839945e-05, + "loss": 2.739, + "step": 1464 + }, + { + "epoch": 0.13, + "grad_norm": 1.4646921157836914, + "learning_rate": 4.372435020519836e-05, + "loss": 2.8347, + "step": 1468 + }, + { + "epoch": 0.13, + "grad_norm": 1.350310206413269, + "learning_rate": 4.370725034199726e-05, + "loss": 2.8797, + "step": 1472 + }, + { + "epoch": 0.13, + "grad_norm": 1.5870814323425293, + "learning_rate": 4.369015047879617e-05, + "loss": 3.025, + "step": 1476 + }, + { + "epoch": 0.13, + "grad_norm": 1.5572704076766968, + "learning_rate": 4.3673050615595076e-05, + "loss": 2.8424, + "step": 1480 + }, + { + "epoch": 0.13, + "grad_norm": 1.4381687641143799, + "learning_rate": 4.365595075239398e-05, + "loss": 3.0395, + "step": 1484 + }, + { + "epoch": 0.13, + "grad_norm": 1.7877554893493652, + "learning_rate": 4.363885088919289e-05, + "loss": 2.8723, + "step": 1488 + }, + { + "epoch": 0.13, + "grad_norm": 1.7916138172149658, + "learning_rate": 4.362175102599179e-05, + "loss": 2.8751, + "step": 1492 + }, + { + "epoch": 0.13, + "grad_norm": 1.8024954795837402, + "learning_rate": 4.36046511627907e-05, + "loss": 3.0156, + "step": 1496 + }, + { + "epoch": 0.13, + "grad_norm": 1.4376300573349, + "learning_rate": 4.3587551299589606e-05, + "loss": 2.9152, + "step": 1500 + }, + { + "epoch": 0.13, + "grad_norm": 1.5347250699996948, + "learning_rate": 4.3570451436388506e-05, + "loss": 2.8219, + "step": 1504 + }, + { + "epoch": 0.13, + "grad_norm": 1.7805489301681519, + "learning_rate": 4.355335157318742e-05, + "loss": 2.9276, + "step": 1508 + }, + { + "epoch": 0.13, + "grad_norm": 1.5822547674179077, + "learning_rate": 4.353625170998632e-05, + "loss": 2.9262, + "step": 1512 + }, + { + "epoch": 0.13, + "grad_norm": 1.5745974779129028, + "learning_rate": 4.351915184678523e-05, + "loss": 2.8812, + "step": 1516 + }, + { + "epoch": 0.13, + "grad_norm": 1.5027135610580444, + "learning_rate": 4.3502051983584136e-05, + "loss": 2.7748, + "step": 1520 + }, + { + "epoch": 0.13, + "grad_norm": 1.5027369260787964, + "learning_rate": 4.3484952120383036e-05, + "loss": 2.8619, + "step": 1524 + }, + { + "epoch": 0.13, + "grad_norm": 1.6836864948272705, + "learning_rate": 4.3467852257181944e-05, + "loss": 2.9688, + "step": 1528 + }, + { + "epoch": 0.13, + "grad_norm": 1.682340145111084, + "learning_rate": 4.345075239398085e-05, + "loss": 2.9849, + "step": 1532 + }, + { + "epoch": 0.13, + "grad_norm": 1.3924282789230347, + "learning_rate": 4.343365253077976e-05, + "loss": 2.8154, + "step": 1536 + }, + { + "epoch": 0.13, + "grad_norm": 1.518629789352417, + "learning_rate": 4.3416552667578666e-05, + "loss": 2.8265, + "step": 1540 + }, + { + "epoch": 0.13, + "grad_norm": 1.482877254486084, + "learning_rate": 4.3399452804377566e-05, + "loss": 2.8364, + "step": 1544 + }, + { + "epoch": 0.13, + "grad_norm": 1.52615487575531, + "learning_rate": 4.3382352941176474e-05, + "loss": 2.9488, + "step": 1548 + }, + { + "epoch": 0.13, + "grad_norm": 1.8858604431152344, + "learning_rate": 4.3365253077975374e-05, + "loss": 3.0754, + "step": 1552 + }, + { + "epoch": 0.13, + "grad_norm": 1.3640977144241333, + "learning_rate": 4.334815321477428e-05, + "loss": 2.7192, + "step": 1556 + }, + { + "epoch": 0.13, + "grad_norm": 1.6419317722320557, + "learning_rate": 4.333105335157319e-05, + "loss": 2.8016, + "step": 1560 + }, + { + "epoch": 0.13, + "grad_norm": 1.4059332609176636, + "learning_rate": 4.3313953488372096e-05, + "loss": 2.8559, + "step": 1564 + }, + { + "epoch": 0.13, + "grad_norm": 1.5837764739990234, + "learning_rate": 4.3296853625171004e-05, + "loss": 3.0297, + "step": 1568 + }, + { + "epoch": 0.13, + "grad_norm": 1.8545840978622437, + "learning_rate": 4.3279753761969904e-05, + "loss": 2.9717, + "step": 1572 + }, + { + "epoch": 0.13, + "grad_norm": 1.2715051174163818, + "learning_rate": 4.326265389876881e-05, + "loss": 2.7563, + "step": 1576 + }, + { + "epoch": 0.14, + "grad_norm": 1.742698073387146, + "learning_rate": 4.324555403556772e-05, + "loss": 2.7826, + "step": 1580 + }, + { + "epoch": 0.14, + "grad_norm": 1.7338377237319946, + "learning_rate": 4.322845417236662e-05, + "loss": 2.8006, + "step": 1584 + }, + { + "epoch": 0.14, + "grad_norm": 1.5166951417922974, + "learning_rate": 4.3211354309165533e-05, + "loss": 3.0044, + "step": 1588 + }, + { + "epoch": 0.14, + "grad_norm": 1.5267099142074585, + "learning_rate": 4.3194254445964434e-05, + "loss": 2.817, + "step": 1592 + }, + { + "epoch": 0.14, + "grad_norm": 1.7478032112121582, + "learning_rate": 4.317715458276334e-05, + "loss": 3.0036, + "step": 1596 + }, + { + "epoch": 0.14, + "grad_norm": 1.9281939268112183, + "learning_rate": 4.316005471956224e-05, + "loss": 2.9689, + "step": 1600 + }, + { + "epoch": 0.14, + "grad_norm": 1.5747674703598022, + "learning_rate": 4.314295485636115e-05, + "loss": 2.7691, + "step": 1604 + }, + { + "epoch": 0.14, + "grad_norm": 1.530669093132019, + "learning_rate": 4.312585499316006e-05, + "loss": 2.7754, + "step": 1608 + }, + { + "epoch": 0.14, + "grad_norm": 1.4007419347763062, + "learning_rate": 4.3108755129958964e-05, + "loss": 2.6687, + "step": 1612 + }, + { + "epoch": 0.14, + "grad_norm": 1.7224704027175903, + "learning_rate": 4.309165526675787e-05, + "loss": 3.0238, + "step": 1616 + }, + { + "epoch": 0.14, + "grad_norm": 1.688553810119629, + "learning_rate": 4.307455540355677e-05, + "loss": 2.7985, + "step": 1620 + }, + { + "epoch": 0.14, + "grad_norm": 1.3165500164031982, + "learning_rate": 4.305745554035568e-05, + "loss": 2.8546, + "step": 1624 + }, + { + "epoch": 0.14, + "grad_norm": 1.4722650051116943, + "learning_rate": 4.3040355677154587e-05, + "loss": 2.8455, + "step": 1628 + }, + { + "epoch": 0.14, + "grad_norm": 1.5830330848693848, + "learning_rate": 4.302325581395349e-05, + "loss": 2.709, + "step": 1632 + }, + { + "epoch": 0.14, + "grad_norm": 1.5168038606643677, + "learning_rate": 4.3006155950752394e-05, + "loss": 2.8194, + "step": 1636 + }, + { + "epoch": 0.14, + "grad_norm": 1.527031660079956, + "learning_rate": 4.29890560875513e-05, + "loss": 2.7123, + "step": 1640 + }, + { + "epoch": 0.14, + "grad_norm": 1.492450475692749, + "learning_rate": 4.297195622435021e-05, + "loss": 2.9356, + "step": 1644 + }, + { + "epoch": 0.14, + "grad_norm": 1.4466654062271118, + "learning_rate": 4.2954856361149116e-05, + "loss": 2.8938, + "step": 1648 + }, + { + "epoch": 0.14, + "grad_norm": 1.6020383834838867, + "learning_rate": 4.293775649794802e-05, + "loss": 2.8547, + "step": 1652 + }, + { + "epoch": 0.14, + "grad_norm": 1.5876679420471191, + "learning_rate": 4.2920656634746924e-05, + "loss": 2.7769, + "step": 1656 + }, + { + "epoch": 0.14, + "grad_norm": 1.4787455797195435, + "learning_rate": 4.2903556771545825e-05, + "loss": 2.6641, + "step": 1660 + }, + { + "epoch": 0.14, + "grad_norm": 1.6105437278747559, + "learning_rate": 4.288645690834473e-05, + "loss": 2.775, + "step": 1664 + }, + { + "epoch": 0.14, + "grad_norm": 1.5835809707641602, + "learning_rate": 4.2869357045143646e-05, + "loss": 3.019, + "step": 1668 + }, + { + "epoch": 0.14, + "grad_norm": 2.103267192840576, + "learning_rate": 4.285225718194255e-05, + "loss": 2.828, + "step": 1672 + }, + { + "epoch": 0.14, + "grad_norm": 1.5641860961914062, + "learning_rate": 4.2835157318741454e-05, + "loss": 2.8392, + "step": 1676 + }, + { + "epoch": 0.14, + "grad_norm": 1.475889801979065, + "learning_rate": 4.2818057455540355e-05, + "loss": 2.9558, + "step": 1680 + }, + { + "epoch": 0.14, + "grad_norm": 1.3875279426574707, + "learning_rate": 4.280095759233926e-05, + "loss": 2.7178, + "step": 1684 + }, + { + "epoch": 0.14, + "grad_norm": 1.6096158027648926, + "learning_rate": 4.278385772913817e-05, + "loss": 2.7963, + "step": 1688 + }, + { + "epoch": 0.14, + "grad_norm": 1.4312164783477783, + "learning_rate": 4.276675786593708e-05, + "loss": 2.7887, + "step": 1692 + }, + { + "epoch": 0.15, + "grad_norm": 2.0804941654205322, + "learning_rate": 4.2749658002735984e-05, + "loss": 2.9118, + "step": 1696 + }, + { + "epoch": 0.15, + "grad_norm": 1.606184482574463, + "learning_rate": 4.2732558139534885e-05, + "loss": 2.7951, + "step": 1700 + }, + { + "epoch": 0.15, + "grad_norm": 1.4609088897705078, + "learning_rate": 4.271545827633379e-05, + "loss": 2.8464, + "step": 1704 + }, + { + "epoch": 0.15, + "grad_norm": 1.3830453157424927, + "learning_rate": 4.26983584131327e-05, + "loss": 2.9156, + "step": 1708 + }, + { + "epoch": 0.15, + "grad_norm": 1.4420268535614014, + "learning_rate": 4.26812585499316e-05, + "loss": 2.5969, + "step": 1712 + }, + { + "epoch": 0.15, + "grad_norm": 1.4451963901519775, + "learning_rate": 4.266415868673051e-05, + "loss": 2.9125, + "step": 1716 + }, + { + "epoch": 0.15, + "grad_norm": 1.5134412050247192, + "learning_rate": 4.2647058823529415e-05, + "loss": 2.6605, + "step": 1720 + }, + { + "epoch": 0.15, + "grad_norm": 1.561641812324524, + "learning_rate": 4.262995896032832e-05, + "loss": 2.8285, + "step": 1724 + }, + { + "epoch": 0.15, + "grad_norm": 1.4210346937179565, + "learning_rate": 4.261285909712722e-05, + "loss": 2.7912, + "step": 1728 + }, + { + "epoch": 0.15, + "grad_norm": 1.4329392910003662, + "learning_rate": 4.259575923392613e-05, + "loss": 2.8862, + "step": 1732 + }, + { + "epoch": 0.15, + "grad_norm": 1.6114985942840576, + "learning_rate": 4.257865937072504e-05, + "loss": 2.7819, + "step": 1736 + }, + { + "epoch": 0.15, + "grad_norm": 1.6259700059890747, + "learning_rate": 4.256155950752394e-05, + "loss": 2.9296, + "step": 1740 + }, + { + "epoch": 0.15, + "grad_norm": 1.523130178451538, + "learning_rate": 4.2544459644322845e-05, + "loss": 2.7139, + "step": 1744 + }, + { + "epoch": 0.15, + "grad_norm": 1.7980687618255615, + "learning_rate": 4.252735978112175e-05, + "loss": 2.7185, + "step": 1748 + }, + { + "epoch": 0.15, + "grad_norm": 1.73753821849823, + "learning_rate": 4.251025991792066e-05, + "loss": 2.7322, + "step": 1752 + }, + { + "epoch": 0.15, + "grad_norm": 1.7766021490097046, + "learning_rate": 4.249316005471957e-05, + "loss": 2.5836, + "step": 1756 + }, + { + "epoch": 0.15, + "grad_norm": 1.4891935586929321, + "learning_rate": 4.247606019151847e-05, + "loss": 2.931, + "step": 1760 + }, + { + "epoch": 0.15, + "grad_norm": 1.4233758449554443, + "learning_rate": 4.2458960328317375e-05, + "loss": 2.9084, + "step": 1764 + }, + { + "epoch": 0.15, + "grad_norm": 1.5510305166244507, + "learning_rate": 4.2441860465116276e-05, + "loss": 2.8017, + "step": 1768 + }, + { + "epoch": 0.15, + "grad_norm": 1.3143435716629028, + "learning_rate": 4.242476060191519e-05, + "loss": 2.7212, + "step": 1772 + }, + { + "epoch": 0.15, + "grad_norm": 1.5766334533691406, + "learning_rate": 4.24076607387141e-05, + "loss": 2.7551, + "step": 1776 + }, + { + "epoch": 0.15, + "grad_norm": 1.551622986793518, + "learning_rate": 4.2390560875513e-05, + "loss": 2.8498, + "step": 1780 + }, + { + "epoch": 0.15, + "grad_norm": 2.2195143699645996, + "learning_rate": 4.2373461012311905e-05, + "loss": 2.811, + "step": 1784 + }, + { + "epoch": 0.15, + "grad_norm": 1.530758023262024, + "learning_rate": 4.2356361149110806e-05, + "loss": 2.772, + "step": 1788 + }, + { + "epoch": 0.15, + "grad_norm": 1.5073307752609253, + "learning_rate": 4.233926128590971e-05, + "loss": 2.7401, + "step": 1792 + }, + { + "epoch": 0.15, + "grad_norm": 1.6950387954711914, + "learning_rate": 4.232216142270862e-05, + "loss": 2.7554, + "step": 1796 + }, + { + "epoch": 0.15, + "grad_norm": 1.389538288116455, + "learning_rate": 4.230506155950753e-05, + "loss": 2.8187, + "step": 1800 + }, + { + "epoch": 0.15, + "grad_norm": 1.592817783355713, + "learning_rate": 4.2287961696306435e-05, + "loss": 2.8686, + "step": 1804 + }, + { + "epoch": 0.15, + "grad_norm": 1.6047254800796509, + "learning_rate": 4.2270861833105336e-05, + "loss": 2.8033, + "step": 1808 + }, + { + "epoch": 0.15, + "grad_norm": 1.4730092287063599, + "learning_rate": 4.225376196990424e-05, + "loss": 2.6616, + "step": 1812 + }, + { + "epoch": 0.16, + "grad_norm": 1.4784797430038452, + "learning_rate": 4.223666210670315e-05, + "loss": 2.8108, + "step": 1816 + }, + { + "epoch": 0.16, + "grad_norm": 1.6881104707717896, + "learning_rate": 4.221956224350205e-05, + "loss": 2.8155, + "step": 1820 + }, + { + "epoch": 0.16, + "grad_norm": 1.4548227787017822, + "learning_rate": 4.220246238030096e-05, + "loss": 2.7195, + "step": 1824 + }, + { + "epoch": 0.16, + "grad_norm": 1.5578104257583618, + "learning_rate": 4.2185362517099866e-05, + "loss": 2.8058, + "step": 1828 + }, + { + "epoch": 0.16, + "grad_norm": 1.6127967834472656, + "learning_rate": 4.216826265389877e-05, + "loss": 2.8783, + "step": 1832 + }, + { + "epoch": 0.16, + "grad_norm": 1.883389949798584, + "learning_rate": 4.215116279069768e-05, + "loss": 2.6328, + "step": 1836 + }, + { + "epoch": 0.16, + "grad_norm": 1.6216742992401123, + "learning_rate": 4.213406292749658e-05, + "loss": 2.8021, + "step": 1840 + }, + { + "epoch": 0.16, + "grad_norm": 1.7198604345321655, + "learning_rate": 4.211696306429549e-05, + "loss": 2.6467, + "step": 1844 + }, + { + "epoch": 0.16, + "grad_norm": 1.3806415796279907, + "learning_rate": 4.209986320109439e-05, + "loss": 2.8337, + "step": 1848 + }, + { + "epoch": 0.16, + "grad_norm": 1.9981590509414673, + "learning_rate": 4.2082763337893296e-05, + "loss": 2.7637, + "step": 1852 + }, + { + "epoch": 0.16, + "grad_norm": 1.4233486652374268, + "learning_rate": 4.2065663474692204e-05, + "loss": 2.6964, + "step": 1856 + }, + { + "epoch": 0.16, + "grad_norm": 1.4599206447601318, + "learning_rate": 4.204856361149111e-05, + "loss": 2.8918, + "step": 1860 + }, + { + "epoch": 0.16, + "grad_norm": 1.6856427192687988, + "learning_rate": 4.203146374829002e-05, + "loss": 2.7184, + "step": 1864 + }, + { + "epoch": 0.16, + "grad_norm": 1.5229926109313965, + "learning_rate": 4.201436388508892e-05, + "loss": 2.7457, + "step": 1868 + }, + { + "epoch": 0.16, + "grad_norm": 1.8297462463378906, + "learning_rate": 4.1997264021887826e-05, + "loss": 2.8097, + "step": 1872 + }, + { + "epoch": 0.16, + "grad_norm": 1.4381957054138184, + "learning_rate": 4.198016415868673e-05, + "loss": 2.6845, + "step": 1876 + }, + { + "epoch": 0.16, + "grad_norm": 1.8839136362075806, + "learning_rate": 4.196306429548564e-05, + "loss": 2.7721, + "step": 1880 + }, + { + "epoch": 0.16, + "grad_norm": 1.6147316694259644, + "learning_rate": 4.194596443228455e-05, + "loss": 2.7529, + "step": 1884 + }, + { + "epoch": 0.16, + "grad_norm": 1.4473137855529785, + "learning_rate": 4.192886456908345e-05, + "loss": 2.7582, + "step": 1888 + }, + { + "epoch": 0.16, + "grad_norm": 1.6229139566421509, + "learning_rate": 4.1911764705882356e-05, + "loss": 2.7579, + "step": 1892 + }, + { + "epoch": 0.16, + "grad_norm": 1.669582724571228, + "learning_rate": 4.189466484268126e-05, + "loss": 2.7924, + "step": 1896 + }, + { + "epoch": 0.16, + "grad_norm": 1.4859235286712646, + "learning_rate": 4.1877564979480164e-05, + "loss": 2.6415, + "step": 1900 + }, + { + "epoch": 0.16, + "grad_norm": 1.4661961793899536, + "learning_rate": 4.186046511627907e-05, + "loss": 2.5649, + "step": 1904 + }, + { + "epoch": 0.16, + "grad_norm": 1.4079793691635132, + "learning_rate": 4.184336525307798e-05, + "loss": 2.6718, + "step": 1908 + }, + { + "epoch": 0.16, + "grad_norm": 1.7637715339660645, + "learning_rate": 4.1826265389876886e-05, + "loss": 2.7626, + "step": 1912 + }, + { + "epoch": 0.16, + "grad_norm": 1.4910571575164795, + "learning_rate": 4.1809165526675787e-05, + "loss": 2.683, + "step": 1916 + }, + { + "epoch": 0.16, + "grad_norm": 1.709197759628296, + "learning_rate": 4.1792065663474694e-05, + "loss": 2.8062, + "step": 1920 + }, + { + "epoch": 0.16, + "grad_norm": 1.8040099143981934, + "learning_rate": 4.17749658002736e-05, + "loss": 2.8158, + "step": 1924 + }, + { + "epoch": 0.16, + "grad_norm": 1.4271622896194458, + "learning_rate": 4.17578659370725e-05, + "loss": 2.8655, + "step": 1928 + }, + { + "epoch": 0.17, + "grad_norm": 1.6832592487335205, + "learning_rate": 4.174076607387141e-05, + "loss": 2.9054, + "step": 1932 + }, + { + "epoch": 0.17, + "grad_norm": 1.674769639968872, + "learning_rate": 4.1723666210670316e-05, + "loss": 2.7739, + "step": 1936 + }, + { + "epoch": 0.17, + "grad_norm": 1.3221181631088257, + "learning_rate": 4.1706566347469224e-05, + "loss": 2.7352, + "step": 1940 + }, + { + "epoch": 0.17, + "grad_norm": 1.6557152271270752, + "learning_rate": 4.168946648426813e-05, + "loss": 2.7729, + "step": 1944 + }, + { + "epoch": 0.17, + "grad_norm": 1.5999726057052612, + "learning_rate": 4.167236662106703e-05, + "loss": 2.8399, + "step": 1948 + }, + { + "epoch": 0.17, + "grad_norm": 1.5852841138839722, + "learning_rate": 4.165526675786594e-05, + "loss": 2.8804, + "step": 1952 + }, + { + "epoch": 0.17, + "grad_norm": 1.634738802909851, + "learning_rate": 4.163816689466484e-05, + "loss": 2.6103, + "step": 1956 + }, + { + "epoch": 0.17, + "grad_norm": 1.5783636569976807, + "learning_rate": 4.1621067031463754e-05, + "loss": 2.4903, + "step": 1960 + }, + { + "epoch": 0.17, + "grad_norm": 1.4169994592666626, + "learning_rate": 4.160396716826266e-05, + "loss": 2.6708, + "step": 1964 + }, + { + "epoch": 0.17, + "grad_norm": 2.4021108150482178, + "learning_rate": 4.158686730506156e-05, + "loss": 2.6697, + "step": 1968 + }, + { + "epoch": 0.17, + "grad_norm": 1.4128559827804565, + "learning_rate": 4.156976744186047e-05, + "loss": 2.7368, + "step": 1972 + }, + { + "epoch": 0.17, + "grad_norm": 1.5087906122207642, + "learning_rate": 4.155266757865937e-05, + "loss": 2.7219, + "step": 1976 + }, + { + "epoch": 0.17, + "grad_norm": 6.4628214836120605, + "learning_rate": 4.153556771545828e-05, + "loss": 2.7273, + "step": 1980 + }, + { + "epoch": 0.17, + "grad_norm": 1.5917600393295288, + "learning_rate": 4.1518467852257184e-05, + "loss": 2.885, + "step": 1984 + }, + { + "epoch": 0.17, + "grad_norm": 1.4118601083755493, + "learning_rate": 4.150136798905609e-05, + "loss": 2.7849, + "step": 1988 + }, + { + "epoch": 0.17, + "grad_norm": 1.4783868789672852, + "learning_rate": 4.1484268125855e-05, + "loss": 2.8531, + "step": 1992 + }, + { + "epoch": 0.17, + "grad_norm": 1.6047073602676392, + "learning_rate": 4.14671682626539e-05, + "loss": 2.784, + "step": 1996 + }, + { + "epoch": 0.17, + "grad_norm": 1.565877079963684, + "learning_rate": 4.145006839945281e-05, + "loss": 2.6856, + "step": 2000 + }, + { + "epoch": 0.17, + "grad_norm": 1.4095959663391113, + "learning_rate": 4.143296853625171e-05, + "loss": 2.7933, + "step": 2004 + }, + { + "epoch": 0.17, + "grad_norm": 1.3851944208145142, + "learning_rate": 4.1415868673050615e-05, + "loss": 2.7286, + "step": 2008 + }, + { + "epoch": 0.17, + "grad_norm": 1.4448835849761963, + "learning_rate": 4.139876880984952e-05, + "loss": 2.667, + "step": 2012 + }, + { + "epoch": 0.17, + "grad_norm": 1.2041794061660767, + "learning_rate": 4.138166894664843e-05, + "loss": 2.553, + "step": 2016 + }, + { + "epoch": 0.17, + "grad_norm": 1.5906238555908203, + "learning_rate": 4.136456908344734e-05, + "loss": 2.7194, + "step": 2020 + }, + { + "epoch": 0.17, + "grad_norm": 1.493734359741211, + "learning_rate": 4.134746922024624e-05, + "loss": 2.6597, + "step": 2024 + }, + { + "epoch": 0.17, + "grad_norm": 1.485025405883789, + "learning_rate": 4.1330369357045145e-05, + "loss": 2.7412, + "step": 2028 + }, + { + "epoch": 0.17, + "grad_norm": 1.6668092012405396, + "learning_rate": 4.131326949384405e-05, + "loss": 2.7559, + "step": 2032 + }, + { + "epoch": 0.17, + "grad_norm": 1.6972142457962036, + "learning_rate": 4.129616963064295e-05, + "loss": 2.6343, + "step": 2036 + }, + { + "epoch": 0.17, + "grad_norm": 1.6248745918273926, + "learning_rate": 4.127906976744187e-05, + "loss": 2.749, + "step": 2040 + }, + { + "epoch": 0.17, + "grad_norm": 1.486807942390442, + "learning_rate": 4.126196990424077e-05, + "loss": 2.5962, + "step": 2044 + }, + { + "epoch": 0.18, + "grad_norm": 1.558280110359192, + "learning_rate": 4.1244870041039675e-05, + "loss": 2.7067, + "step": 2048 + }, + { + "epoch": 0.18, + "grad_norm": 1.78428053855896, + "learning_rate": 4.122777017783858e-05, + "loss": 2.7901, + "step": 2052 + }, + { + "epoch": 0.18, + "grad_norm": 1.5545462369918823, + "learning_rate": 4.121067031463748e-05, + "loss": 2.7868, + "step": 2056 + }, + { + "epoch": 0.18, + "grad_norm": 1.5399055480957031, + "learning_rate": 4.119357045143639e-05, + "loss": 2.8405, + "step": 2060 + }, + { + "epoch": 0.18, + "grad_norm": 1.535919189453125, + "learning_rate": 4.11764705882353e-05, + "loss": 2.6297, + "step": 2064 + }, + { + "epoch": 0.18, + "grad_norm": 1.3103563785552979, + "learning_rate": 4.1159370725034205e-05, + "loss": 2.6334, + "step": 2068 + }, + { + "epoch": 0.18, + "grad_norm": 1.4171690940856934, + "learning_rate": 4.114227086183311e-05, + "loss": 2.5261, + "step": 2072 + }, + { + "epoch": 0.18, + "grad_norm": 1.5107210874557495, + "learning_rate": 4.112517099863201e-05, + "loss": 2.5669, + "step": 2076 + }, + { + "epoch": 0.18, + "grad_norm": 1.4645127058029175, + "learning_rate": 4.110807113543092e-05, + "loss": 2.6166, + "step": 2080 + }, + { + "epoch": 0.18, + "grad_norm": 1.8557233810424805, + "learning_rate": 4.109097127222982e-05, + "loss": 2.6651, + "step": 2084 + }, + { + "epoch": 0.18, + "grad_norm": 1.4747710227966309, + "learning_rate": 4.107387140902873e-05, + "loss": 2.9066, + "step": 2088 + }, + { + "epoch": 0.18, + "grad_norm": 1.4575109481811523, + "learning_rate": 4.1056771545827635e-05, + "loss": 2.7079, + "step": 2092 + }, + { + "epoch": 0.18, + "grad_norm": 1.5356212854385376, + "learning_rate": 4.103967168262654e-05, + "loss": 2.711, + "step": 2096 + }, + { + "epoch": 0.18, + "grad_norm": 1.4428212642669678, + "learning_rate": 4.102257181942545e-05, + "loss": 2.652, + "step": 2100 + }, + { + "epoch": 0.18, + "grad_norm": 1.6778663396835327, + "learning_rate": 4.100547195622435e-05, + "loss": 2.7872, + "step": 2104 + }, + { + "epoch": 0.18, + "grad_norm": 1.4642986059188843, + "learning_rate": 4.098837209302326e-05, + "loss": 2.587, + "step": 2108 + }, + { + "epoch": 0.18, + "grad_norm": 1.5550886392593384, + "learning_rate": 4.097127222982216e-05, + "loss": 2.6646, + "step": 2112 + }, + { + "epoch": 0.18, + "grad_norm": 1.6808562278747559, + "learning_rate": 4.0954172366621066e-05, + "loss": 2.588, + "step": 2116 + }, + { + "epoch": 0.18, + "grad_norm": 1.7080578804016113, + "learning_rate": 4.093707250341998e-05, + "loss": 2.6144, + "step": 2120 + }, + { + "epoch": 0.18, + "grad_norm": 1.8413456678390503, + "learning_rate": 4.091997264021888e-05, + "loss": 2.8098, + "step": 2124 + }, + { + "epoch": 0.18, + "grad_norm": 1.4448609352111816, + "learning_rate": 4.090287277701779e-05, + "loss": 2.7619, + "step": 2128 + }, + { + "epoch": 0.18, + "grad_norm": 1.5994160175323486, + "learning_rate": 4.088577291381669e-05, + "loss": 2.6444, + "step": 2132 + }, + { + "epoch": 0.18, + "grad_norm": 1.5190199613571167, + "learning_rate": 4.0868673050615596e-05, + "loss": 2.5597, + "step": 2136 + }, + { + "epoch": 0.18, + "grad_norm": 1.5521091222763062, + "learning_rate": 4.08515731874145e-05, + "loss": 2.6333, + "step": 2140 + }, + { + "epoch": 0.18, + "grad_norm": 1.6080738306045532, + "learning_rate": 4.083447332421341e-05, + "loss": 2.5827, + "step": 2144 + }, + { + "epoch": 0.18, + "grad_norm": 2.0923423767089844, + "learning_rate": 4.081737346101232e-05, + "loss": 2.6463, + "step": 2148 + }, + { + "epoch": 0.18, + "grad_norm": 1.4243260622024536, + "learning_rate": 4.080027359781122e-05, + "loss": 2.6702, + "step": 2152 + }, + { + "epoch": 0.18, + "grad_norm": 1.5463379621505737, + "learning_rate": 4.0783173734610126e-05, + "loss": 2.6284, + "step": 2156 + }, + { + "epoch": 0.18, + "grad_norm": 1.4061999320983887, + "learning_rate": 4.076607387140903e-05, + "loss": 2.4983, + "step": 2160 + }, + { + "epoch": 0.19, + "grad_norm": 1.3713377714157104, + "learning_rate": 4.0748974008207933e-05, + "loss": 2.5406, + "step": 2164 + }, + { + "epoch": 0.19, + "grad_norm": 1.6995548009872437, + "learning_rate": 4.073187414500684e-05, + "loss": 2.5039, + "step": 2168 + }, + { + "epoch": 0.19, + "grad_norm": 1.644362449645996, + "learning_rate": 4.071477428180575e-05, + "loss": 2.7398, + "step": 2172 + }, + { + "epoch": 0.19, + "grad_norm": 1.5956717729568481, + "learning_rate": 4.0697674418604655e-05, + "loss": 2.7333, + "step": 2176 + }, + { + "epoch": 0.19, + "grad_norm": 1.567484736442566, + "learning_rate": 4.068057455540356e-05, + "loss": 2.727, + "step": 2180 + }, + { + "epoch": 0.19, + "grad_norm": 1.573633074760437, + "learning_rate": 4.066347469220246e-05, + "loss": 2.639, + "step": 2184 + }, + { + "epoch": 0.19, + "grad_norm": 1.5799304246902466, + "learning_rate": 4.064637482900137e-05, + "loss": 2.735, + "step": 2188 + }, + { + "epoch": 0.19, + "grad_norm": 1.5772294998168945, + "learning_rate": 4.062927496580027e-05, + "loss": 2.5753, + "step": 2192 + }, + { + "epoch": 0.19, + "grad_norm": 1.4983608722686768, + "learning_rate": 4.061217510259918e-05, + "loss": 2.5989, + "step": 2196 + }, + { + "epoch": 0.19, + "grad_norm": 1.625112533569336, + "learning_rate": 4.0595075239398086e-05, + "loss": 2.6539, + "step": 2200 + }, + { + "epoch": 0.19, + "grad_norm": 1.4899359941482544, + "learning_rate": 4.057797537619699e-05, + "loss": 2.7391, + "step": 2204 + }, + { + "epoch": 0.19, + "grad_norm": 1.4032224416732788, + "learning_rate": 4.05608755129959e-05, + "loss": 2.4294, + "step": 2208 + }, + { + "epoch": 0.19, + "grad_norm": 2.379164218902588, + "learning_rate": 4.05437756497948e-05, + "loss": 2.8045, + "step": 2212 + }, + { + "epoch": 0.19, + "grad_norm": 1.4953522682189941, + "learning_rate": 4.052667578659371e-05, + "loss": 2.6627, + "step": 2216 + }, + { + "epoch": 0.19, + "grad_norm": 1.3828809261322021, + "learning_rate": 4.0509575923392616e-05, + "loss": 2.5951, + "step": 2220 + }, + { + "epoch": 0.19, + "grad_norm": 1.5651103258132935, + "learning_rate": 4.0492476060191516e-05, + "loss": 2.7003, + "step": 2224 + }, + { + "epoch": 0.19, + "grad_norm": 1.7900563478469849, + "learning_rate": 4.047537619699043e-05, + "loss": 2.5935, + "step": 2228 + }, + { + "epoch": 0.19, + "grad_norm": 1.9265758991241455, + "learning_rate": 4.045827633378933e-05, + "loss": 2.6538, + "step": 2232 + }, + { + "epoch": 0.19, + "grad_norm": 1.4881079196929932, + "learning_rate": 4.044117647058824e-05, + "loss": 2.4857, + "step": 2236 + }, + { + "epoch": 0.19, + "grad_norm": 1.4665502309799194, + "learning_rate": 4.042407660738714e-05, + "loss": 2.5874, + "step": 2240 + }, + { + "epoch": 0.19, + "grad_norm": 1.5124859809875488, + "learning_rate": 4.0406976744186046e-05, + "loss": 2.8067, + "step": 2244 + }, + { + "epoch": 0.19, + "grad_norm": 1.4011058807373047, + "learning_rate": 4.0389876880984954e-05, + "loss": 2.5625, + "step": 2248 + }, + { + "epoch": 0.19, + "grad_norm": 1.469796895980835, + "learning_rate": 4.037277701778386e-05, + "loss": 2.7695, + "step": 2252 + }, + { + "epoch": 0.19, + "grad_norm": 1.610304355621338, + "learning_rate": 4.035567715458277e-05, + "loss": 2.5637, + "step": 2256 + }, + { + "epoch": 0.19, + "grad_norm": 1.3841075897216797, + "learning_rate": 4.033857729138167e-05, + "loss": 2.7228, + "step": 2260 + }, + { + "epoch": 0.19, + "grad_norm": 1.6841098070144653, + "learning_rate": 4.0321477428180576e-05, + "loss": 2.6203, + "step": 2264 + }, + { + "epoch": 0.19, + "grad_norm": 1.5729596614837646, + "learning_rate": 4.0304377564979484e-05, + "loss": 2.5021, + "step": 2268 + }, + { + "epoch": 0.19, + "grad_norm": 1.6014264822006226, + "learning_rate": 4.0287277701778384e-05, + "loss": 2.6825, + "step": 2272 + }, + { + "epoch": 0.19, + "grad_norm": 1.3496001958847046, + "learning_rate": 4.027017783857729e-05, + "loss": 2.5271, + "step": 2276 + }, + { + "epoch": 0.19, + "grad_norm": 1.5231562852859497, + "learning_rate": 4.02530779753762e-05, + "loss": 2.6128, + "step": 2280 + }, + { + "epoch": 0.2, + "grad_norm": 1.7021198272705078, + "learning_rate": 4.0235978112175106e-05, + "loss": 2.7152, + "step": 2284 + }, + { + "epoch": 0.2, + "grad_norm": 1.4971519708633423, + "learning_rate": 4.0218878248974014e-05, + "loss": 2.6637, + "step": 2288 + }, + { + "epoch": 0.2, + "grad_norm": 1.6225758790969849, + "learning_rate": 4.0201778385772914e-05, + "loss": 2.6871, + "step": 2292 + }, + { + "epoch": 0.2, + "grad_norm": 1.5330243110656738, + "learning_rate": 4.018467852257182e-05, + "loss": 2.5721, + "step": 2296 + }, + { + "epoch": 0.2, + "grad_norm": 1.4374375343322754, + "learning_rate": 4.016757865937072e-05, + "loss": 2.6393, + "step": 2300 + }, + { + "epoch": 0.2, + "grad_norm": 1.5932408571243286, + "learning_rate": 4.015047879616963e-05, + "loss": 2.5341, + "step": 2304 + }, + { + "epoch": 0.2, + "grad_norm": 1.7230651378631592, + "learning_rate": 4.0133378932968544e-05, + "loss": 2.6912, + "step": 2308 + }, + { + "epoch": 0.2, + "grad_norm": 1.5877001285552979, + "learning_rate": 4.0116279069767444e-05, + "loss": 2.6632, + "step": 2312 + }, + { + "epoch": 0.2, + "grad_norm": 1.6193994283676147, + "learning_rate": 4.009917920656635e-05, + "loss": 2.3792, + "step": 2316 + }, + { + "epoch": 0.2, + "grad_norm": 1.5001564025878906, + "learning_rate": 4.008207934336525e-05, + "loss": 2.538, + "step": 2320 + }, + { + "epoch": 0.2, + "grad_norm": 1.5603808164596558, + "learning_rate": 4.006497948016416e-05, + "loss": 2.5206, + "step": 2324 + }, + { + "epoch": 0.2, + "grad_norm": 1.4274383783340454, + "learning_rate": 4.004787961696307e-05, + "loss": 2.6341, + "step": 2328 + }, + { + "epoch": 0.2, + "grad_norm": 1.5026893615722656, + "learning_rate": 4.0030779753761974e-05, + "loss": 2.5531, + "step": 2332 + }, + { + "epoch": 0.2, + "grad_norm": 1.415287733078003, + "learning_rate": 4.001367989056088e-05, + "loss": 2.6344, + "step": 2336 + }, + { + "epoch": 0.2, + "grad_norm": 1.61545991897583, + "learning_rate": 3.999658002735978e-05, + "loss": 2.5476, + "step": 2340 + }, + { + "epoch": 0.2, + "grad_norm": 1.4016505479812622, + "learning_rate": 3.997948016415869e-05, + "loss": 2.562, + "step": 2344 + }, + { + "epoch": 0.2, + "grad_norm": 1.665184497833252, + "learning_rate": 3.99623803009576e-05, + "loss": 2.6033, + "step": 2348 + }, + { + "epoch": 0.2, + "grad_norm": 1.562911868095398, + "learning_rate": 3.99452804377565e-05, + "loss": 2.6955, + "step": 2352 + }, + { + "epoch": 0.2, + "grad_norm": 1.477481484413147, + "learning_rate": 3.9928180574555405e-05, + "loss": 2.66, + "step": 2356 + }, + { + "epoch": 0.2, + "grad_norm": 1.4621987342834473, + "learning_rate": 3.991108071135431e-05, + "loss": 2.4728, + "step": 2360 + }, + { + "epoch": 0.2, + "grad_norm": 1.3315041065216064, + "learning_rate": 3.989398084815322e-05, + "loss": 2.5248, + "step": 2364 + }, + { + "epoch": 0.2, + "grad_norm": 1.876422643661499, + "learning_rate": 3.987688098495212e-05, + "loss": 2.6436, + "step": 2368 + }, + { + "epoch": 0.2, + "grad_norm": 1.47256600856781, + "learning_rate": 3.985978112175103e-05, + "loss": 2.5254, + "step": 2372 + }, + { + "epoch": 0.2, + "grad_norm": 1.6386239528656006, + "learning_rate": 3.9842681258549935e-05, + "loss": 2.6176, + "step": 2376 + }, + { + "epoch": 0.2, + "grad_norm": 1.3476667404174805, + "learning_rate": 3.9825581395348835e-05, + "loss": 2.4059, + "step": 2380 + }, + { + "epoch": 0.2, + "grad_norm": 1.436629056930542, + "learning_rate": 3.980848153214774e-05, + "loss": 2.6304, + "step": 2384 + }, + { + "epoch": 0.2, + "grad_norm": 1.3741427659988403, + "learning_rate": 3.979138166894665e-05, + "loss": 2.5209, + "step": 2388 + }, + { + "epoch": 0.2, + "grad_norm": 1.5190575122833252, + "learning_rate": 3.977428180574556e-05, + "loss": 2.4463, + "step": 2392 + }, + { + "epoch": 0.2, + "grad_norm": 1.389143943786621, + "learning_rate": 3.9757181942544465e-05, + "loss": 2.6484, + "step": 2396 + }, + { + "epoch": 0.21, + "grad_norm": 1.3748509883880615, + "learning_rate": 3.9740082079343365e-05, + "loss": 2.4275, + "step": 2400 + }, + { + "epoch": 0.21, + "grad_norm": 1.7997660636901855, + "learning_rate": 3.972298221614227e-05, + "loss": 2.6384, + "step": 2404 + }, + { + "epoch": 0.21, + "grad_norm": 1.4528918266296387, + "learning_rate": 3.970588235294117e-05, + "loss": 2.6204, + "step": 2408 + }, + { + "epoch": 0.21, + "grad_norm": 1.421633243560791, + "learning_rate": 3.968878248974009e-05, + "loss": 2.5918, + "step": 2412 + }, + { + "epoch": 0.21, + "grad_norm": 1.4433393478393555, + "learning_rate": 3.9671682626538994e-05, + "loss": 2.6482, + "step": 2416 + }, + { + "epoch": 0.21, + "grad_norm": 1.2582066059112549, + "learning_rate": 3.9654582763337895e-05, + "loss": 2.413, + "step": 2420 + }, + { + "epoch": 0.21, + "grad_norm": 1.394639253616333, + "learning_rate": 3.96374829001368e-05, + "loss": 2.4778, + "step": 2424 + }, + { + "epoch": 0.21, + "grad_norm": 1.424068570137024, + "learning_rate": 3.96203830369357e-05, + "loss": 2.6462, + "step": 2428 + }, + { + "epoch": 0.21, + "grad_norm": 1.5364696979522705, + "learning_rate": 3.960328317373461e-05, + "loss": 2.5108, + "step": 2432 + }, + { + "epoch": 0.21, + "grad_norm": 1.5929545164108276, + "learning_rate": 3.958618331053352e-05, + "loss": 2.5638, + "step": 2436 + }, + { + "epoch": 0.21, + "grad_norm": 1.5119692087173462, + "learning_rate": 3.9569083447332425e-05, + "loss": 2.6631, + "step": 2440 + }, + { + "epoch": 0.21, + "grad_norm": 1.573603630065918, + "learning_rate": 3.955198358413133e-05, + "loss": 2.7071, + "step": 2444 + }, + { + "epoch": 0.21, + "grad_norm": 1.563513159751892, + "learning_rate": 3.953488372093023e-05, + "loss": 2.4158, + "step": 2448 + }, + { + "epoch": 0.21, + "grad_norm": 1.572964072227478, + "learning_rate": 3.951778385772914e-05, + "loss": 2.6124, + "step": 2452 + }, + { + "epoch": 0.21, + "grad_norm": 1.4729305505752563, + "learning_rate": 3.950068399452805e-05, + "loss": 2.4546, + "step": 2456 + }, + { + "epoch": 0.21, + "grad_norm": 1.3578672409057617, + "learning_rate": 3.948358413132695e-05, + "loss": 2.5572, + "step": 2460 + }, + { + "epoch": 0.21, + "grad_norm": 1.456809401512146, + "learning_rate": 3.9466484268125855e-05, + "loss": 2.6155, + "step": 2464 + }, + { + "epoch": 0.21, + "grad_norm": 1.417240023612976, + "learning_rate": 3.944938440492476e-05, + "loss": 2.6249, + "step": 2468 + }, + { + "epoch": 0.21, + "grad_norm": 1.6173032522201538, + "learning_rate": 3.943228454172367e-05, + "loss": 2.3838, + "step": 2472 + }, + { + "epoch": 0.21, + "grad_norm": 1.5841619968414307, + "learning_rate": 3.941518467852258e-05, + "loss": 2.5878, + "step": 2476 + }, + { + "epoch": 0.21, + "grad_norm": 1.526047945022583, + "learning_rate": 3.939808481532148e-05, + "loss": 2.6055, + "step": 2480 + }, + { + "epoch": 0.21, + "grad_norm": 1.375135064125061, + "learning_rate": 3.9380984952120385e-05, + "loss": 2.4982, + "step": 2484 + }, + { + "epoch": 0.21, + "grad_norm": 2.0735349655151367, + "learning_rate": 3.9363885088919286e-05, + "loss": 2.5964, + "step": 2488 + }, + { + "epoch": 0.21, + "grad_norm": 1.4464576244354248, + "learning_rate": 3.93467852257182e-05, + "loss": 2.5043, + "step": 2492 + }, + { + "epoch": 0.21, + "grad_norm": 1.4004943370819092, + "learning_rate": 3.93296853625171e-05, + "loss": 2.3337, + "step": 2496 + }, + { + "epoch": 0.21, + "grad_norm": 1.5199224948883057, + "learning_rate": 3.931258549931601e-05, + "loss": 2.5759, + "step": 2500 + }, + { + "epoch": 0.21, + "grad_norm": 2.1466448307037354, + "learning_rate": 3.9295485636114915e-05, + "loss": 2.6014, + "step": 2504 + }, + { + "epoch": 0.21, + "grad_norm": 1.3880302906036377, + "learning_rate": 3.9278385772913816e-05, + "loss": 2.4023, + "step": 2508 + }, + { + "epoch": 0.21, + "grad_norm": 1.501216173171997, + "learning_rate": 3.926128590971272e-05, + "loss": 2.6306, + "step": 2512 + }, + { + "epoch": 0.22, + "grad_norm": 1.5279217958450317, + "learning_rate": 3.924418604651163e-05, + "loss": 2.5785, + "step": 2516 + }, + { + "epoch": 0.22, + "grad_norm": 1.4741731882095337, + "learning_rate": 3.922708618331054e-05, + "loss": 2.5174, + "step": 2520 + }, + { + "epoch": 0.22, + "grad_norm": 1.6046087741851807, + "learning_rate": 3.9209986320109445e-05, + "loss": 2.6823, + "step": 2524 + }, + { + "epoch": 0.22, + "grad_norm": 1.4944065809249878, + "learning_rate": 3.9192886456908346e-05, + "loss": 2.7186, + "step": 2528 + }, + { + "epoch": 0.22, + "grad_norm": 1.3260518312454224, + "learning_rate": 3.917578659370725e-05, + "loss": 2.3741, + "step": 2532 + }, + { + "epoch": 0.22, + "grad_norm": 1.591991901397705, + "learning_rate": 3.9158686730506154e-05, + "loss": 2.5534, + "step": 2536 + }, + { + "epoch": 0.22, + "grad_norm": 1.5477372407913208, + "learning_rate": 3.914158686730506e-05, + "loss": 2.568, + "step": 2540 + }, + { + "epoch": 0.22, + "grad_norm": 1.5402345657348633, + "learning_rate": 3.912448700410397e-05, + "loss": 2.5795, + "step": 2544 + }, + { + "epoch": 0.22, + "grad_norm": 2.2363271713256836, + "learning_rate": 3.9107387140902876e-05, + "loss": 2.4248, + "step": 2548 + }, + { + "epoch": 0.22, + "grad_norm": 1.2395968437194824, + "learning_rate": 3.909028727770178e-05, + "loss": 2.6678, + "step": 2552 + }, + { + "epoch": 0.22, + "grad_norm": 1.5662035942077637, + "learning_rate": 3.9073187414500684e-05, + "loss": 2.6065, + "step": 2556 + }, + { + "epoch": 0.22, + "grad_norm": 1.3480955362319946, + "learning_rate": 3.905608755129959e-05, + "loss": 2.5286, + "step": 2560 + }, + { + "epoch": 0.22, + "grad_norm": 1.5644841194152832, + "learning_rate": 3.90389876880985e-05, + "loss": 2.439, + "step": 2564 + }, + { + "epoch": 0.22, + "grad_norm": 1.556207537651062, + "learning_rate": 3.90218878248974e-05, + "loss": 2.5694, + "step": 2568 + }, + { + "epoch": 0.22, + "grad_norm": 1.4543012380599976, + "learning_rate": 3.9004787961696306e-05, + "loss": 2.5277, + "step": 2572 + }, + { + "epoch": 0.22, + "grad_norm": 2.2177674770355225, + "learning_rate": 3.8987688098495214e-05, + "loss": 2.5114, + "step": 2576 + }, + { + "epoch": 0.22, + "grad_norm": 1.5734175443649292, + "learning_rate": 3.897058823529412e-05, + "loss": 2.469, + "step": 2580 + }, + { + "epoch": 0.22, + "grad_norm": 1.5571191310882568, + "learning_rate": 3.895348837209303e-05, + "loss": 2.5852, + "step": 2584 + }, + { + "epoch": 0.22, + "grad_norm": 1.5742443799972534, + "learning_rate": 3.893638850889193e-05, + "loss": 2.429, + "step": 2588 + }, + { + "epoch": 0.22, + "grad_norm": 1.4786051511764526, + "learning_rate": 3.8919288645690836e-05, + "loss": 2.6498, + "step": 2592 + }, + { + "epoch": 0.22, + "grad_norm": 1.5065261125564575, + "learning_rate": 3.890218878248974e-05, + "loss": 2.5307, + "step": 2596 + }, + { + "epoch": 0.22, + "grad_norm": 1.9272600412368774, + "learning_rate": 3.888508891928865e-05, + "loss": 2.5898, + "step": 2600 + }, + { + "epoch": 0.22, + "grad_norm": 1.5606516599655151, + "learning_rate": 3.886798905608756e-05, + "loss": 2.4504, + "step": 2604 + }, + { + "epoch": 0.22, + "grad_norm": 1.4898043870925903, + "learning_rate": 3.885088919288646e-05, + "loss": 2.4989, + "step": 2608 + }, + { + "epoch": 0.22, + "grad_norm": 1.878469467163086, + "learning_rate": 3.8833789329685366e-05, + "loss": 2.6865, + "step": 2612 + }, + { + "epoch": 0.22, + "grad_norm": 1.8781071901321411, + "learning_rate": 3.881668946648427e-05, + "loss": 2.5997, + "step": 2616 + }, + { + "epoch": 0.22, + "grad_norm": 1.6376733779907227, + "learning_rate": 3.8799589603283174e-05, + "loss": 2.5791, + "step": 2620 + }, + { + "epoch": 0.22, + "grad_norm": 1.4639188051223755, + "learning_rate": 3.878248974008208e-05, + "loss": 2.3323, + "step": 2624 + }, + { + "epoch": 0.22, + "grad_norm": 1.5879504680633545, + "learning_rate": 3.876538987688099e-05, + "loss": 2.4866, + "step": 2628 + }, + { + "epoch": 0.23, + "grad_norm": 1.5608062744140625, + "learning_rate": 3.8748290013679896e-05, + "loss": 2.3779, + "step": 2632 + }, + { + "epoch": 0.23, + "grad_norm": 1.390249252319336, + "learning_rate": 3.87311901504788e-05, + "loss": 2.3288, + "step": 2636 + }, + { + "epoch": 0.23, + "grad_norm": 1.357127070426941, + "learning_rate": 3.8714090287277704e-05, + "loss": 2.5016, + "step": 2640 + }, + { + "epoch": 0.23, + "grad_norm": 1.4445414543151855, + "learning_rate": 3.8696990424076605e-05, + "loss": 2.5401, + "step": 2644 + }, + { + "epoch": 0.23, + "grad_norm": 1.4636071920394897, + "learning_rate": 3.867989056087551e-05, + "loss": 2.3385, + "step": 2648 + }, + { + "epoch": 0.23, + "grad_norm": 1.4558255672454834, + "learning_rate": 3.866279069767442e-05, + "loss": 2.3848, + "step": 2652 + }, + { + "epoch": 0.23, + "grad_norm": 1.6711084842681885, + "learning_rate": 3.864569083447333e-05, + "loss": 2.4541, + "step": 2656 + }, + { + "epoch": 0.23, + "grad_norm": 2.5824813842773438, + "learning_rate": 3.8628590971272234e-05, + "loss": 2.4533, + "step": 2660 + }, + { + "epoch": 0.23, + "grad_norm": 1.5478894710540771, + "learning_rate": 3.8611491108071135e-05, + "loss": 2.4698, + "step": 2664 + }, + { + "epoch": 0.23, + "grad_norm": 2.2620136737823486, + "learning_rate": 3.859439124487004e-05, + "loss": 2.746, + "step": 2668 + }, + { + "epoch": 0.23, + "grad_norm": 2.4922306537628174, + "learning_rate": 3.857729138166895e-05, + "loss": 2.6415, + "step": 2672 + }, + { + "epoch": 0.23, + "grad_norm": 1.476605772972107, + "learning_rate": 3.856019151846785e-05, + "loss": 2.4816, + "step": 2676 + }, + { + "epoch": 0.23, + "grad_norm": 1.7977741956710815, + "learning_rate": 3.8543091655266764e-05, + "loss": 2.4403, + "step": 2680 + }, + { + "epoch": 0.23, + "grad_norm": 1.3775659799575806, + "learning_rate": 3.8525991792065665e-05, + "loss": 2.425, + "step": 2684 + }, + { + "epoch": 0.23, + "grad_norm": 1.645110845565796, + "learning_rate": 3.850889192886457e-05, + "loss": 2.4656, + "step": 2688 + }, + { + "epoch": 0.23, + "grad_norm": 1.3301700353622437, + "learning_rate": 3.849179206566348e-05, + "loss": 2.4745, + "step": 2692 + }, + { + "epoch": 0.23, + "grad_norm": 1.7708193063735962, + "learning_rate": 3.847469220246238e-05, + "loss": 2.6024, + "step": 2696 + }, + { + "epoch": 0.23, + "grad_norm": 1.4959282875061035, + "learning_rate": 3.845759233926129e-05, + "loss": 2.5686, + "step": 2700 + }, + { + "epoch": 0.23, + "grad_norm": 1.6098191738128662, + "learning_rate": 3.8440492476060194e-05, + "loss": 2.3471, + "step": 2704 + }, + { + "epoch": 0.23, + "grad_norm": 1.3885496854782104, + "learning_rate": 3.84233926128591e-05, + "loss": 2.3912, + "step": 2708 + }, + { + "epoch": 0.23, + "grad_norm": 1.4107521772384644, + "learning_rate": 3.840629274965801e-05, + "loss": 2.397, + "step": 2712 + }, + { + "epoch": 0.23, + "grad_norm": 1.6801328659057617, + "learning_rate": 3.838919288645691e-05, + "loss": 2.5848, + "step": 2716 + }, + { + "epoch": 0.23, + "grad_norm": 1.3853564262390137, + "learning_rate": 3.837209302325582e-05, + "loss": 2.2871, + "step": 2720 + }, + { + "epoch": 0.23, + "grad_norm": 1.3792238235473633, + "learning_rate": 3.835499316005472e-05, + "loss": 2.45, + "step": 2724 + }, + { + "epoch": 0.23, + "grad_norm": 1.6318219900131226, + "learning_rate": 3.8337893296853625e-05, + "loss": 2.6403, + "step": 2728 + }, + { + "epoch": 0.23, + "grad_norm": 1.7002670764923096, + "learning_rate": 3.832079343365253e-05, + "loss": 2.5752, + "step": 2732 + }, + { + "epoch": 0.23, + "grad_norm": 1.3887910842895508, + "learning_rate": 3.830369357045144e-05, + "loss": 2.4802, + "step": 2736 + }, + { + "epoch": 0.23, + "grad_norm": 1.4336968660354614, + "learning_rate": 3.828659370725035e-05, + "loss": 2.4769, + "step": 2740 + }, + { + "epoch": 0.23, + "grad_norm": 1.5607534646987915, + "learning_rate": 3.826949384404925e-05, + "loss": 2.4018, + "step": 2744 + }, + { + "epoch": 0.23, + "grad_norm": 1.4709196090698242, + "learning_rate": 3.8252393980848155e-05, + "loss": 2.4358, + "step": 2748 + }, + { + "epoch": 0.24, + "grad_norm": 1.3464138507843018, + "learning_rate": 3.8235294117647055e-05, + "loss": 2.5305, + "step": 2752 + }, + { + "epoch": 0.24, + "grad_norm": 1.4054791927337646, + "learning_rate": 3.821819425444596e-05, + "loss": 2.4714, + "step": 2756 + }, + { + "epoch": 0.24, + "grad_norm": 1.699159026145935, + "learning_rate": 3.820109439124488e-05, + "loss": 2.4277, + "step": 2760 + }, + { + "epoch": 0.24, + "grad_norm": 1.3283201456069946, + "learning_rate": 3.818399452804378e-05, + "loss": 2.405, + "step": 2764 + }, + { + "epoch": 0.24, + "grad_norm": 1.6334400177001953, + "learning_rate": 3.8166894664842685e-05, + "loss": 2.6551, + "step": 2768 + }, + { + "epoch": 0.24, + "grad_norm": 1.4785411357879639, + "learning_rate": 3.8149794801641585e-05, + "loss": 2.5849, + "step": 2772 + }, + { + "epoch": 0.24, + "grad_norm": 1.5953280925750732, + "learning_rate": 3.813269493844049e-05, + "loss": 2.4022, + "step": 2776 + }, + { + "epoch": 0.24, + "grad_norm": 1.4671732187271118, + "learning_rate": 3.81155950752394e-05, + "loss": 2.3134, + "step": 2780 + }, + { + "epoch": 0.24, + "grad_norm": 1.461626410484314, + "learning_rate": 3.809849521203831e-05, + "loss": 2.4263, + "step": 2784 + }, + { + "epoch": 0.24, + "grad_norm": 1.5867888927459717, + "learning_rate": 3.8081395348837215e-05, + "loss": 2.4209, + "step": 2788 + }, + { + "epoch": 0.24, + "grad_norm": 1.4046286344528198, + "learning_rate": 3.8064295485636115e-05, + "loss": 2.3673, + "step": 2792 + }, + { + "epoch": 0.24, + "grad_norm": 1.6984418630599976, + "learning_rate": 3.804719562243502e-05, + "loss": 2.4858, + "step": 2796 + }, + { + "epoch": 0.24, + "grad_norm": 1.6407222747802734, + "learning_rate": 3.803009575923393e-05, + "loss": 2.4795, + "step": 2800 + }, + { + "epoch": 0.24, + "grad_norm": 1.4830973148345947, + "learning_rate": 3.801299589603283e-05, + "loss": 2.5164, + "step": 2804 + }, + { + "epoch": 0.24, + "grad_norm": 1.2736021280288696, + "learning_rate": 3.799589603283174e-05, + "loss": 2.3541, + "step": 2808 + }, + { + "epoch": 0.24, + "grad_norm": 1.564693570137024, + "learning_rate": 3.7978796169630645e-05, + "loss": 2.3652, + "step": 2812 + }, + { + "epoch": 0.24, + "grad_norm": 1.5286824703216553, + "learning_rate": 3.796169630642955e-05, + "loss": 2.5063, + "step": 2816 + }, + { + "epoch": 0.24, + "grad_norm": 1.466086745262146, + "learning_rate": 3.794459644322846e-05, + "loss": 2.5876, + "step": 2820 + }, + { + "epoch": 0.24, + "grad_norm": 1.5650023221969604, + "learning_rate": 3.792749658002736e-05, + "loss": 2.4316, + "step": 2824 + }, + { + "epoch": 0.24, + "grad_norm": 1.4438190460205078, + "learning_rate": 3.791039671682627e-05, + "loss": 2.3659, + "step": 2828 + }, + { + "epoch": 0.24, + "grad_norm": 1.8183629512786865, + "learning_rate": 3.789329685362517e-05, + "loss": 2.4262, + "step": 2832 + }, + { + "epoch": 0.24, + "grad_norm": 1.856266736984253, + "learning_rate": 3.7876196990424076e-05, + "loss": 2.457, + "step": 2836 + }, + { + "epoch": 0.24, + "grad_norm": 1.6876139640808105, + "learning_rate": 3.785909712722299e-05, + "loss": 2.4234, + "step": 2840 + }, + { + "epoch": 0.24, + "grad_norm": 1.3969238996505737, + "learning_rate": 3.784199726402189e-05, + "loss": 2.3044, + "step": 2844 + }, + { + "epoch": 0.24, + "grad_norm": 1.3718289136886597, + "learning_rate": 3.78248974008208e-05, + "loss": 2.4889, + "step": 2848 + }, + { + "epoch": 0.24, + "grad_norm": 1.5117738246917725, + "learning_rate": 3.78077975376197e-05, + "loss": 2.3139, + "step": 2852 + }, + { + "epoch": 0.24, + "grad_norm": 1.564997673034668, + "learning_rate": 3.7790697674418606e-05, + "loss": 2.6801, + "step": 2856 + }, + { + "epoch": 0.24, + "grad_norm": 1.5139191150665283, + "learning_rate": 3.777359781121751e-05, + "loss": 2.3891, + "step": 2860 + }, + { + "epoch": 0.24, + "grad_norm": 1.5658632516860962, + "learning_rate": 3.775649794801642e-05, + "loss": 2.4947, + "step": 2864 + }, + { + "epoch": 0.25, + "grad_norm": 1.5268386602401733, + "learning_rate": 3.773939808481533e-05, + "loss": 2.4591, + "step": 2868 + }, + { + "epoch": 0.25, + "grad_norm": 1.977797508239746, + "learning_rate": 3.772229822161423e-05, + "loss": 2.5294, + "step": 2872 + }, + { + "epoch": 0.25, + "grad_norm": 1.368944764137268, + "learning_rate": 3.7705198358413136e-05, + "loss": 2.2917, + "step": 2876 + }, + { + "epoch": 0.25, + "grad_norm": 1.502429723739624, + "learning_rate": 3.7688098495212036e-05, + "loss": 2.5181, + "step": 2880 + }, + { + "epoch": 0.25, + "grad_norm": 1.4561350345611572, + "learning_rate": 3.7670998632010944e-05, + "loss": 2.4059, + "step": 2884 + }, + { + "epoch": 0.25, + "grad_norm": 1.9906591176986694, + "learning_rate": 3.765389876880985e-05, + "loss": 2.475, + "step": 2888 + }, + { + "epoch": 0.25, + "grad_norm": 1.607998251914978, + "learning_rate": 3.763679890560876e-05, + "loss": 2.668, + "step": 2892 + }, + { + "epoch": 0.25, + "grad_norm": 1.4631425142288208, + "learning_rate": 3.7619699042407666e-05, + "loss": 2.4278, + "step": 2896 + }, + { + "epoch": 0.25, + "grad_norm": 1.3773366212844849, + "learning_rate": 3.7602599179206566e-05, + "loss": 2.5251, + "step": 2900 + }, + { + "epoch": 0.25, + "grad_norm": 1.8001102209091187, + "learning_rate": 3.7585499316005474e-05, + "loss": 2.4981, + "step": 2904 + }, + { + "epoch": 0.25, + "grad_norm": 1.4368932247161865, + "learning_rate": 3.756839945280438e-05, + "loss": 2.3902, + "step": 2908 + }, + { + "epoch": 0.25, + "grad_norm": 1.5067082643508911, + "learning_rate": 3.755129958960328e-05, + "loss": 2.4835, + "step": 2912 + }, + { + "epoch": 0.25, + "grad_norm": 1.6977261304855347, + "learning_rate": 3.753419972640219e-05, + "loss": 2.4424, + "step": 2916 + }, + { + "epoch": 0.25, + "grad_norm": 1.5715231895446777, + "learning_rate": 3.7517099863201096e-05, + "loss": 2.376, + "step": 2920 + }, + { + "epoch": 0.25, + "grad_norm": 1.8464969396591187, + "learning_rate": 3.7500000000000003e-05, + "loss": 2.3253, + "step": 2924 + }, + { + "epoch": 0.25, + "grad_norm": 1.4582910537719727, + "learning_rate": 3.748290013679891e-05, + "loss": 2.2077, + "step": 2928 + }, + { + "epoch": 0.25, + "grad_norm": 1.3920525312423706, + "learning_rate": 3.746580027359781e-05, + "loss": 2.2856, + "step": 2932 + }, + { + "epoch": 0.25, + "grad_norm": 1.594838261604309, + "learning_rate": 3.744870041039672e-05, + "loss": 2.441, + "step": 2936 + }, + { + "epoch": 0.25, + "grad_norm": 1.608803629875183, + "learning_rate": 3.743160054719562e-05, + "loss": 2.5194, + "step": 2940 + }, + { + "epoch": 0.25, + "grad_norm": 1.492957592010498, + "learning_rate": 3.741450068399453e-05, + "loss": 2.3489, + "step": 2944 + }, + { + "epoch": 0.25, + "grad_norm": 4.186474323272705, + "learning_rate": 3.739740082079344e-05, + "loss": 2.5155, + "step": 2948 + }, + { + "epoch": 0.25, + "grad_norm": 1.3143714666366577, + "learning_rate": 3.738030095759234e-05, + "loss": 2.5857, + "step": 2952 + }, + { + "epoch": 0.25, + "grad_norm": 2.287429094314575, + "learning_rate": 3.736320109439125e-05, + "loss": 2.4715, + "step": 2956 + }, + { + "epoch": 0.25, + "grad_norm": 1.3473930358886719, + "learning_rate": 3.734610123119015e-05, + "loss": 2.4606, + "step": 2960 + }, + { + "epoch": 0.25, + "grad_norm": 1.6107207536697388, + "learning_rate": 3.7329001367989057e-05, + "loss": 2.4097, + "step": 2964 + }, + { + "epoch": 0.25, + "grad_norm": 1.959020733833313, + "learning_rate": 3.7311901504787964e-05, + "loss": 2.5748, + "step": 2968 + }, + { + "epoch": 0.25, + "grad_norm": 1.483560562133789, + "learning_rate": 3.729480164158687e-05, + "loss": 2.4517, + "step": 2972 + }, + { + "epoch": 0.25, + "grad_norm": 1.4312429428100586, + "learning_rate": 3.727770177838578e-05, + "loss": 2.6805, + "step": 2976 + }, + { + "epoch": 0.25, + "grad_norm": 1.3787798881530762, + "learning_rate": 3.726060191518468e-05, + "loss": 2.3881, + "step": 2980 + }, + { + "epoch": 0.26, + "grad_norm": 1.6351797580718994, + "learning_rate": 3.7243502051983587e-05, + "loss": 2.4977, + "step": 2984 + }, + { + "epoch": 0.26, + "grad_norm": 1.615729808807373, + "learning_rate": 3.722640218878249e-05, + "loss": 2.4574, + "step": 2988 + }, + { + "epoch": 0.26, + "grad_norm": 1.4360637664794922, + "learning_rate": 3.7209302325581394e-05, + "loss": 2.4282, + "step": 2992 + }, + { + "epoch": 0.26, + "grad_norm": 1.5573183298110962, + "learning_rate": 3.71922024623803e-05, + "loss": 2.3941, + "step": 2996 + }, + { + "epoch": 0.26, + "grad_norm": 1.398614764213562, + "learning_rate": 3.717510259917921e-05, + "loss": 2.4977, + "step": 3000 + }, + { + "epoch": 0.26, + "grad_norm": 1.7220714092254639, + "learning_rate": 3.7158002735978116e-05, + "loss": 2.3985, + "step": 3004 + }, + { + "epoch": 0.26, + "grad_norm": 1.470420479774475, + "learning_rate": 3.714090287277702e-05, + "loss": 2.3886, + "step": 3008 + }, + { + "epoch": 0.26, + "grad_norm": 1.478071689605713, + "learning_rate": 3.7123803009575924e-05, + "loss": 2.3968, + "step": 3012 + }, + { + "epoch": 0.26, + "grad_norm": 1.4263502359390259, + "learning_rate": 3.710670314637483e-05, + "loss": 2.5225, + "step": 3016 + }, + { + "epoch": 0.26, + "grad_norm": 1.7249011993408203, + "learning_rate": 3.708960328317373e-05, + "loss": 2.538, + "step": 3020 + }, + { + "epoch": 0.26, + "grad_norm": 1.5792475938796997, + "learning_rate": 3.707250341997264e-05, + "loss": 2.6076, + "step": 3024 + }, + { + "epoch": 0.26, + "grad_norm": 1.558214783668518, + "learning_rate": 3.705540355677155e-05, + "loss": 2.4636, + "step": 3028 + }, + { + "epoch": 0.26, + "grad_norm": 1.395617961883545, + "learning_rate": 3.7038303693570454e-05, + "loss": 2.4319, + "step": 3032 + }, + { + "epoch": 0.26, + "grad_norm": 1.6385856866836548, + "learning_rate": 3.702120383036936e-05, + "loss": 2.7062, + "step": 3036 + }, + { + "epoch": 0.26, + "grad_norm": 1.4634343385696411, + "learning_rate": 3.700410396716826e-05, + "loss": 2.5087, + "step": 3040 + }, + { + "epoch": 0.26, + "grad_norm": 1.6941776275634766, + "learning_rate": 3.698700410396717e-05, + "loss": 2.3967, + "step": 3044 + }, + { + "epoch": 0.26, + "grad_norm": 1.5948282480239868, + "learning_rate": 3.696990424076607e-05, + "loss": 2.371, + "step": 3048 + }, + { + "epoch": 0.26, + "grad_norm": 1.6816450357437134, + "learning_rate": 3.6952804377564984e-05, + "loss": 2.3695, + "step": 3052 + }, + { + "epoch": 0.26, + "grad_norm": 1.4531540870666504, + "learning_rate": 3.693570451436389e-05, + "loss": 2.375, + "step": 3056 + }, + { + "epoch": 0.26, + "grad_norm": 1.337263584136963, + "learning_rate": 3.691860465116279e-05, + "loss": 2.3965, + "step": 3060 + }, + { + "epoch": 0.26, + "grad_norm": 1.5273529291152954, + "learning_rate": 3.69015047879617e-05, + "loss": 2.3984, + "step": 3064 + }, + { + "epoch": 0.26, + "grad_norm": 1.4230504035949707, + "learning_rate": 3.68844049247606e-05, + "loss": 2.4717, + "step": 3068 + }, + { + "epoch": 0.26, + "grad_norm": 1.6386831998825073, + "learning_rate": 3.686730506155951e-05, + "loss": 2.5523, + "step": 3072 + }, + { + "epoch": 0.26, + "grad_norm": 1.5074329376220703, + "learning_rate": 3.6850205198358415e-05, + "loss": 2.4326, + "step": 3076 + }, + { + "epoch": 0.26, + "grad_norm": 1.5377895832061768, + "learning_rate": 3.683310533515732e-05, + "loss": 2.3838, + "step": 3080 + }, + { + "epoch": 0.26, + "grad_norm": 1.5310460329055786, + "learning_rate": 3.681600547195623e-05, + "loss": 2.5792, + "step": 3084 + }, + { + "epoch": 0.26, + "grad_norm": 1.486627221107483, + "learning_rate": 3.679890560875513e-05, + "loss": 2.3772, + "step": 3088 + }, + { + "epoch": 0.26, + "grad_norm": 1.9239519834518433, + "learning_rate": 3.678180574555404e-05, + "loss": 2.4844, + "step": 3092 + }, + { + "epoch": 0.26, + "grad_norm": 1.4352439641952515, + "learning_rate": 3.6764705882352945e-05, + "loss": 2.3233, + "step": 3096 + }, + { + "epoch": 0.27, + "grad_norm": 1.540075421333313, + "learning_rate": 3.6747606019151845e-05, + "loss": 2.3849, + "step": 3100 + }, + { + "epoch": 0.27, + "grad_norm": 1.5862653255462646, + "learning_rate": 3.673050615595075e-05, + "loss": 2.3689, + "step": 3104 + }, + { + "epoch": 0.27, + "grad_norm": 1.4095063209533691, + "learning_rate": 3.671340629274966e-05, + "loss": 2.3854, + "step": 3108 + }, + { + "epoch": 0.27, + "grad_norm": 1.3623970746994019, + "learning_rate": 3.669630642954857e-05, + "loss": 2.2868, + "step": 3112 + }, + { + "epoch": 0.27, + "grad_norm": 1.4835405349731445, + "learning_rate": 3.667920656634747e-05, + "loss": 2.3524, + "step": 3116 + }, + { + "epoch": 0.27, + "grad_norm": 1.790026068687439, + "learning_rate": 3.6662106703146375e-05, + "loss": 2.379, + "step": 3120 + }, + { + "epoch": 0.27, + "grad_norm": 1.452793836593628, + "learning_rate": 3.664500683994528e-05, + "loss": 2.2826, + "step": 3124 + }, + { + "epoch": 0.27, + "grad_norm": 1.438991665840149, + "learning_rate": 3.662790697674418e-05, + "loss": 2.4365, + "step": 3128 + }, + { + "epoch": 0.27, + "grad_norm": 1.6045351028442383, + "learning_rate": 3.66108071135431e-05, + "loss": 2.2628, + "step": 3132 + }, + { + "epoch": 0.27, + "grad_norm": 1.568385362625122, + "learning_rate": 3.6593707250342e-05, + "loss": 2.465, + "step": 3136 + }, + { + "epoch": 0.27, + "grad_norm": 1.451302170753479, + "learning_rate": 3.6576607387140905e-05, + "loss": 2.3324, + "step": 3140 + }, + { + "epoch": 0.27, + "grad_norm": 1.4595221281051636, + "learning_rate": 3.655950752393981e-05, + "loss": 2.4677, + "step": 3144 + }, + { + "epoch": 0.27, + "grad_norm": 1.5326175689697266, + "learning_rate": 3.654240766073871e-05, + "loss": 2.458, + "step": 3148 + }, + { + "epoch": 0.27, + "grad_norm": 1.651713490486145, + "learning_rate": 3.652530779753762e-05, + "loss": 2.3911, + "step": 3152 + }, + { + "epoch": 0.27, + "grad_norm": 1.475189208984375, + "learning_rate": 3.650820793433653e-05, + "loss": 2.4598, + "step": 3156 + }, + { + "epoch": 0.27, + "grad_norm": 1.7967069149017334, + "learning_rate": 3.6491108071135435e-05, + "loss": 2.41, + "step": 3160 + }, + { + "epoch": 0.27, + "grad_norm": 1.380880355834961, + "learning_rate": 3.647400820793434e-05, + "loss": 2.3952, + "step": 3164 + }, + { + "epoch": 0.27, + "grad_norm": 1.392062783241272, + "learning_rate": 3.645690834473324e-05, + "loss": 2.5062, + "step": 3168 + }, + { + "epoch": 0.27, + "grad_norm": 1.5198041200637817, + "learning_rate": 3.643980848153215e-05, + "loss": 2.3908, + "step": 3172 + }, + { + "epoch": 0.27, + "grad_norm": 1.3749966621398926, + "learning_rate": 3.642270861833105e-05, + "loss": 2.3101, + "step": 3176 + }, + { + "epoch": 0.27, + "grad_norm": 1.559767246246338, + "learning_rate": 3.640560875512996e-05, + "loss": 2.3929, + "step": 3180 + }, + { + "epoch": 0.27, + "grad_norm": 1.3676600456237793, + "learning_rate": 3.6388508891928866e-05, + "loss": 2.4963, + "step": 3184 + }, + { + "epoch": 0.27, + "grad_norm": 1.4563826322555542, + "learning_rate": 3.637140902872777e-05, + "loss": 2.4377, + "step": 3188 + }, + { + "epoch": 0.27, + "grad_norm": 1.5789074897766113, + "learning_rate": 3.635430916552668e-05, + "loss": 2.484, + "step": 3192 + }, + { + "epoch": 0.27, + "grad_norm": 1.633427381515503, + "learning_rate": 3.633720930232558e-05, + "loss": 2.5337, + "step": 3196 + }, + { + "epoch": 0.27, + "grad_norm": 1.6052011251449585, + "learning_rate": 3.632010943912449e-05, + "loss": 2.2981, + "step": 3200 + }, + { + "epoch": 0.27, + "grad_norm": 1.4831585884094238, + "learning_rate": 3.6303009575923396e-05, + "loss": 2.2886, + "step": 3204 + }, + { + "epoch": 0.27, + "grad_norm": 1.715384840965271, + "learning_rate": 3.6285909712722296e-05, + "loss": 2.2484, + "step": 3208 + }, + { + "epoch": 0.27, + "grad_norm": 1.5094373226165771, + "learning_rate": 3.626880984952121e-05, + "loss": 2.3951, + "step": 3212 + }, + { + "epoch": 0.27, + "grad_norm": 1.4048563241958618, + "learning_rate": 3.625170998632011e-05, + "loss": 2.3434, + "step": 3216 + }, + { + "epoch": 0.28, + "grad_norm": 2.033588171005249, + "learning_rate": 3.623461012311902e-05, + "loss": 2.4892, + "step": 3220 + }, + { + "epoch": 0.28, + "grad_norm": 1.5815871953964233, + "learning_rate": 3.6217510259917926e-05, + "loss": 2.4064, + "step": 3224 + }, + { + "epoch": 0.28, + "grad_norm": 1.5218071937561035, + "learning_rate": 3.6200410396716826e-05, + "loss": 2.4017, + "step": 3228 + }, + { + "epoch": 0.28, + "grad_norm": 1.5122883319854736, + "learning_rate": 3.6183310533515733e-05, + "loss": 2.2099, + "step": 3232 + }, + { + "epoch": 0.28, + "grad_norm": 1.6052781343460083, + "learning_rate": 3.616621067031464e-05, + "loss": 2.4246, + "step": 3236 + }, + { + "epoch": 0.28, + "grad_norm": 1.629193663597107, + "learning_rate": 3.614911080711355e-05, + "loss": 2.5361, + "step": 3240 + }, + { + "epoch": 0.28, + "grad_norm": 1.478102445602417, + "learning_rate": 3.613201094391245e-05, + "loss": 2.2904, + "step": 3244 + }, + { + "epoch": 0.28, + "grad_norm": 1.5051685571670532, + "learning_rate": 3.6114911080711356e-05, + "loss": 2.5879, + "step": 3248 + }, + { + "epoch": 0.28, + "grad_norm": 1.39858078956604, + "learning_rate": 3.609781121751026e-05, + "loss": 2.3035, + "step": 3252 + }, + { + "epoch": 0.28, + "grad_norm": 1.5903834104537964, + "learning_rate": 3.6080711354309164e-05, + "loss": 2.3173, + "step": 3256 + }, + { + "epoch": 0.28, + "grad_norm": 1.5299763679504395, + "learning_rate": 3.606361149110807e-05, + "loss": 2.3344, + "step": 3260 + }, + { + "epoch": 0.28, + "grad_norm": 1.5950665473937988, + "learning_rate": 3.604651162790698e-05, + "loss": 2.2693, + "step": 3264 + }, + { + "epoch": 0.28, + "grad_norm": 1.4136861562728882, + "learning_rate": 3.6029411764705886e-05, + "loss": 2.3145, + "step": 3268 + }, + { + "epoch": 0.28, + "grad_norm": 1.4014285802841187, + "learning_rate": 3.601231190150479e-05, + "loss": 2.4621, + "step": 3272 + }, + { + "epoch": 0.28, + "grad_norm": 1.419997215270996, + "learning_rate": 3.5995212038303694e-05, + "loss": 2.3412, + "step": 3276 + }, + { + "epoch": 0.28, + "grad_norm": 1.4487018585205078, + "learning_rate": 3.59781121751026e-05, + "loss": 2.3379, + "step": 3280 + }, + { + "epoch": 0.28, + "grad_norm": 1.4295108318328857, + "learning_rate": 3.59610123119015e-05, + "loss": 2.3464, + "step": 3284 + }, + { + "epoch": 0.28, + "grad_norm": 1.5380290746688843, + "learning_rate": 3.594391244870041e-05, + "loss": 2.1826, + "step": 3288 + }, + { + "epoch": 0.28, + "grad_norm": 1.3480552434921265, + "learning_rate": 3.5926812585499316e-05, + "loss": 2.2881, + "step": 3292 + }, + { + "epoch": 0.28, + "grad_norm": 1.5125305652618408, + "learning_rate": 3.5909712722298224e-05, + "loss": 2.4374, + "step": 3296 + }, + { + "epoch": 0.28, + "grad_norm": 1.5424193143844604, + "learning_rate": 3.589261285909713e-05, + "loss": 2.3602, + "step": 3300 + }, + { + "epoch": 0.28, + "grad_norm": 1.9561680555343628, + "learning_rate": 3.587551299589603e-05, + "loss": 2.598, + "step": 3304 + }, + { + "epoch": 0.28, + "grad_norm": 1.4579107761383057, + "learning_rate": 3.585841313269494e-05, + "loss": 2.4397, + "step": 3308 + }, + { + "epoch": 0.28, + "grad_norm": 1.3612507581710815, + "learning_rate": 3.5841313269493846e-05, + "loss": 2.3842, + "step": 3312 + }, + { + "epoch": 0.28, + "grad_norm": 1.6156667470932007, + "learning_rate": 3.582421340629275e-05, + "loss": 2.4143, + "step": 3316 + }, + { + "epoch": 0.28, + "grad_norm": 1.457672119140625, + "learning_rate": 3.580711354309166e-05, + "loss": 2.327, + "step": 3320 + }, + { + "epoch": 0.28, + "grad_norm": 1.474558711051941, + "learning_rate": 3.579001367989056e-05, + "loss": 2.3957, + "step": 3324 + }, + { + "epoch": 0.28, + "grad_norm": 1.4732122421264648, + "learning_rate": 3.577291381668947e-05, + "loss": 2.4016, + "step": 3328 + }, + { + "epoch": 0.28, + "grad_norm": 1.4019269943237305, + "learning_rate": 3.5755813953488376e-05, + "loss": 2.4573, + "step": 3332 + }, + { + "epoch": 0.29, + "grad_norm": 1.6624845266342163, + "learning_rate": 3.573871409028728e-05, + "loss": 2.2681, + "step": 3336 + }, + { + "epoch": 0.29, + "grad_norm": 1.4161522388458252, + "learning_rate": 3.5721614227086184e-05, + "loss": 2.2106, + "step": 3340 + }, + { + "epoch": 0.29, + "grad_norm": 1.3711514472961426, + "learning_rate": 3.570451436388509e-05, + "loss": 2.1563, + "step": 3344 + }, + { + "epoch": 0.29, + "grad_norm": 1.575616478919983, + "learning_rate": 3.5687414500684e-05, + "loss": 2.3763, + "step": 3348 + }, + { + "epoch": 0.29, + "grad_norm": 1.6165823936462402, + "learning_rate": 3.5670314637482906e-05, + "loss": 2.3312, + "step": 3352 + }, + { + "epoch": 0.29, + "grad_norm": 1.3387306928634644, + "learning_rate": 3.565321477428181e-05, + "loss": 2.2245, + "step": 3356 + }, + { + "epoch": 0.29, + "grad_norm": 1.541719675064087, + "learning_rate": 3.5636114911080714e-05, + "loss": 2.3347, + "step": 3360 + }, + { + "epoch": 0.29, + "grad_norm": 1.4336274862289429, + "learning_rate": 3.5619015047879615e-05, + "loss": 2.3484, + "step": 3364 + }, + { + "epoch": 0.29, + "grad_norm": 1.586674690246582, + "learning_rate": 3.560191518467852e-05, + "loss": 2.5278, + "step": 3368 + }, + { + "epoch": 0.29, + "grad_norm": 1.429751992225647, + "learning_rate": 3.558481532147743e-05, + "loss": 2.2046, + "step": 3372 + }, + { + "epoch": 0.29, + "grad_norm": 1.6239968538284302, + "learning_rate": 3.556771545827634e-05, + "loss": 2.4159, + "step": 3376 + }, + { + "epoch": 0.29, + "grad_norm": 2.665915012359619, + "learning_rate": 3.5550615595075244e-05, + "loss": 2.3933, + "step": 3380 + }, + { + "epoch": 0.29, + "grad_norm": 1.532535195350647, + "learning_rate": 3.5533515731874145e-05, + "loss": 2.514, + "step": 3384 + }, + { + "epoch": 0.29, + "grad_norm": 1.8504074811935425, + "learning_rate": 3.551641586867305e-05, + "loss": 2.4337, + "step": 3388 + }, + { + "epoch": 0.29, + "grad_norm": 1.6196644306182861, + "learning_rate": 3.549931600547195e-05, + "loss": 2.297, + "step": 3392 + }, + { + "epoch": 0.29, + "grad_norm": 1.5914441347122192, + "learning_rate": 3.548221614227086e-05, + "loss": 2.3395, + "step": 3396 + }, + { + "epoch": 0.29, + "grad_norm": 1.5233490467071533, + "learning_rate": 3.5465116279069774e-05, + "loss": 2.253, + "step": 3400 + }, + { + "epoch": 0.29, + "grad_norm": 1.4033238887786865, + "learning_rate": 3.5448016415868675e-05, + "loss": 2.3872, + "step": 3404 + }, + { + "epoch": 0.29, + "grad_norm": 1.485261082649231, + "learning_rate": 3.543091655266758e-05, + "loss": 2.367, + "step": 3408 + }, + { + "epoch": 0.29, + "grad_norm": 1.3384130001068115, + "learning_rate": 3.541381668946648e-05, + "loss": 2.2609, + "step": 3412 + }, + { + "epoch": 0.29, + "grad_norm": 1.3713083267211914, + "learning_rate": 3.539671682626539e-05, + "loss": 2.311, + "step": 3416 + }, + { + "epoch": 0.29, + "grad_norm": 1.6065647602081299, + "learning_rate": 3.53796169630643e-05, + "loss": 2.1545, + "step": 3420 + }, + { + "epoch": 0.29, + "grad_norm": 1.455736517906189, + "learning_rate": 3.5362517099863205e-05, + "loss": 2.3288, + "step": 3424 + }, + { + "epoch": 0.29, + "grad_norm": 1.6390444040298462, + "learning_rate": 3.534541723666211e-05, + "loss": 2.3899, + "step": 3428 + }, + { + "epoch": 0.29, + "grad_norm": 1.3747378587722778, + "learning_rate": 3.532831737346101e-05, + "loss": 2.3622, + "step": 3432 + }, + { + "epoch": 0.29, + "grad_norm": 1.4323629140853882, + "learning_rate": 3.531121751025992e-05, + "loss": 2.2561, + "step": 3436 + }, + { + "epoch": 0.29, + "grad_norm": 1.4392497539520264, + "learning_rate": 3.529411764705883e-05, + "loss": 2.4516, + "step": 3440 + }, + { + "epoch": 0.29, + "grad_norm": 1.507541298866272, + "learning_rate": 3.527701778385773e-05, + "loss": 2.3012, + "step": 3444 + }, + { + "epoch": 0.29, + "grad_norm": 1.6980098485946655, + "learning_rate": 3.5259917920656635e-05, + "loss": 2.3305, + "step": 3448 + }, + { + "epoch": 0.3, + "grad_norm": 1.378029704093933, + "learning_rate": 3.524281805745554e-05, + "loss": 2.3159, + "step": 3452 + }, + { + "epoch": 0.3, + "grad_norm": 1.4904416799545288, + "learning_rate": 3.522571819425445e-05, + "loss": 2.2613, + "step": 3456 + }, + { + "epoch": 0.3, + "grad_norm": 1.5655815601348877, + "learning_rate": 3.520861833105336e-05, + "loss": 2.3091, + "step": 3460 + }, + { + "epoch": 0.3, + "grad_norm": 1.4613752365112305, + "learning_rate": 3.519151846785226e-05, + "loss": 2.3852, + "step": 3464 + }, + { + "epoch": 0.3, + "grad_norm": 1.735697627067566, + "learning_rate": 3.5174418604651165e-05, + "loss": 2.3746, + "step": 3468 + }, + { + "epoch": 0.3, + "grad_norm": 1.56474769115448, + "learning_rate": 3.5157318741450066e-05, + "loss": 2.4085, + "step": 3472 + }, + { + "epoch": 0.3, + "grad_norm": 1.6144722700119019, + "learning_rate": 3.514021887824897e-05, + "loss": 2.2048, + "step": 3476 + }, + { + "epoch": 0.3, + "grad_norm": 1.4500874280929565, + "learning_rate": 3.512311901504789e-05, + "loss": 2.4197, + "step": 3480 + }, + { + "epoch": 0.3, + "grad_norm": 1.5875989198684692, + "learning_rate": 3.510601915184679e-05, + "loss": 2.3395, + "step": 3484 + }, + { + "epoch": 0.3, + "grad_norm": 1.5678739547729492, + "learning_rate": 3.5088919288645695e-05, + "loss": 2.4267, + "step": 3488 + }, + { + "epoch": 0.3, + "grad_norm": 1.4578819274902344, + "learning_rate": 3.5071819425444596e-05, + "loss": 2.3653, + "step": 3492 + }, + { + "epoch": 0.3, + "grad_norm": 1.8042303323745728, + "learning_rate": 3.50547195622435e-05, + "loss": 2.3785, + "step": 3496 + }, + { + "epoch": 0.3, + "grad_norm": 1.4263440370559692, + "learning_rate": 3.5037619699042403e-05, + "loss": 2.3203, + "step": 3500 + }, + { + "epoch": 0.3, + "grad_norm": 1.5346360206604004, + "learning_rate": 3.502051983584132e-05, + "loss": 2.3125, + "step": 3504 + }, + { + "epoch": 0.3, + "grad_norm": 2.5747292041778564, + "learning_rate": 3.5003419972640225e-05, + "loss": 2.4275, + "step": 3508 + }, + { + "epoch": 0.3, + "grad_norm": 1.7760605812072754, + "learning_rate": 3.4986320109439126e-05, + "loss": 2.6035, + "step": 3512 + }, + { + "epoch": 0.3, + "grad_norm": 1.5713919401168823, + "learning_rate": 3.496922024623803e-05, + "loss": 2.3099, + "step": 3516 + }, + { + "epoch": 0.3, + "grad_norm": 1.6528106927871704, + "learning_rate": 3.4952120383036933e-05, + "loss": 2.4168, + "step": 3520 + }, + { + "epoch": 0.3, + "grad_norm": 1.4009822607040405, + "learning_rate": 3.493502051983584e-05, + "loss": 2.3955, + "step": 3524 + }, + { + "epoch": 0.3, + "grad_norm": 1.7046074867248535, + "learning_rate": 3.491792065663475e-05, + "loss": 2.3926, + "step": 3528 + }, + { + "epoch": 0.3, + "grad_norm": 1.3500627279281616, + "learning_rate": 3.4900820793433655e-05, + "loss": 2.3855, + "step": 3532 + }, + { + "epoch": 0.3, + "grad_norm": 1.4953322410583496, + "learning_rate": 3.488372093023256e-05, + "loss": 2.3912, + "step": 3536 + }, + { + "epoch": 0.3, + "grad_norm": 1.3733028173446655, + "learning_rate": 3.486662106703146e-05, + "loss": 2.2607, + "step": 3540 + }, + { + "epoch": 0.3, + "grad_norm": 1.427222490310669, + "learning_rate": 3.484952120383037e-05, + "loss": 2.3234, + "step": 3544 + }, + { + "epoch": 0.3, + "grad_norm": 1.4577361345291138, + "learning_rate": 3.483242134062928e-05, + "loss": 2.3509, + "step": 3548 + }, + { + "epoch": 0.3, + "grad_norm": 1.3897165060043335, + "learning_rate": 3.481532147742818e-05, + "loss": 2.154, + "step": 3552 + }, + { + "epoch": 0.3, + "grad_norm": 1.4338810443878174, + "learning_rate": 3.4798221614227086e-05, + "loss": 2.2335, + "step": 3556 + }, + { + "epoch": 0.3, + "grad_norm": 1.6268779039382935, + "learning_rate": 3.478112175102599e-05, + "loss": 2.2263, + "step": 3560 + }, + { + "epoch": 0.3, + "grad_norm": 1.4855129718780518, + "learning_rate": 3.47640218878249e-05, + "loss": 2.1839, + "step": 3564 + }, + { + "epoch": 0.31, + "grad_norm": 1.525172233581543, + "learning_rate": 3.474692202462381e-05, + "loss": 2.3011, + "step": 3568 + }, + { + "epoch": 0.31, + "grad_norm": 1.543357253074646, + "learning_rate": 3.472982216142271e-05, + "loss": 2.34, + "step": 3572 + }, + { + "epoch": 0.31, + "grad_norm": 1.4775084257125854, + "learning_rate": 3.4712722298221616e-05, + "loss": 2.2593, + "step": 3576 + }, + { + "epoch": 0.31, + "grad_norm": 1.3302327394485474, + "learning_rate": 3.4695622435020516e-05, + "loss": 2.1703, + "step": 3580 + }, + { + "epoch": 0.31, + "grad_norm": 3.1734111309051514, + "learning_rate": 3.467852257181943e-05, + "loss": 2.3284, + "step": 3584 + }, + { + "epoch": 0.31, + "grad_norm": 1.5368781089782715, + "learning_rate": 3.466142270861834e-05, + "loss": 2.3187, + "step": 3588 + }, + { + "epoch": 0.31, + "grad_norm": 1.526089072227478, + "learning_rate": 3.464432284541724e-05, + "loss": 2.3247, + "step": 3592 + }, + { + "epoch": 0.31, + "grad_norm": 1.460354208946228, + "learning_rate": 3.4627222982216146e-05, + "loss": 2.2871, + "step": 3596 + }, + { + "epoch": 0.31, + "grad_norm": 1.5204929113388062, + "learning_rate": 3.4610123119015046e-05, + "loss": 2.2317, + "step": 3600 + }, + { + "epoch": 0.31, + "grad_norm": 1.355778455734253, + "learning_rate": 3.4593023255813954e-05, + "loss": 2.2038, + "step": 3604 + }, + { + "epoch": 0.31, + "grad_norm": 1.4250309467315674, + "learning_rate": 3.457592339261286e-05, + "loss": 2.288, + "step": 3608 + }, + { + "epoch": 0.31, + "grad_norm": 1.529437780380249, + "learning_rate": 3.455882352941177e-05, + "loss": 2.2193, + "step": 3612 + }, + { + "epoch": 0.31, + "grad_norm": 1.6923147439956665, + "learning_rate": 3.4541723666210676e-05, + "loss": 2.2283, + "step": 3616 + }, + { + "epoch": 0.31, + "grad_norm": 1.5242345333099365, + "learning_rate": 3.4524623803009576e-05, + "loss": 2.3881, + "step": 3620 + }, + { + "epoch": 0.31, + "grad_norm": 1.4843759536743164, + "learning_rate": 3.4507523939808484e-05, + "loss": 2.4458, + "step": 3624 + }, + { + "epoch": 0.31, + "grad_norm": 1.361167550086975, + "learning_rate": 3.4490424076607384e-05, + "loss": 2.264, + "step": 3628 + }, + { + "epoch": 0.31, + "grad_norm": 1.4210281372070312, + "learning_rate": 3.447332421340629e-05, + "loss": 2.2471, + "step": 3632 + }, + { + "epoch": 0.31, + "grad_norm": 1.5141572952270508, + "learning_rate": 3.44562243502052e-05, + "loss": 2.3368, + "step": 3636 + }, + { + "epoch": 0.31, + "grad_norm": 1.3176366090774536, + "learning_rate": 3.4439124487004106e-05, + "loss": 2.1529, + "step": 3640 + }, + { + "epoch": 0.31, + "grad_norm": 1.4620164632797241, + "learning_rate": 3.4422024623803014e-05, + "loss": 2.3995, + "step": 3644 + }, + { + "epoch": 0.31, + "grad_norm": 1.6192833185195923, + "learning_rate": 3.4404924760601914e-05, + "loss": 2.3621, + "step": 3648 + }, + { + "epoch": 0.31, + "grad_norm": 1.459579348564148, + "learning_rate": 3.438782489740082e-05, + "loss": 2.3437, + "step": 3652 + }, + { + "epoch": 0.31, + "grad_norm": 1.3151201009750366, + "learning_rate": 3.437072503419973e-05, + "loss": 2.318, + "step": 3656 + }, + { + "epoch": 0.31, + "grad_norm": 1.4882017374038696, + "learning_rate": 3.435362517099863e-05, + "loss": 2.2267, + "step": 3660 + }, + { + "epoch": 0.31, + "grad_norm": 1.3695846796035767, + "learning_rate": 3.433652530779754e-05, + "loss": 2.2106, + "step": 3664 + }, + { + "epoch": 0.31, + "grad_norm": 1.8380380868911743, + "learning_rate": 3.4319425444596444e-05, + "loss": 2.3097, + "step": 3668 + }, + { + "epoch": 0.31, + "grad_norm": 1.560056447982788, + "learning_rate": 3.430232558139535e-05, + "loss": 2.4494, + "step": 3672 + }, + { + "epoch": 0.31, + "grad_norm": 1.4692696332931519, + "learning_rate": 3.428522571819426e-05, + "loss": 2.1774, + "step": 3676 + }, + { + "epoch": 0.31, + "grad_norm": 1.585492730140686, + "learning_rate": 3.426812585499316e-05, + "loss": 2.2936, + "step": 3680 + }, + { + "epoch": 0.31, + "grad_norm": 1.5503042936325073, + "learning_rate": 3.425102599179207e-05, + "loss": 2.2004, + "step": 3684 + }, + { + "epoch": 0.32, + "grad_norm": 1.640663504600525, + "learning_rate": 3.423392612859097e-05, + "loss": 2.2647, + "step": 3688 + }, + { + "epoch": 0.32, + "grad_norm": 1.4468867778778076, + "learning_rate": 3.421682626538988e-05, + "loss": 2.3467, + "step": 3692 + }, + { + "epoch": 0.32, + "grad_norm": 1.5558197498321533, + "learning_rate": 3.419972640218879e-05, + "loss": 2.4436, + "step": 3696 + }, + { + "epoch": 0.32, + "grad_norm": 1.489776372909546, + "learning_rate": 3.418262653898769e-05, + "loss": 2.3, + "step": 3700 + }, + { + "epoch": 0.32, + "grad_norm": 1.4555796384811401, + "learning_rate": 3.41655266757866e-05, + "loss": 2.2145, + "step": 3704 + }, + { + "epoch": 0.32, + "grad_norm": 1.3881696462631226, + "learning_rate": 3.41484268125855e-05, + "loss": 2.3039, + "step": 3708 + }, + { + "epoch": 0.32, + "grad_norm": 1.412907361984253, + "learning_rate": 3.4131326949384405e-05, + "loss": 2.3594, + "step": 3712 + }, + { + "epoch": 0.32, + "grad_norm": 1.746773362159729, + "learning_rate": 3.411422708618331e-05, + "loss": 2.2989, + "step": 3716 + }, + { + "epoch": 0.32, + "grad_norm": 1.4776164293289185, + "learning_rate": 3.409712722298222e-05, + "loss": 2.3647, + "step": 3720 + }, + { + "epoch": 0.32, + "grad_norm": 1.4442262649536133, + "learning_rate": 3.408002735978113e-05, + "loss": 2.1363, + "step": 3724 + }, + { + "epoch": 0.32, + "grad_norm": 1.49587082862854, + "learning_rate": 3.406292749658003e-05, + "loss": 2.3467, + "step": 3728 + }, + { + "epoch": 0.32, + "grad_norm": 1.8401976823806763, + "learning_rate": 3.4045827633378935e-05, + "loss": 2.4508, + "step": 3732 + }, + { + "epoch": 0.32, + "grad_norm": 1.379357099533081, + "learning_rate": 3.402872777017784e-05, + "loss": 2.2629, + "step": 3736 + }, + { + "epoch": 0.32, + "grad_norm": 1.590693712234497, + "learning_rate": 3.401162790697674e-05, + "loss": 2.3216, + "step": 3740 + }, + { + "epoch": 0.32, + "grad_norm": 1.4456241130828857, + "learning_rate": 3.399452804377565e-05, + "loss": 2.3142, + "step": 3744 + }, + { + "epoch": 0.32, + "grad_norm": 1.5368770360946655, + "learning_rate": 3.397742818057456e-05, + "loss": 2.2955, + "step": 3748 + }, + { + "epoch": 0.32, + "grad_norm": 1.6656548976898193, + "learning_rate": 3.3960328317373464e-05, + "loss": 2.4015, + "step": 3752 + }, + { + "epoch": 0.32, + "grad_norm": 1.519033432006836, + "learning_rate": 3.3943228454172365e-05, + "loss": 2.1557, + "step": 3756 + }, + { + "epoch": 0.32, + "grad_norm": 1.56667959690094, + "learning_rate": 3.392612859097127e-05, + "loss": 2.5277, + "step": 3760 + }, + { + "epoch": 0.32, + "grad_norm": 1.5404236316680908, + "learning_rate": 3.390902872777018e-05, + "loss": 2.2924, + "step": 3764 + }, + { + "epoch": 0.32, + "grad_norm": 1.619696021080017, + "learning_rate": 3.389192886456908e-05, + "loss": 2.2094, + "step": 3768 + }, + { + "epoch": 0.32, + "grad_norm": 1.3681530952453613, + "learning_rate": 3.3874829001367994e-05, + "loss": 2.231, + "step": 3772 + }, + { + "epoch": 0.32, + "grad_norm": 1.7254544496536255, + "learning_rate": 3.3857729138166895e-05, + "loss": 2.2382, + "step": 3776 + }, + { + "epoch": 0.32, + "grad_norm": 1.6230955123901367, + "learning_rate": 3.38406292749658e-05, + "loss": 2.1105, + "step": 3780 + }, + { + "epoch": 0.32, + "grad_norm": 1.5223900079727173, + "learning_rate": 3.382352941176471e-05, + "loss": 2.1982, + "step": 3784 + }, + { + "epoch": 0.32, + "grad_norm": 1.3313345909118652, + "learning_rate": 3.380642954856361e-05, + "loss": 2.1896, + "step": 3788 + }, + { + "epoch": 0.32, + "grad_norm": 1.5461819171905518, + "learning_rate": 3.378932968536252e-05, + "loss": 2.2544, + "step": 3792 + }, + { + "epoch": 0.32, + "grad_norm": 1.4953100681304932, + "learning_rate": 3.3772229822161425e-05, + "loss": 2.3158, + "step": 3796 + }, + { + "epoch": 0.32, + "grad_norm": 1.4358595609664917, + "learning_rate": 3.375512995896033e-05, + "loss": 2.1203, + "step": 3800 + }, + { + "epoch": 0.33, + "grad_norm": 1.6208328008651733, + "learning_rate": 3.373803009575924e-05, + "loss": 2.3804, + "step": 3804 + }, + { + "epoch": 0.33, + "grad_norm": 1.58054518699646, + "learning_rate": 3.372093023255814e-05, + "loss": 2.2398, + "step": 3808 + }, + { + "epoch": 0.33, + "grad_norm": 1.6291831731796265, + "learning_rate": 3.370383036935705e-05, + "loss": 2.2565, + "step": 3812 + }, + { + "epoch": 0.33, + "grad_norm": 1.500659465789795, + "learning_rate": 3.368673050615595e-05, + "loss": 2.2771, + "step": 3816 + }, + { + "epoch": 0.33, + "grad_norm": 1.3757811784744263, + "learning_rate": 3.3669630642954855e-05, + "loss": 2.2549, + "step": 3820 + }, + { + "epoch": 0.33, + "grad_norm": 1.5131489038467407, + "learning_rate": 3.365253077975376e-05, + "loss": 2.1822, + "step": 3824 + }, + { + "epoch": 0.33, + "grad_norm": 1.511177897453308, + "learning_rate": 3.363543091655267e-05, + "loss": 2.3104, + "step": 3828 + }, + { + "epoch": 0.33, + "grad_norm": 1.4455082416534424, + "learning_rate": 3.361833105335158e-05, + "loss": 2.5066, + "step": 3832 + }, + { + "epoch": 0.33, + "grad_norm": 1.3408520221710205, + "learning_rate": 3.360123119015048e-05, + "loss": 2.2685, + "step": 3836 + }, + { + "epoch": 0.33, + "grad_norm": 1.3482859134674072, + "learning_rate": 3.3584131326949385e-05, + "loss": 2.4401, + "step": 3840 + }, + { + "epoch": 0.33, + "grad_norm": 1.4088398218154907, + "learning_rate": 3.356703146374829e-05, + "loss": 2.3414, + "step": 3844 + }, + { + "epoch": 0.33, + "grad_norm": 1.483805775642395, + "learning_rate": 3.354993160054719e-05, + "loss": 2.1907, + "step": 3848 + }, + { + "epoch": 0.33, + "grad_norm": 1.4749683141708374, + "learning_rate": 3.353283173734611e-05, + "loss": 2.4228, + "step": 3852 + }, + { + "epoch": 0.33, + "grad_norm": 1.5431963205337524, + "learning_rate": 3.351573187414501e-05, + "loss": 2.2988, + "step": 3856 + }, + { + "epoch": 0.33, + "grad_norm": 1.7580955028533936, + "learning_rate": 3.3498632010943915e-05, + "loss": 2.2444, + "step": 3860 + }, + { + "epoch": 0.33, + "grad_norm": 1.3347162008285522, + "learning_rate": 3.348153214774282e-05, + "loss": 2.3015, + "step": 3864 + }, + { + "epoch": 0.33, + "grad_norm": 1.4706435203552246, + "learning_rate": 3.346443228454172e-05, + "loss": 2.3175, + "step": 3868 + }, + { + "epoch": 0.33, + "grad_norm": 1.63969886302948, + "learning_rate": 3.344733242134063e-05, + "loss": 2.1465, + "step": 3872 + }, + { + "epoch": 0.33, + "grad_norm": 2.056724786758423, + "learning_rate": 3.343023255813954e-05, + "loss": 2.3949, + "step": 3876 + }, + { + "epoch": 0.33, + "grad_norm": 1.551066279411316, + "learning_rate": 3.3413132694938445e-05, + "loss": 2.2891, + "step": 3880 + }, + { + "epoch": 0.33, + "grad_norm": 1.6610709428787231, + "learning_rate": 3.3396032831737346e-05, + "loss": 2.5973, + "step": 3884 + }, + { + "epoch": 0.33, + "grad_norm": 1.5867125988006592, + "learning_rate": 3.337893296853625e-05, + "loss": 2.2614, + "step": 3888 + }, + { + "epoch": 0.33, + "grad_norm": 1.8439687490463257, + "learning_rate": 3.336183310533516e-05, + "loss": 2.1923, + "step": 3892 + }, + { + "epoch": 0.33, + "grad_norm": 1.4217692613601685, + "learning_rate": 3.334473324213406e-05, + "loss": 2.2826, + "step": 3896 + }, + { + "epoch": 0.33, + "grad_norm": 1.2552785873413086, + "learning_rate": 3.332763337893297e-05, + "loss": 2.3539, + "step": 3900 + }, + { + "epoch": 0.33, + "grad_norm": 1.5517688989639282, + "learning_rate": 3.3310533515731876e-05, + "loss": 2.2412, + "step": 3904 + }, + { + "epoch": 0.33, + "grad_norm": 1.6065213680267334, + "learning_rate": 3.329343365253078e-05, + "loss": 2.2576, + "step": 3908 + }, + { + "epoch": 0.33, + "grad_norm": 1.4800779819488525, + "learning_rate": 3.327633378932969e-05, + "loss": 2.324, + "step": 3912 + }, + { + "epoch": 0.33, + "grad_norm": 1.3546322584152222, + "learning_rate": 3.325923392612859e-05, + "loss": 2.323, + "step": 3916 + }, + { + "epoch": 0.34, + "grad_norm": 1.6678704023361206, + "learning_rate": 3.32421340629275e-05, + "loss": 2.3459, + "step": 3920 + }, + { + "epoch": 0.34, + "grad_norm": 1.4288249015808105, + "learning_rate": 3.32250341997264e-05, + "loss": 2.2059, + "step": 3924 + }, + { + "epoch": 0.34, + "grad_norm": 1.4324655532836914, + "learning_rate": 3.3207934336525306e-05, + "loss": 2.1479, + "step": 3928 + }, + { + "epoch": 0.34, + "grad_norm": 1.5552873611450195, + "learning_rate": 3.319083447332422e-05, + "loss": 2.2897, + "step": 3932 + }, + { + "epoch": 0.34, + "grad_norm": 1.4077160358428955, + "learning_rate": 3.317373461012312e-05, + "loss": 2.3246, + "step": 3936 + }, + { + "epoch": 0.34, + "grad_norm": 1.4134106636047363, + "learning_rate": 3.315663474692203e-05, + "loss": 2.2485, + "step": 3940 + }, + { + "epoch": 0.34, + "grad_norm": 1.670109510421753, + "learning_rate": 3.313953488372093e-05, + "loss": 2.2681, + "step": 3944 + }, + { + "epoch": 0.34, + "grad_norm": 1.706309199333191, + "learning_rate": 3.3122435020519836e-05, + "loss": 2.2443, + "step": 3948 + }, + { + "epoch": 0.34, + "grad_norm": 1.5278905630111694, + "learning_rate": 3.3105335157318744e-05, + "loss": 2.1895, + "step": 3952 + }, + { + "epoch": 0.34, + "grad_norm": 1.3573322296142578, + "learning_rate": 3.308823529411765e-05, + "loss": 2.4002, + "step": 3956 + }, + { + "epoch": 0.34, + "grad_norm": 1.383784532546997, + "learning_rate": 3.307113543091656e-05, + "loss": 2.3252, + "step": 3960 + }, + { + "epoch": 0.34, + "grad_norm": 1.5072522163391113, + "learning_rate": 3.305403556771546e-05, + "loss": 2.3, + "step": 3964 + }, + { + "epoch": 0.34, + "grad_norm": 1.5655957460403442, + "learning_rate": 3.3036935704514366e-05, + "loss": 2.1498, + "step": 3968 + }, + { + "epoch": 0.34, + "grad_norm": 1.4027670621871948, + "learning_rate": 3.3019835841313274e-05, + "loss": 2.2169, + "step": 3972 + }, + { + "epoch": 0.34, + "grad_norm": 1.5429667234420776, + "learning_rate": 3.3002735978112174e-05, + "loss": 2.3642, + "step": 3976 + }, + { + "epoch": 0.34, + "grad_norm": 1.4422931671142578, + "learning_rate": 3.298563611491108e-05, + "loss": 2.1742, + "step": 3980 + }, + { + "epoch": 0.34, + "grad_norm": 1.4613866806030273, + "learning_rate": 3.296853625170999e-05, + "loss": 2.4487, + "step": 3984 + }, + { + "epoch": 0.34, + "grad_norm": 1.5176717042922974, + "learning_rate": 3.2951436388508896e-05, + "loss": 2.3083, + "step": 3988 + }, + { + "epoch": 0.34, + "grad_norm": 1.7266799211502075, + "learning_rate": 3.2934336525307803e-05, + "loss": 2.2963, + "step": 3992 + }, + { + "epoch": 0.34, + "grad_norm": 1.5054651498794556, + "learning_rate": 3.2917236662106704e-05, + "loss": 2.4407, + "step": 3996 + }, + { + "epoch": 0.34, + "grad_norm": 1.4415006637573242, + "learning_rate": 3.290013679890561e-05, + "loss": 2.25, + "step": 4000 + }, + { + "epoch": 0.34, + "grad_norm": 1.5419758558273315, + "learning_rate": 3.288303693570451e-05, + "loss": 2.3623, + "step": 4004 + }, + { + "epoch": 0.34, + "grad_norm": 1.7471479177474976, + "learning_rate": 3.286593707250342e-05, + "loss": 2.4276, + "step": 4008 + }, + { + "epoch": 0.34, + "grad_norm": 1.7161086797714233, + "learning_rate": 3.284883720930233e-05, + "loss": 2.3613, + "step": 4012 + }, + { + "epoch": 0.34, + "grad_norm": 1.622846007347107, + "learning_rate": 3.2831737346101234e-05, + "loss": 2.2338, + "step": 4016 + }, + { + "epoch": 0.34, + "grad_norm": 1.52423095703125, + "learning_rate": 3.281463748290014e-05, + "loss": 2.3147, + "step": 4020 + }, + { + "epoch": 0.34, + "grad_norm": 1.4126418828964233, + "learning_rate": 3.279753761969904e-05, + "loss": 2.2087, + "step": 4024 + }, + { + "epoch": 0.34, + "grad_norm": 1.316510796546936, + "learning_rate": 3.278043775649795e-05, + "loss": 2.2318, + "step": 4028 + }, + { + "epoch": 0.34, + "grad_norm": 1.7000290155410767, + "learning_rate": 3.276333789329685e-05, + "loss": 2.187, + "step": 4032 + }, + { + "epoch": 0.35, + "grad_norm": 1.4855263233184814, + "learning_rate": 3.274623803009576e-05, + "loss": 2.1994, + "step": 4036 + }, + { + "epoch": 0.35, + "grad_norm": 1.4632859230041504, + "learning_rate": 3.272913816689467e-05, + "loss": 2.2279, + "step": 4040 + }, + { + "epoch": 0.35, + "grad_norm": 1.456778645515442, + "learning_rate": 3.271203830369357e-05, + "loss": 2.1008, + "step": 4044 + }, + { + "epoch": 0.35, + "grad_norm": 1.4377546310424805, + "learning_rate": 3.269493844049248e-05, + "loss": 2.2231, + "step": 4048 + }, + { + "epoch": 0.35, + "grad_norm": 1.5278170108795166, + "learning_rate": 3.267783857729138e-05, + "loss": 2.1359, + "step": 4052 + }, + { + "epoch": 0.35, + "grad_norm": 1.558070182800293, + "learning_rate": 3.266073871409029e-05, + "loss": 2.383, + "step": 4056 + }, + { + "epoch": 0.35, + "grad_norm": 1.5845755338668823, + "learning_rate": 3.2643638850889194e-05, + "loss": 2.2107, + "step": 4060 + }, + { + "epoch": 0.35, + "grad_norm": 1.6732598543167114, + "learning_rate": 3.26265389876881e-05, + "loss": 2.0189, + "step": 4064 + }, + { + "epoch": 0.35, + "grad_norm": 1.5206589698791504, + "learning_rate": 3.260943912448701e-05, + "loss": 2.2918, + "step": 4068 + }, + { + "epoch": 0.35, + "grad_norm": 1.6750423908233643, + "learning_rate": 3.259233926128591e-05, + "loss": 2.2082, + "step": 4072 + }, + { + "epoch": 0.35, + "grad_norm": 1.6563453674316406, + "learning_rate": 3.257523939808482e-05, + "loss": 2.3358, + "step": 4076 + }, + { + "epoch": 0.35, + "grad_norm": 1.575918197631836, + "learning_rate": 3.2558139534883724e-05, + "loss": 2.355, + "step": 4080 + }, + { + "epoch": 0.35, + "grad_norm": 1.5032954216003418, + "learning_rate": 3.2541039671682625e-05, + "loss": 2.259, + "step": 4084 + }, + { + "epoch": 0.35, + "grad_norm": 1.4477620124816895, + "learning_rate": 3.252393980848153e-05, + "loss": 2.2821, + "step": 4088 + }, + { + "epoch": 0.35, + "grad_norm": 1.342366337776184, + "learning_rate": 3.250683994528044e-05, + "loss": 2.1086, + "step": 4092 + }, + { + "epoch": 0.35, + "grad_norm": 1.4458723068237305, + "learning_rate": 3.248974008207935e-05, + "loss": 2.2197, + "step": 4096 + }, + { + "epoch": 0.35, + "grad_norm": 1.4470369815826416, + "learning_rate": 3.2472640218878254e-05, + "loss": 2.3436, + "step": 4100 + }, + { + "epoch": 0.35, + "grad_norm": 1.6926721334457397, + "learning_rate": 3.2455540355677155e-05, + "loss": 2.3198, + "step": 4104 + }, + { + "epoch": 0.35, + "grad_norm": 1.483053207397461, + "learning_rate": 3.243844049247606e-05, + "loss": 2.2944, + "step": 4108 + }, + { + "epoch": 0.35, + "grad_norm": 1.5896413326263428, + "learning_rate": 3.242134062927496e-05, + "loss": 2.1459, + "step": 4112 + }, + { + "epoch": 0.35, + "grad_norm": 1.915610432624817, + "learning_rate": 3.240424076607387e-05, + "loss": 2.2768, + "step": 4116 + }, + { + "epoch": 0.35, + "grad_norm": 1.4022173881530762, + "learning_rate": 3.2387140902872784e-05, + "loss": 2.1106, + "step": 4120 + }, + { + "epoch": 0.35, + "grad_norm": 1.3702282905578613, + "learning_rate": 3.2370041039671685e-05, + "loss": 2.0183, + "step": 4124 + }, + { + "epoch": 0.35, + "grad_norm": 1.3947548866271973, + "learning_rate": 3.235294117647059e-05, + "loss": 2.1075, + "step": 4128 + }, + { + "epoch": 0.35, + "grad_norm": 1.5231854915618896, + "learning_rate": 3.233584131326949e-05, + "loss": 2.3221, + "step": 4132 + }, + { + "epoch": 0.35, + "grad_norm": 1.5928566455841064, + "learning_rate": 3.23187414500684e-05, + "loss": 2.1517, + "step": 4136 + }, + { + "epoch": 0.35, + "grad_norm": 1.518593192100525, + "learning_rate": 3.23016415868673e-05, + "loss": 2.4391, + "step": 4140 + }, + { + "epoch": 0.35, + "grad_norm": 1.4657520055770874, + "learning_rate": 3.2284541723666215e-05, + "loss": 2.2852, + "step": 4144 + }, + { + "epoch": 0.35, + "grad_norm": 1.4905215501785278, + "learning_rate": 3.226744186046512e-05, + "loss": 2.1462, + "step": 4148 + }, + { + "epoch": 0.35, + "grad_norm": 1.6270649433135986, + "learning_rate": 3.225034199726402e-05, + "loss": 2.4138, + "step": 4152 + }, + { + "epoch": 0.36, + "grad_norm": 1.7110861539840698, + "learning_rate": 3.223324213406293e-05, + "loss": 2.2067, + "step": 4156 + }, + { + "epoch": 0.36, + "grad_norm": 1.771278977394104, + "learning_rate": 3.221614227086183e-05, + "loss": 2.3056, + "step": 4160 + }, + { + "epoch": 0.36, + "grad_norm": 1.7753063440322876, + "learning_rate": 3.219904240766074e-05, + "loss": 2.1597, + "step": 4164 + }, + { + "epoch": 0.36, + "grad_norm": 1.4673937559127808, + "learning_rate": 3.2181942544459645e-05, + "loss": 2.2006, + "step": 4168 + }, + { + "epoch": 0.36, + "grad_norm": 1.4760159254074097, + "learning_rate": 3.216484268125855e-05, + "loss": 2.213, + "step": 4172 + }, + { + "epoch": 0.36, + "grad_norm": 1.435507893562317, + "learning_rate": 3.214774281805746e-05, + "loss": 2.2281, + "step": 4176 + }, + { + "epoch": 0.36, + "grad_norm": 1.3926265239715576, + "learning_rate": 3.213064295485636e-05, + "loss": 2.1146, + "step": 4180 + }, + { + "epoch": 0.36, + "grad_norm": 1.4813460111618042, + "learning_rate": 3.211354309165527e-05, + "loss": 2.2133, + "step": 4184 + }, + { + "epoch": 0.36, + "grad_norm": 1.576973795890808, + "learning_rate": 3.2096443228454175e-05, + "loss": 2.1075, + "step": 4188 + }, + { + "epoch": 0.36, + "grad_norm": 2.532358169555664, + "learning_rate": 3.2079343365253076e-05, + "loss": 2.2588, + "step": 4192 + }, + { + "epoch": 0.36, + "grad_norm": 1.4591597318649292, + "learning_rate": 3.206224350205198e-05, + "loss": 2.2587, + "step": 4196 + }, + { + "epoch": 0.36, + "grad_norm": 1.5652563571929932, + "learning_rate": 3.204514363885089e-05, + "loss": 2.217, + "step": 4200 + }, + { + "epoch": 0.36, + "grad_norm": 1.4030766487121582, + "learning_rate": 3.20280437756498e-05, + "loss": 2.2451, + "step": 4204 + }, + { + "epoch": 0.36, + "grad_norm": 1.5012942552566528, + "learning_rate": 3.2010943912448705e-05, + "loss": 2.2296, + "step": 4208 + }, + { + "epoch": 0.36, + "grad_norm": 1.3982198238372803, + "learning_rate": 3.1993844049247606e-05, + "loss": 2.3254, + "step": 4212 + }, + { + "epoch": 0.36, + "grad_norm": 2.359818458557129, + "learning_rate": 3.197674418604651e-05, + "loss": 2.3709, + "step": 4216 + }, + { + "epoch": 0.36, + "grad_norm": 1.687652349472046, + "learning_rate": 3.1959644322845414e-05, + "loss": 2.2442, + "step": 4220 + }, + { + "epoch": 0.36, + "grad_norm": 1.577440857887268, + "learning_rate": 3.194254445964433e-05, + "loss": 2.1392, + "step": 4224 + }, + { + "epoch": 0.36, + "grad_norm": 1.4704028367996216, + "learning_rate": 3.1925444596443235e-05, + "loss": 2.3586, + "step": 4228 + }, + { + "epoch": 0.36, + "grad_norm": 1.5463335514068604, + "learning_rate": 3.1908344733242136e-05, + "loss": 2.3475, + "step": 4232 + }, + { + "epoch": 0.36, + "grad_norm": 1.4893290996551514, + "learning_rate": 3.189124487004104e-05, + "loss": 2.1804, + "step": 4236 + }, + { + "epoch": 0.36, + "grad_norm": 1.6646398305892944, + "learning_rate": 3.1874145006839944e-05, + "loss": 2.2972, + "step": 4240 + }, + { + "epoch": 0.36, + "grad_norm": 1.4279797077178955, + "learning_rate": 3.185704514363885e-05, + "loss": 2.235, + "step": 4244 + }, + { + "epoch": 0.36, + "grad_norm": 1.5145081281661987, + "learning_rate": 3.183994528043776e-05, + "loss": 2.231, + "step": 4248 + }, + { + "epoch": 0.36, + "grad_norm": 1.4431096315383911, + "learning_rate": 3.1822845417236666e-05, + "loss": 2.1942, + "step": 4252 + }, + { + "epoch": 0.36, + "grad_norm": 1.431936264038086, + "learning_rate": 3.180574555403557e-05, + "loss": 2.2726, + "step": 4256 + }, + { + "epoch": 0.36, + "grad_norm": 1.4116493463516235, + "learning_rate": 3.1788645690834474e-05, + "loss": 2.346, + "step": 4260 + }, + { + "epoch": 0.36, + "grad_norm": 1.480595588684082, + "learning_rate": 3.177154582763338e-05, + "loss": 2.1486, + "step": 4264 + }, + { + "epoch": 0.36, + "grad_norm": 1.4615095853805542, + "learning_rate": 3.175444596443228e-05, + "loss": 2.2935, + "step": 4268 + }, + { + "epoch": 0.37, + "grad_norm": 1.414937973022461, + "learning_rate": 3.173734610123119e-05, + "loss": 2.0618, + "step": 4272 + }, + { + "epoch": 0.37, + "grad_norm": 2.260504961013794, + "learning_rate": 3.1720246238030096e-05, + "loss": 2.2604, + "step": 4276 + }, + { + "epoch": 0.37, + "grad_norm": 1.573109745979309, + "learning_rate": 3.1703146374829003e-05, + "loss": 2.1983, + "step": 4280 + }, + { + "epoch": 0.37, + "grad_norm": 1.4415810108184814, + "learning_rate": 3.168604651162791e-05, + "loss": 2.1983, + "step": 4284 + }, + { + "epoch": 0.37, + "grad_norm": 1.4468250274658203, + "learning_rate": 3.166894664842681e-05, + "loss": 2.3544, + "step": 4288 + }, + { + "epoch": 0.37, + "grad_norm": 1.4121309518814087, + "learning_rate": 3.165184678522572e-05, + "loss": 2.3186, + "step": 4292 + }, + { + "epoch": 0.37, + "grad_norm": 1.3506333827972412, + "learning_rate": 3.1634746922024626e-05, + "loss": 2.2057, + "step": 4296 + }, + { + "epoch": 0.37, + "grad_norm": 1.5312514305114746, + "learning_rate": 3.161764705882353e-05, + "loss": 2.1451, + "step": 4300 + }, + { + "epoch": 0.37, + "grad_norm": 1.8453575372695923, + "learning_rate": 3.160054719562244e-05, + "loss": 2.2856, + "step": 4304 + }, + { + "epoch": 0.37, + "grad_norm": 1.5432989597320557, + "learning_rate": 3.158344733242134e-05, + "loss": 2.2947, + "step": 4308 + }, + { + "epoch": 0.37, + "grad_norm": 1.575966238975525, + "learning_rate": 3.156634746922025e-05, + "loss": 2.1444, + "step": 4312 + }, + { + "epoch": 0.37, + "grad_norm": 1.6484041213989258, + "learning_rate": 3.1549247606019156e-05, + "loss": 2.1952, + "step": 4316 + }, + { + "epoch": 0.37, + "grad_norm": 1.4162933826446533, + "learning_rate": 3.1532147742818057e-05, + "loss": 2.0885, + "step": 4320 + }, + { + "epoch": 0.37, + "grad_norm": 1.570765495300293, + "learning_rate": 3.1515047879616964e-05, + "loss": 2.1415, + "step": 4324 + }, + { + "epoch": 0.37, + "grad_norm": 1.603226661682129, + "learning_rate": 3.149794801641587e-05, + "loss": 2.2982, + "step": 4328 + }, + { + "epoch": 0.37, + "grad_norm": 1.4410016536712646, + "learning_rate": 3.148084815321478e-05, + "loss": 2.2572, + "step": 4332 + }, + { + "epoch": 0.37, + "grad_norm": 1.392094373703003, + "learning_rate": 3.1463748290013686e-05, + "loss": 2.4107, + "step": 4336 + }, + { + "epoch": 0.37, + "grad_norm": 1.4003255367279053, + "learning_rate": 3.1446648426812587e-05, + "loss": 2.2181, + "step": 4340 + }, + { + "epoch": 0.37, + "grad_norm": 3.496429204940796, + "learning_rate": 3.1429548563611494e-05, + "loss": 2.241, + "step": 4344 + }, + { + "epoch": 0.37, + "grad_norm": 1.5706188678741455, + "learning_rate": 3.1412448700410394e-05, + "loss": 2.1635, + "step": 4348 + }, + { + "epoch": 0.37, + "grad_norm": 1.4904406070709229, + "learning_rate": 3.13953488372093e-05, + "loss": 2.2688, + "step": 4352 + }, + { + "epoch": 0.37, + "grad_norm": 1.4973410367965698, + "learning_rate": 3.137824897400821e-05, + "loss": 2.2812, + "step": 4356 + }, + { + "epoch": 0.37, + "grad_norm": 1.6343064308166504, + "learning_rate": 3.1361149110807116e-05, + "loss": 2.3717, + "step": 4360 + }, + { + "epoch": 0.37, + "grad_norm": 1.327519178390503, + "learning_rate": 3.1344049247606024e-05, + "loss": 2.1028, + "step": 4364 + }, + { + "epoch": 0.37, + "grad_norm": 1.46675705909729, + "learning_rate": 3.1326949384404924e-05, + "loss": 2.2575, + "step": 4368 + }, + { + "epoch": 0.37, + "grad_norm": 1.5625230073928833, + "learning_rate": 3.130984952120383e-05, + "loss": 2.1085, + "step": 4372 + }, + { + "epoch": 0.37, + "grad_norm": 1.4606382846832275, + "learning_rate": 3.129274965800274e-05, + "loss": 2.1484, + "step": 4376 + }, + { + "epoch": 0.37, + "grad_norm": 1.3543037176132202, + "learning_rate": 3.127564979480164e-05, + "loss": 2.3504, + "step": 4380 + }, + { + "epoch": 0.37, + "grad_norm": 1.3623141050338745, + "learning_rate": 3.125854993160055e-05, + "loss": 2.1932, + "step": 4384 + }, + { + "epoch": 0.38, + "grad_norm": 1.2342579364776611, + "learning_rate": 3.1241450068399454e-05, + "loss": 2.229, + "step": 4388 + }, + { + "epoch": 0.38, + "grad_norm": 1.5705771446228027, + "learning_rate": 3.122435020519836e-05, + "loss": 2.3358, + "step": 4392 + }, + { + "epoch": 0.38, + "grad_norm": 1.5220746994018555, + "learning_rate": 3.120725034199726e-05, + "loss": 2.0569, + "step": 4396 + }, + { + "epoch": 0.38, + "grad_norm": 1.5559509992599487, + "learning_rate": 3.119015047879617e-05, + "loss": 2.348, + "step": 4400 + }, + { + "epoch": 0.38, + "grad_norm": 1.4289395809173584, + "learning_rate": 3.117305061559508e-05, + "loss": 2.1036, + "step": 4404 + }, + { + "epoch": 0.38, + "grad_norm": 1.69617760181427, + "learning_rate": 3.115595075239398e-05, + "loss": 2.1352, + "step": 4408 + }, + { + "epoch": 0.38, + "grad_norm": 1.6432487964630127, + "learning_rate": 3.113885088919289e-05, + "loss": 2.2481, + "step": 4412 + }, + { + "epoch": 0.38, + "grad_norm": 1.986351728439331, + "learning_rate": 3.112175102599179e-05, + "loss": 2.2431, + "step": 4416 + }, + { + "epoch": 0.38, + "grad_norm": 1.541490912437439, + "learning_rate": 3.11046511627907e-05, + "loss": 2.1826, + "step": 4420 + }, + { + "epoch": 0.38, + "grad_norm": 1.5522269010543823, + "learning_rate": 3.108755129958961e-05, + "loss": 2.1923, + "step": 4424 + }, + { + "epoch": 0.38, + "grad_norm": 1.4916808605194092, + "learning_rate": 3.107045143638851e-05, + "loss": 2.1626, + "step": 4428 + }, + { + "epoch": 0.38, + "grad_norm": 1.3527686595916748, + "learning_rate": 3.1053351573187415e-05, + "loss": 2.0595, + "step": 4432 + }, + { + "epoch": 0.38, + "grad_norm": 1.568322777748108, + "learning_rate": 3.103625170998632e-05, + "loss": 2.2729, + "step": 4436 + }, + { + "epoch": 0.38, + "grad_norm": 1.5389866828918457, + "learning_rate": 3.101915184678523e-05, + "loss": 2.0978, + "step": 4440 + }, + { + "epoch": 0.38, + "grad_norm": 1.4940327405929565, + "learning_rate": 3.100205198358414e-05, + "loss": 2.2146, + "step": 4444 + }, + { + "epoch": 0.38, + "grad_norm": 1.5102323293685913, + "learning_rate": 3.098495212038304e-05, + "loss": 2.1204, + "step": 4448 + }, + { + "epoch": 0.38, + "grad_norm": 1.4519439935684204, + "learning_rate": 3.0967852257181945e-05, + "loss": 2.2592, + "step": 4452 + }, + { + "epoch": 0.38, + "grad_norm": 1.3934937715530396, + "learning_rate": 3.0950752393980845e-05, + "loss": 2.1279, + "step": 4456 + }, + { + "epoch": 0.38, + "grad_norm": 1.3405640125274658, + "learning_rate": 3.093365253077975e-05, + "loss": 2.1896, + "step": 4460 + }, + { + "epoch": 0.38, + "grad_norm": 1.6696523427963257, + "learning_rate": 3.091655266757866e-05, + "loss": 2.0079, + "step": 4464 + }, + { + "epoch": 0.38, + "grad_norm": 1.470809817314148, + "learning_rate": 3.089945280437757e-05, + "loss": 2.2609, + "step": 4468 + }, + { + "epoch": 0.38, + "grad_norm": 1.54929518699646, + "learning_rate": 3.0882352941176475e-05, + "loss": 2.18, + "step": 4472 + }, + { + "epoch": 0.38, + "grad_norm": 2.0508859157562256, + "learning_rate": 3.0865253077975375e-05, + "loss": 2.1995, + "step": 4476 + }, + { + "epoch": 0.38, + "grad_norm": 1.4847677946090698, + "learning_rate": 3.084815321477428e-05, + "loss": 2.1576, + "step": 4480 + }, + { + "epoch": 0.38, + "grad_norm": 1.490350604057312, + "learning_rate": 3.083105335157319e-05, + "loss": 2.188, + "step": 4484 + }, + { + "epoch": 0.38, + "grad_norm": 1.5087822675704956, + "learning_rate": 3.081395348837209e-05, + "loss": 2.2151, + "step": 4488 + }, + { + "epoch": 0.38, + "grad_norm": 1.8956283330917358, + "learning_rate": 3.0796853625171005e-05, + "loss": 2.1073, + "step": 4492 + }, + { + "epoch": 0.38, + "grad_norm": 1.575896978378296, + "learning_rate": 3.0779753761969905e-05, + "loss": 2.2574, + "step": 4496 + }, + { + "epoch": 0.38, + "grad_norm": 1.5014541149139404, + "learning_rate": 3.076265389876881e-05, + "loss": 2.1651, + "step": 4500 + }, + { + "epoch": 0.39, + "grad_norm": 1.8757340908050537, + "learning_rate": 3.074555403556772e-05, + "loss": 2.0871, + "step": 4504 + }, + { + "epoch": 0.39, + "grad_norm": 1.5806230306625366, + "learning_rate": 3.072845417236662e-05, + "loss": 2.1398, + "step": 4508 + }, + { + "epoch": 0.39, + "grad_norm": 1.7795255184173584, + "learning_rate": 3.071135430916553e-05, + "loss": 2.2246, + "step": 4512 + }, + { + "epoch": 0.39, + "grad_norm": 1.6598823070526123, + "learning_rate": 3.0694254445964435e-05, + "loss": 2.2928, + "step": 4516 + }, + { + "epoch": 0.39, + "grad_norm": 1.5600835084915161, + "learning_rate": 3.067715458276334e-05, + "loss": 2.3343, + "step": 4520 + }, + { + "epoch": 0.39, + "grad_norm": 1.5268632173538208, + "learning_rate": 3.066005471956224e-05, + "loss": 2.3551, + "step": 4524 + }, + { + "epoch": 0.39, + "grad_norm": 1.8789829015731812, + "learning_rate": 3.064295485636115e-05, + "loss": 2.3127, + "step": 4528 + }, + { + "epoch": 0.39, + "grad_norm": 1.607021450996399, + "learning_rate": 3.062585499316006e-05, + "loss": 2.142, + "step": 4532 + }, + { + "epoch": 0.39, + "grad_norm": 1.4491835832595825, + "learning_rate": 3.060875512995896e-05, + "loss": 2.2, + "step": 4536 + }, + { + "epoch": 0.39, + "grad_norm": 1.5633890628814697, + "learning_rate": 3.0591655266757866e-05, + "loss": 2.0443, + "step": 4540 + }, + { + "epoch": 0.39, + "grad_norm": 1.6567158699035645, + "learning_rate": 3.057455540355677e-05, + "loss": 2.1271, + "step": 4544 + }, + { + "epoch": 0.39, + "grad_norm": 1.5010050535202026, + "learning_rate": 3.055745554035568e-05, + "loss": 2.2272, + "step": 4548 + }, + { + "epoch": 0.39, + "grad_norm": 1.5739331245422363, + "learning_rate": 3.054035567715459e-05, + "loss": 2.1923, + "step": 4552 + }, + { + "epoch": 0.39, + "grad_norm": 1.4975517988204956, + "learning_rate": 3.052325581395349e-05, + "loss": 2.2189, + "step": 4556 + }, + { + "epoch": 0.39, + "grad_norm": 1.571330189704895, + "learning_rate": 3.0506155950752396e-05, + "loss": 2.1658, + "step": 4560 + }, + { + "epoch": 0.39, + "grad_norm": 1.296584129333496, + "learning_rate": 3.04890560875513e-05, + "loss": 2.1641, + "step": 4564 + }, + { + "epoch": 0.39, + "grad_norm": 1.920922875404358, + "learning_rate": 3.0471956224350207e-05, + "loss": 2.3731, + "step": 4568 + }, + { + "epoch": 0.39, + "grad_norm": 1.5119904279708862, + "learning_rate": 3.0454856361149114e-05, + "loss": 2.0966, + "step": 4572 + }, + { + "epoch": 0.39, + "grad_norm": 1.3779191970825195, + "learning_rate": 3.0437756497948018e-05, + "loss": 2.0729, + "step": 4576 + }, + { + "epoch": 0.39, + "grad_norm": 1.3981715440750122, + "learning_rate": 3.0420656634746926e-05, + "loss": 2.1438, + "step": 4580 + }, + { + "epoch": 0.39, + "grad_norm": 1.4112988710403442, + "learning_rate": 3.0403556771545826e-05, + "loss": 2.3213, + "step": 4584 + }, + { + "epoch": 0.39, + "grad_norm": 1.4584025144577026, + "learning_rate": 3.0386456908344733e-05, + "loss": 2.2532, + "step": 4588 + }, + { + "epoch": 0.39, + "grad_norm": 1.500198245048523, + "learning_rate": 3.0369357045143644e-05, + "loss": 2.029, + "step": 4592 + }, + { + "epoch": 0.39, + "grad_norm": 1.4857838153839111, + "learning_rate": 3.0352257181942545e-05, + "loss": 2.0186, + "step": 4596 + }, + { + "epoch": 0.39, + "grad_norm": 1.396033525466919, + "learning_rate": 3.0335157318741452e-05, + "loss": 2.2293, + "step": 4600 + }, + { + "epoch": 0.39, + "grad_norm": 1.5105597972869873, + "learning_rate": 3.0318057455540356e-05, + "loss": 2.1206, + "step": 4604 + }, + { + "epoch": 0.39, + "grad_norm": 1.3721805810928345, + "learning_rate": 3.0300957592339263e-05, + "loss": 2.1646, + "step": 4608 + }, + { + "epoch": 0.39, + "grad_norm": 1.4651081562042236, + "learning_rate": 3.028385772913817e-05, + "loss": 2.2184, + "step": 4612 + }, + { + "epoch": 0.39, + "grad_norm": 1.468990445137024, + "learning_rate": 3.0266757865937075e-05, + "loss": 2.1083, + "step": 4616 + }, + { + "epoch": 0.4, + "grad_norm": 2.047722578048706, + "learning_rate": 3.0249658002735982e-05, + "loss": 2.1685, + "step": 4620 + }, + { + "epoch": 0.4, + "grad_norm": 1.5102801322937012, + "learning_rate": 3.0232558139534883e-05, + "loss": 2.2332, + "step": 4624 + }, + { + "epoch": 0.4, + "grad_norm": 1.3657864332199097, + "learning_rate": 3.021545827633379e-05, + "loss": 2.3045, + "step": 4628 + }, + { + "epoch": 0.4, + "grad_norm": 1.468764305114746, + "learning_rate": 3.01983584131327e-05, + "loss": 2.2094, + "step": 4632 + }, + { + "epoch": 0.4, + "grad_norm": 1.451079249382019, + "learning_rate": 3.01812585499316e-05, + "loss": 2.0958, + "step": 4636 + }, + { + "epoch": 0.4, + "grad_norm": 1.5094703435897827, + "learning_rate": 3.016415868673051e-05, + "loss": 2.186, + "step": 4640 + }, + { + "epoch": 0.4, + "grad_norm": 1.5918653011322021, + "learning_rate": 3.0147058823529413e-05, + "loss": 2.2616, + "step": 4644 + }, + { + "epoch": 0.4, + "grad_norm": 1.5116558074951172, + "learning_rate": 3.012995896032832e-05, + "loss": 2.2503, + "step": 4648 + }, + { + "epoch": 0.4, + "grad_norm": 1.4294779300689697, + "learning_rate": 3.011285909712722e-05, + "loss": 2.3022, + "step": 4652 + }, + { + "epoch": 0.4, + "grad_norm": 2.080925464630127, + "learning_rate": 3.009575923392613e-05, + "loss": 2.0627, + "step": 4656 + }, + { + "epoch": 0.4, + "grad_norm": 1.4744940996170044, + "learning_rate": 3.007865937072504e-05, + "loss": 2.1063, + "step": 4660 + }, + { + "epoch": 0.4, + "grad_norm": 1.6344597339630127, + "learning_rate": 3.006155950752394e-05, + "loss": 2.1501, + "step": 4664 + }, + { + "epoch": 0.4, + "grad_norm": 1.5882827043533325, + "learning_rate": 3.0044459644322846e-05, + "loss": 2.1208, + "step": 4668 + }, + { + "epoch": 0.4, + "grad_norm": 1.4376238584518433, + "learning_rate": 3.002735978112175e-05, + "loss": 2.0962, + "step": 4672 + }, + { + "epoch": 0.4, + "grad_norm": 1.6142209768295288, + "learning_rate": 3.0010259917920658e-05, + "loss": 2.0721, + "step": 4676 + }, + { + "epoch": 0.4, + "grad_norm": 1.4628876447677612, + "learning_rate": 2.9993160054719565e-05, + "loss": 1.959, + "step": 4680 + }, + { + "epoch": 0.4, + "grad_norm": 1.3108237981796265, + "learning_rate": 2.997606019151847e-05, + "loss": 2.1634, + "step": 4684 + }, + { + "epoch": 0.4, + "grad_norm": 1.4312976598739624, + "learning_rate": 2.9958960328317376e-05, + "loss": 2.1643, + "step": 4688 + }, + { + "epoch": 0.4, + "grad_norm": 1.449655294418335, + "learning_rate": 2.9941860465116277e-05, + "loss": 2.2098, + "step": 4692 + }, + { + "epoch": 0.4, + "grad_norm": 1.5844769477844238, + "learning_rate": 2.9924760601915188e-05, + "loss": 2.1497, + "step": 4696 + }, + { + "epoch": 0.4, + "grad_norm": 1.4893046617507935, + "learning_rate": 2.9907660738714095e-05, + "loss": 2.2939, + "step": 4700 + }, + { + "epoch": 0.4, + "grad_norm": 1.3878313302993774, + "learning_rate": 2.9890560875512996e-05, + "loss": 2.1361, + "step": 4704 + }, + { + "epoch": 0.4, + "grad_norm": 1.9359514713287354, + "learning_rate": 2.9873461012311903e-05, + "loss": 2.178, + "step": 4708 + }, + { + "epoch": 0.4, + "grad_norm": 1.581032633781433, + "learning_rate": 2.9856361149110807e-05, + "loss": 2.0514, + "step": 4712 + }, + { + "epoch": 0.4, + "grad_norm": 1.3860806226730347, + "learning_rate": 2.9839261285909714e-05, + "loss": 2.2888, + "step": 4716 + }, + { + "epoch": 0.4, + "grad_norm": 1.4699238538742065, + "learning_rate": 2.982216142270862e-05, + "loss": 2.167, + "step": 4720 + }, + { + "epoch": 0.4, + "grad_norm": 1.459145188331604, + "learning_rate": 2.9805061559507526e-05, + "loss": 2.1719, + "step": 4724 + }, + { + "epoch": 0.4, + "grad_norm": 1.7492071390151978, + "learning_rate": 2.9787961696306433e-05, + "loss": 2.314, + "step": 4728 + }, + { + "epoch": 0.4, + "grad_norm": 1.9393258094787598, + "learning_rate": 2.9770861833105333e-05, + "loss": 2.3491, + "step": 4732 + }, + { + "epoch": 0.4, + "grad_norm": 1.8979884386062622, + "learning_rate": 2.9753761969904244e-05, + "loss": 2.1962, + "step": 4736 + }, + { + "epoch": 0.41, + "grad_norm": 1.5597001314163208, + "learning_rate": 2.973666210670315e-05, + "loss": 2.1819, + "step": 4740 + }, + { + "epoch": 0.41, + "grad_norm": 1.6771557331085205, + "learning_rate": 2.9719562243502052e-05, + "loss": 2.1407, + "step": 4744 + }, + { + "epoch": 0.41, + "grad_norm": 1.5106451511383057, + "learning_rate": 2.970246238030096e-05, + "loss": 2.1655, + "step": 4748 + }, + { + "epoch": 0.41, + "grad_norm": 1.531887173652649, + "learning_rate": 2.9685362517099863e-05, + "loss": 2.1351, + "step": 4752 + }, + { + "epoch": 0.41, + "grad_norm": 1.3597395420074463, + "learning_rate": 2.966826265389877e-05, + "loss": 2.0589, + "step": 4756 + }, + { + "epoch": 0.41, + "grad_norm": 1.761173129081726, + "learning_rate": 2.9651162790697678e-05, + "loss": 2.0529, + "step": 4760 + }, + { + "epoch": 0.41, + "grad_norm": 1.44886314868927, + "learning_rate": 2.9634062927496582e-05, + "loss": 1.9243, + "step": 4764 + }, + { + "epoch": 0.41, + "grad_norm": 1.516022801399231, + "learning_rate": 2.961696306429549e-05, + "loss": 2.0999, + "step": 4768 + }, + { + "epoch": 0.41, + "grad_norm": 1.3773995637893677, + "learning_rate": 2.959986320109439e-05, + "loss": 2.2363, + "step": 4772 + }, + { + "epoch": 0.41, + "grad_norm": 1.5385714769363403, + "learning_rate": 2.9582763337893297e-05, + "loss": 2.1583, + "step": 4776 + }, + { + "epoch": 0.41, + "grad_norm": 1.746711015701294, + "learning_rate": 2.95656634746922e-05, + "loss": 2.1572, + "step": 4780 + }, + { + "epoch": 0.41, + "grad_norm": 1.6620087623596191, + "learning_rate": 2.954856361149111e-05, + "loss": 2.102, + "step": 4784 + }, + { + "epoch": 0.41, + "grad_norm": 1.4622972011566162, + "learning_rate": 2.9531463748290016e-05, + "loss": 2.2325, + "step": 4788 + }, + { + "epoch": 0.41, + "grad_norm": 1.3566945791244507, + "learning_rate": 2.951436388508892e-05, + "loss": 2.2317, + "step": 4792 + }, + { + "epoch": 0.41, + "grad_norm": 1.4192919731140137, + "learning_rate": 2.9497264021887827e-05, + "loss": 2.1342, + "step": 4796 + }, + { + "epoch": 0.41, + "grad_norm": 1.7992663383483887, + "learning_rate": 2.9480164158686728e-05, + "loss": 1.9621, + "step": 4800 + }, + { + "epoch": 0.41, + "grad_norm": 1.591426968574524, + "learning_rate": 2.946306429548564e-05, + "loss": 2.0418, + "step": 4804 + }, + { + "epoch": 0.41, + "grad_norm": 1.5884109735488892, + "learning_rate": 2.9445964432284546e-05, + "loss": 2.2499, + "step": 4808 + }, + { + "epoch": 0.41, + "grad_norm": 1.4876173734664917, + "learning_rate": 2.9428864569083446e-05, + "loss": 2.172, + "step": 4812 + }, + { + "epoch": 0.41, + "grad_norm": 1.9083555936813354, + "learning_rate": 2.9411764705882354e-05, + "loss": 2.1113, + "step": 4816 + }, + { + "epoch": 0.41, + "grad_norm": 1.6316877603530884, + "learning_rate": 2.9394664842681258e-05, + "loss": 2.2434, + "step": 4820 + }, + { + "epoch": 0.41, + "grad_norm": 1.578415036201477, + "learning_rate": 2.9377564979480165e-05, + "loss": 2.1634, + "step": 4824 + }, + { + "epoch": 0.41, + "grad_norm": 1.7644866704940796, + "learning_rate": 2.9360465116279072e-05, + "loss": 2.0934, + "step": 4828 + }, + { + "epoch": 0.41, + "grad_norm": 1.3768229484558105, + "learning_rate": 2.9343365253077976e-05, + "loss": 1.9734, + "step": 4832 + }, + { + "epoch": 0.41, + "grad_norm": 1.6598727703094482, + "learning_rate": 2.9326265389876884e-05, + "loss": 2.2622, + "step": 4836 + }, + { + "epoch": 0.41, + "grad_norm": 1.346545934677124, + "learning_rate": 2.9309165526675784e-05, + "loss": 2.1191, + "step": 4840 + }, + { + "epoch": 0.41, + "grad_norm": 1.5258831977844238, + "learning_rate": 2.9292065663474695e-05, + "loss": 1.9744, + "step": 4844 + }, + { + "epoch": 0.41, + "grad_norm": 1.573669195175171, + "learning_rate": 2.9274965800273602e-05, + "loss": 2.1529, + "step": 4848 + }, + { + "epoch": 0.41, + "grad_norm": 1.3979613780975342, + "learning_rate": 2.9257865937072503e-05, + "loss": 2.1268, + "step": 4852 + }, + { + "epoch": 0.42, + "grad_norm": 1.5907671451568604, + "learning_rate": 2.924076607387141e-05, + "loss": 2.0754, + "step": 4856 + }, + { + "epoch": 0.42, + "grad_norm": 1.4485595226287842, + "learning_rate": 2.9223666210670314e-05, + "loss": 2.2218, + "step": 4860 + }, + { + "epoch": 0.42, + "grad_norm": 1.4896790981292725, + "learning_rate": 2.920656634746922e-05, + "loss": 2.2253, + "step": 4864 + }, + { + "epoch": 0.42, + "grad_norm": 1.5012835264205933, + "learning_rate": 2.918946648426813e-05, + "loss": 2.3698, + "step": 4868 + }, + { + "epoch": 0.42, + "grad_norm": 1.4697033166885376, + "learning_rate": 2.9172366621067033e-05, + "loss": 2.2947, + "step": 4872 + }, + { + "epoch": 0.42, + "grad_norm": 1.693452000617981, + "learning_rate": 2.915526675786594e-05, + "loss": 2.1249, + "step": 4876 + }, + { + "epoch": 0.42, + "grad_norm": 1.5979125499725342, + "learning_rate": 2.913816689466484e-05, + "loss": 2.3482, + "step": 4880 + }, + { + "epoch": 0.42, + "grad_norm": 1.587105631828308, + "learning_rate": 2.912106703146375e-05, + "loss": 2.1287, + "step": 4884 + }, + { + "epoch": 0.42, + "grad_norm": 1.5790472030639648, + "learning_rate": 2.910396716826266e-05, + "loss": 2.1099, + "step": 4888 + }, + { + "epoch": 0.42, + "grad_norm": 1.4962116479873657, + "learning_rate": 2.908686730506156e-05, + "loss": 2.1216, + "step": 4892 + }, + { + "epoch": 0.42, + "grad_norm": 1.657333254814148, + "learning_rate": 2.9069767441860467e-05, + "loss": 2.1813, + "step": 4896 + }, + { + "epoch": 0.42, + "grad_norm": 1.520858645439148, + "learning_rate": 2.905266757865937e-05, + "loss": 2.1585, + "step": 4900 + }, + { + "epoch": 0.42, + "grad_norm": 1.4019360542297363, + "learning_rate": 2.9035567715458278e-05, + "loss": 2.1757, + "step": 4904 + }, + { + "epoch": 0.42, + "grad_norm": 1.4157730340957642, + "learning_rate": 2.9018467852257182e-05, + "loss": 2.313, + "step": 4908 + }, + { + "epoch": 0.42, + "grad_norm": 1.4938081502914429, + "learning_rate": 2.900136798905609e-05, + "loss": 2.2372, + "step": 4912 + }, + { + "epoch": 0.42, + "grad_norm": 1.5117088556289673, + "learning_rate": 2.8984268125854997e-05, + "loss": 2.3339, + "step": 4916 + }, + { + "epoch": 0.42, + "grad_norm": 1.4263554811477661, + "learning_rate": 2.8967168262653897e-05, + "loss": 2.1498, + "step": 4920 + }, + { + "epoch": 0.42, + "grad_norm": 1.3601114749908447, + "learning_rate": 2.8950068399452808e-05, + "loss": 2.1967, + "step": 4924 + }, + { + "epoch": 0.42, + "grad_norm": 1.5118712186813354, + "learning_rate": 2.893296853625171e-05, + "loss": 2.2998, + "step": 4928 + }, + { + "epoch": 0.42, + "grad_norm": 1.63152015209198, + "learning_rate": 2.8915868673050616e-05, + "loss": 2.085, + "step": 4932 + }, + { + "epoch": 0.42, + "grad_norm": 1.4992414712905884, + "learning_rate": 2.8898768809849523e-05, + "loss": 2.2337, + "step": 4936 + }, + { + "epoch": 0.42, + "grad_norm": 1.4842474460601807, + "learning_rate": 2.8881668946648427e-05, + "loss": 2.2246, + "step": 4940 + }, + { + "epoch": 0.42, + "grad_norm": 2.064748525619507, + "learning_rate": 2.8864569083447335e-05, + "loss": 2.2128, + "step": 4944 + }, + { + "epoch": 0.42, + "grad_norm": 1.7368000745773315, + "learning_rate": 2.884746922024624e-05, + "loss": 2.1803, + "step": 4948 + }, + { + "epoch": 0.42, + "grad_norm": 1.6576658487319946, + "learning_rate": 2.8830369357045146e-05, + "loss": 2.0681, + "step": 4952 + }, + { + "epoch": 0.42, + "grad_norm": 1.886851191520691, + "learning_rate": 2.8813269493844053e-05, + "loss": 2.3038, + "step": 4956 + }, + { + "epoch": 0.42, + "grad_norm": 1.6933567523956299, + "learning_rate": 2.8796169630642954e-05, + "loss": 2.1189, + "step": 4960 + }, + { + "epoch": 0.42, + "grad_norm": 1.5234380960464478, + "learning_rate": 2.8779069767441864e-05, + "loss": 2.239, + "step": 4964 + }, + { + "epoch": 0.42, + "grad_norm": 1.4364981651306152, + "learning_rate": 2.8761969904240765e-05, + "loss": 2.1525, + "step": 4968 + }, + { + "epoch": 0.43, + "grad_norm": 1.5221827030181885, + "learning_rate": 2.8744870041039672e-05, + "loss": 2.0648, + "step": 4972 + }, + { + "epoch": 0.43, + "grad_norm": 1.4764840602874756, + "learning_rate": 2.872777017783858e-05, + "loss": 2.1009, + "step": 4976 + }, + { + "epoch": 0.43, + "grad_norm": 1.361648440361023, + "learning_rate": 2.8710670314637484e-05, + "loss": 2.0012, + "step": 4980 + }, + { + "epoch": 0.43, + "grad_norm": 1.4987961053848267, + "learning_rate": 2.869357045143639e-05, + "loss": 2.3403, + "step": 4984 + }, + { + "epoch": 0.43, + "grad_norm": 1.422187328338623, + "learning_rate": 2.8676470588235295e-05, + "loss": 2.1681, + "step": 4988 + }, + { + "epoch": 0.43, + "grad_norm": 1.5344418287277222, + "learning_rate": 2.8659370725034202e-05, + "loss": 2.1772, + "step": 4992 + }, + { + "epoch": 0.43, + "grad_norm": 1.5092509984970093, + "learning_rate": 2.864227086183311e-05, + "loss": 2.207, + "step": 4996 + }, + { + "epoch": 0.43, + "grad_norm": 1.6343382596969604, + "learning_rate": 2.862517099863201e-05, + "loss": 2.1994, + "step": 5000 + }, + { + "epoch": 0.43, + "grad_norm": 1.447045087814331, + "learning_rate": 2.860807113543092e-05, + "loss": 2.1439, + "step": 5004 + }, + { + "epoch": 0.43, + "grad_norm": 1.4870860576629639, + "learning_rate": 2.859097127222982e-05, + "loss": 2.1185, + "step": 5008 + }, + { + "epoch": 0.43, + "grad_norm": 1.538109302520752, + "learning_rate": 2.857387140902873e-05, + "loss": 2.1966, + "step": 5012 + }, + { + "epoch": 0.43, + "grad_norm": 1.5985993146896362, + "learning_rate": 2.8556771545827636e-05, + "loss": 2.1939, + "step": 5016 + }, + { + "epoch": 0.43, + "grad_norm": 1.562982439994812, + "learning_rate": 2.853967168262654e-05, + "loss": 2.1722, + "step": 5020 + }, + { + "epoch": 0.43, + "grad_norm": 1.4697779417037964, + "learning_rate": 2.8522571819425448e-05, + "loss": 2.1089, + "step": 5024 + }, + { + "epoch": 0.43, + "grad_norm": 1.3964099884033203, + "learning_rate": 2.850547195622435e-05, + "loss": 2.1349, + "step": 5028 + }, + { + "epoch": 0.43, + "grad_norm": 1.446781039237976, + "learning_rate": 2.848837209302326e-05, + "loss": 2.0189, + "step": 5032 + }, + { + "epoch": 0.43, + "grad_norm": 1.305580735206604, + "learning_rate": 2.847127222982216e-05, + "loss": 2.1448, + "step": 5036 + }, + { + "epoch": 0.43, + "grad_norm": 1.5852243900299072, + "learning_rate": 2.8454172366621067e-05, + "loss": 2.0051, + "step": 5040 + }, + { + "epoch": 0.43, + "grad_norm": 1.6528069972991943, + "learning_rate": 2.8437072503419977e-05, + "loss": 2.0865, + "step": 5044 + }, + { + "epoch": 0.43, + "grad_norm": 1.5359212160110474, + "learning_rate": 2.8419972640218878e-05, + "loss": 2.301, + "step": 5048 + }, + { + "epoch": 0.43, + "grad_norm": 1.2999087572097778, + "learning_rate": 2.8402872777017785e-05, + "loss": 2.074, + "step": 5052 + }, + { + "epoch": 0.43, + "grad_norm": 1.6305381059646606, + "learning_rate": 2.838577291381669e-05, + "loss": 2.1127, + "step": 5056 + }, + { + "epoch": 0.43, + "grad_norm": 1.4597679376602173, + "learning_rate": 2.8368673050615597e-05, + "loss": 2.0304, + "step": 5060 + }, + { + "epoch": 0.43, + "grad_norm": 1.6588799953460693, + "learning_rate": 2.8351573187414504e-05, + "loss": 2.2835, + "step": 5064 + }, + { + "epoch": 0.43, + "grad_norm": 1.5638916492462158, + "learning_rate": 2.8334473324213408e-05, + "loss": 2.0453, + "step": 5068 + }, + { + "epoch": 0.43, + "grad_norm": 1.3972184658050537, + "learning_rate": 2.8317373461012315e-05, + "loss": 2.0929, + "step": 5072 + }, + { + "epoch": 0.43, + "grad_norm": 1.448038935661316, + "learning_rate": 2.8300273597811216e-05, + "loss": 2.2133, + "step": 5076 + }, + { + "epoch": 0.43, + "grad_norm": 1.492927074432373, + "learning_rate": 2.8283173734610123e-05, + "loss": 2.0079, + "step": 5080 + }, + { + "epoch": 0.43, + "grad_norm": 1.555188775062561, + "learning_rate": 2.8266073871409034e-05, + "loss": 2.1773, + "step": 5084 + }, + { + "epoch": 0.44, + "grad_norm": 1.4068621397018433, + "learning_rate": 2.8248974008207935e-05, + "loss": 2.288, + "step": 5088 + }, + { + "epoch": 0.44, + "grad_norm": 1.592962622642517, + "learning_rate": 2.8231874145006842e-05, + "loss": 2.2457, + "step": 5092 + }, + { + "epoch": 0.44, + "grad_norm": 1.6329854726791382, + "learning_rate": 2.8214774281805746e-05, + "loss": 2.1736, + "step": 5096 + }, + { + "epoch": 0.44, + "grad_norm": 1.5315256118774414, + "learning_rate": 2.8197674418604653e-05, + "loss": 2.0541, + "step": 5100 + }, + { + "epoch": 0.44, + "grad_norm": 1.5884466171264648, + "learning_rate": 2.818057455540356e-05, + "loss": 2.0702, + "step": 5104 + }, + { + "epoch": 0.44, + "grad_norm": 1.4373297691345215, + "learning_rate": 2.8163474692202464e-05, + "loss": 2.0738, + "step": 5108 + }, + { + "epoch": 0.44, + "grad_norm": 1.7089366912841797, + "learning_rate": 2.8146374829001372e-05, + "loss": 2.2945, + "step": 5112 + }, + { + "epoch": 0.44, + "grad_norm": 1.4914695024490356, + "learning_rate": 2.8129274965800272e-05, + "loss": 2.0974, + "step": 5116 + }, + { + "epoch": 0.44, + "grad_norm": 1.7952321767807007, + "learning_rate": 2.811217510259918e-05, + "loss": 2.0721, + "step": 5120 + }, + { + "epoch": 0.44, + "grad_norm": 1.4391483068466187, + "learning_rate": 2.8095075239398087e-05, + "loss": 2.1753, + "step": 5124 + }, + { + "epoch": 0.44, + "grad_norm": 1.6410449743270874, + "learning_rate": 2.807797537619699e-05, + "loss": 2.0375, + "step": 5128 + }, + { + "epoch": 0.44, + "grad_norm": 1.5932730436325073, + "learning_rate": 2.80608755129959e-05, + "loss": 2.0907, + "step": 5132 + }, + { + "epoch": 0.44, + "grad_norm": 1.5292744636535645, + "learning_rate": 2.8043775649794802e-05, + "loss": 2.266, + "step": 5136 + }, + { + "epoch": 0.44, + "grad_norm": 1.472765326499939, + "learning_rate": 2.802667578659371e-05, + "loss": 2.075, + "step": 5140 + }, + { + "epoch": 0.44, + "grad_norm": 1.5142451524734497, + "learning_rate": 2.8009575923392617e-05, + "loss": 2.0953, + "step": 5144 + }, + { + "epoch": 0.44, + "grad_norm": 1.8267227411270142, + "learning_rate": 2.7992476060191518e-05, + "loss": 2.183, + "step": 5148 + }, + { + "epoch": 0.44, + "grad_norm": 1.5693491697311401, + "learning_rate": 2.797537619699043e-05, + "loss": 2.1086, + "step": 5152 + }, + { + "epoch": 0.44, + "grad_norm": 1.3335392475128174, + "learning_rate": 2.795827633378933e-05, + "loss": 2.0814, + "step": 5156 + }, + { + "epoch": 0.44, + "grad_norm": 1.5873955488204956, + "learning_rate": 2.7941176470588236e-05, + "loss": 2.132, + "step": 5160 + }, + { + "epoch": 0.44, + "grad_norm": 1.5618573427200317, + "learning_rate": 2.792407660738714e-05, + "loss": 2.1605, + "step": 5164 + }, + { + "epoch": 0.44, + "grad_norm": 1.355206847190857, + "learning_rate": 2.7906976744186048e-05, + "loss": 2.0064, + "step": 5168 + }, + { + "epoch": 0.44, + "grad_norm": 1.4535104036331177, + "learning_rate": 2.7889876880984955e-05, + "loss": 2.092, + "step": 5172 + }, + { + "epoch": 0.44, + "grad_norm": 1.477333426475525, + "learning_rate": 2.787277701778386e-05, + "loss": 2.0415, + "step": 5176 + }, + { + "epoch": 0.44, + "grad_norm": 1.4349262714385986, + "learning_rate": 2.7855677154582766e-05, + "loss": 2.1695, + "step": 5180 + }, + { + "epoch": 0.44, + "grad_norm": 1.4431016445159912, + "learning_rate": 2.7838577291381667e-05, + "loss": 2.1246, + "step": 5184 + }, + { + "epoch": 0.44, + "grad_norm": 1.419053554534912, + "learning_rate": 2.7821477428180574e-05, + "loss": 2.1279, + "step": 5188 + }, + { + "epoch": 0.44, + "grad_norm": 1.5947861671447754, + "learning_rate": 2.7804377564979485e-05, + "loss": 2.1138, + "step": 5192 + }, + { + "epoch": 0.44, + "grad_norm": 1.8315672874450684, + "learning_rate": 2.7787277701778385e-05, + "loss": 2.172, + "step": 5196 + }, + { + "epoch": 0.44, + "grad_norm": 1.532003402709961, + "learning_rate": 2.7770177838577293e-05, + "loss": 2.0775, + "step": 5200 + }, + { + "epoch": 0.44, + "grad_norm": 1.6215981245040894, + "learning_rate": 2.7753077975376197e-05, + "loss": 2.0522, + "step": 5204 + }, + { + "epoch": 0.45, + "grad_norm": 1.8606538772583008, + "learning_rate": 2.7735978112175104e-05, + "loss": 2.0371, + "step": 5208 + }, + { + "epoch": 0.45, + "grad_norm": 1.9666029214859009, + "learning_rate": 2.771887824897401e-05, + "loss": 2.256, + "step": 5212 + }, + { + "epoch": 0.45, + "grad_norm": 1.5295203924179077, + "learning_rate": 2.7701778385772915e-05, + "loss": 2.0297, + "step": 5216 + }, + { + "epoch": 0.45, + "grad_norm": 1.395086407661438, + "learning_rate": 2.7684678522571823e-05, + "loss": 2.1393, + "step": 5220 + }, + { + "epoch": 0.45, + "grad_norm": 1.396425485610962, + "learning_rate": 2.7667578659370723e-05, + "loss": 1.9676, + "step": 5224 + }, + { + "epoch": 0.45, + "grad_norm": 1.5650209188461304, + "learning_rate": 2.765047879616963e-05, + "loss": 2.1365, + "step": 5228 + }, + { + "epoch": 0.45, + "grad_norm": 1.447646975517273, + "learning_rate": 2.763337893296854e-05, + "loss": 1.9595, + "step": 5232 + }, + { + "epoch": 0.45, + "grad_norm": 1.438038945198059, + "learning_rate": 2.7616279069767442e-05, + "loss": 1.9664, + "step": 5236 + }, + { + "epoch": 0.45, + "grad_norm": 1.4711318016052246, + "learning_rate": 2.759917920656635e-05, + "loss": 2.0887, + "step": 5240 + }, + { + "epoch": 0.45, + "grad_norm": 1.4893648624420166, + "learning_rate": 2.7582079343365253e-05, + "loss": 1.9264, + "step": 5244 + }, + { + "epoch": 0.45, + "grad_norm": 1.579157829284668, + "learning_rate": 2.756497948016416e-05, + "loss": 2.126, + "step": 5248 + }, + { + "epoch": 0.45, + "grad_norm": 1.480891466140747, + "learning_rate": 2.7547879616963068e-05, + "loss": 2.2045, + "step": 5252 + }, + { + "epoch": 0.45, + "grad_norm": 1.8084710836410522, + "learning_rate": 2.7530779753761972e-05, + "loss": 2.2102, + "step": 5256 + }, + { + "epoch": 0.45, + "grad_norm": 1.4894654750823975, + "learning_rate": 2.751367989056088e-05, + "loss": 2.1216, + "step": 5260 + }, + { + "epoch": 0.45, + "grad_norm": 1.478981852531433, + "learning_rate": 2.749658002735978e-05, + "loss": 2.0361, + "step": 5264 + }, + { + "epoch": 0.45, + "grad_norm": 1.5740448236465454, + "learning_rate": 2.7479480164158687e-05, + "loss": 1.9947, + "step": 5268 + }, + { + "epoch": 0.45, + "grad_norm": 1.4918889999389648, + "learning_rate": 2.7462380300957598e-05, + "loss": 2.068, + "step": 5272 + }, + { + "epoch": 0.45, + "grad_norm": 1.586724877357483, + "learning_rate": 2.74452804377565e-05, + "loss": 1.9932, + "step": 5276 + }, + { + "epoch": 0.45, + "grad_norm": 1.7443528175354004, + "learning_rate": 2.7428180574555406e-05, + "loss": 2.2906, + "step": 5280 + }, + { + "epoch": 0.45, + "grad_norm": 1.630373239517212, + "learning_rate": 2.741108071135431e-05, + "loss": 2.161, + "step": 5284 + }, + { + "epoch": 0.45, + "grad_norm": 1.3437303304672241, + "learning_rate": 2.7393980848153217e-05, + "loss": 1.9711, + "step": 5288 + }, + { + "epoch": 0.45, + "grad_norm": 1.6629517078399658, + "learning_rate": 2.7376880984952118e-05, + "loss": 2.1511, + "step": 5292 + }, + { + "epoch": 0.45, + "grad_norm": 1.507279872894287, + "learning_rate": 2.735978112175103e-05, + "loss": 2.0911, + "step": 5296 + }, + { + "epoch": 0.45, + "grad_norm": 1.5633537769317627, + "learning_rate": 2.7342681258549936e-05, + "loss": 2.0376, + "step": 5300 + }, + { + "epoch": 0.45, + "grad_norm": 1.524722933769226, + "learning_rate": 2.7325581395348836e-05, + "loss": 2.131, + "step": 5304 + }, + { + "epoch": 0.45, + "grad_norm": 1.4950675964355469, + "learning_rate": 2.7308481532147744e-05, + "loss": 2.1594, + "step": 5308 + }, + { + "epoch": 0.45, + "grad_norm": 1.6241108179092407, + "learning_rate": 2.7291381668946648e-05, + "loss": 2.1674, + "step": 5312 + }, + { + "epoch": 0.45, + "grad_norm": 1.6131986379623413, + "learning_rate": 2.7274281805745555e-05, + "loss": 2.0742, + "step": 5316 + }, + { + "epoch": 0.45, + "grad_norm": 1.5340795516967773, + "learning_rate": 2.7257181942544462e-05, + "loss": 2.0877, + "step": 5320 + }, + { + "epoch": 0.46, + "grad_norm": 1.586997389793396, + "learning_rate": 2.7240082079343366e-05, + "loss": 2.2434, + "step": 5324 + }, + { + "epoch": 0.46, + "grad_norm": 1.51520836353302, + "learning_rate": 2.7222982216142274e-05, + "loss": 2.0303, + "step": 5328 + }, + { + "epoch": 0.46, + "grad_norm": 1.6436048746109009, + "learning_rate": 2.7205882352941174e-05, + "loss": 2.0539, + "step": 5332 + }, + { + "epoch": 0.46, + "grad_norm": 1.3098795413970947, + "learning_rate": 2.7188782489740085e-05, + "loss": 2.2811, + "step": 5336 + }, + { + "epoch": 0.46, + "grad_norm": 1.6447290182113647, + "learning_rate": 2.7171682626538992e-05, + "loss": 2.1018, + "step": 5340 + }, + { + "epoch": 0.46, + "grad_norm": 1.8019658327102661, + "learning_rate": 2.7154582763337893e-05, + "loss": 2.0778, + "step": 5344 + }, + { + "epoch": 0.46, + "grad_norm": 1.7310752868652344, + "learning_rate": 2.71374829001368e-05, + "loss": 2.1467, + "step": 5348 + }, + { + "epoch": 0.46, + "grad_norm": 1.4999805688858032, + "learning_rate": 2.7120383036935704e-05, + "loss": 2.134, + "step": 5352 + }, + { + "epoch": 0.46, + "grad_norm": 1.4711189270019531, + "learning_rate": 2.710328317373461e-05, + "loss": 2.1542, + "step": 5356 + }, + { + "epoch": 0.46, + "grad_norm": 1.477725863456726, + "learning_rate": 2.708618331053352e-05, + "loss": 2.0225, + "step": 5360 + }, + { + "epoch": 0.46, + "grad_norm": 1.6154271364212036, + "learning_rate": 2.7069083447332423e-05, + "loss": 2.1603, + "step": 5364 + }, + { + "epoch": 0.46, + "grad_norm": 1.6522784233093262, + "learning_rate": 2.705198358413133e-05, + "loss": 2.2915, + "step": 5368 + }, + { + "epoch": 0.46, + "grad_norm": 1.6237088441848755, + "learning_rate": 2.703488372093023e-05, + "loss": 1.9629, + "step": 5372 + }, + { + "epoch": 0.46, + "grad_norm": 1.8103796243667603, + "learning_rate": 2.701778385772914e-05, + "loss": 2.0765, + "step": 5376 + }, + { + "epoch": 0.46, + "grad_norm": 1.4163451194763184, + "learning_rate": 2.700068399452805e-05, + "loss": 2.0539, + "step": 5380 + }, + { + "epoch": 0.46, + "grad_norm": 2.560591697692871, + "learning_rate": 2.698358413132695e-05, + "loss": 2.1445, + "step": 5384 + }, + { + "epoch": 0.46, + "grad_norm": 1.5179965496063232, + "learning_rate": 2.6966484268125857e-05, + "loss": 2.1425, + "step": 5388 + }, + { + "epoch": 0.46, + "grad_norm": 1.5749553442001343, + "learning_rate": 2.694938440492476e-05, + "loss": 1.8312, + "step": 5392 + }, + { + "epoch": 0.46, + "grad_norm": 1.3497620820999146, + "learning_rate": 2.6932284541723668e-05, + "loss": 1.9427, + "step": 5396 + }, + { + "epoch": 0.46, + "grad_norm": 1.3808335065841675, + "learning_rate": 2.6915184678522575e-05, + "loss": 2.2364, + "step": 5400 + }, + { + "epoch": 0.46, + "grad_norm": 1.8348784446716309, + "learning_rate": 2.689808481532148e-05, + "loss": 1.9962, + "step": 5404 + }, + { + "epoch": 0.46, + "grad_norm": 1.3197436332702637, + "learning_rate": 2.6880984952120387e-05, + "loss": 2.1261, + "step": 5408 + }, + { + "epoch": 0.46, + "grad_norm": 1.622539758682251, + "learning_rate": 2.6863885088919287e-05, + "loss": 2.2002, + "step": 5412 + }, + { + "epoch": 0.46, + "grad_norm": 1.4938942193984985, + "learning_rate": 2.6846785225718198e-05, + "loss": 2.0079, + "step": 5416 + }, + { + "epoch": 0.46, + "grad_norm": 1.4376903772354126, + "learning_rate": 2.68296853625171e-05, + "loss": 1.9435, + "step": 5420 + }, + { + "epoch": 0.46, + "grad_norm": 1.7015161514282227, + "learning_rate": 2.6812585499316006e-05, + "loss": 2.1806, + "step": 5424 + }, + { + "epoch": 0.46, + "grad_norm": 1.7079006433486938, + "learning_rate": 2.6795485636114913e-05, + "loss": 2.1705, + "step": 5428 + }, + { + "epoch": 0.46, + "grad_norm": 1.5330369472503662, + "learning_rate": 2.6778385772913817e-05, + "loss": 2.109, + "step": 5432 + }, + { + "epoch": 0.46, + "grad_norm": 1.606583595275879, + "learning_rate": 2.6761285909712724e-05, + "loss": 2.1793, + "step": 5436 + }, + { + "epoch": 0.47, + "grad_norm": 1.5606343746185303, + "learning_rate": 2.674418604651163e-05, + "loss": 2.1413, + "step": 5440 + }, + { + "epoch": 0.47, + "grad_norm": 1.5075594186782837, + "learning_rate": 2.6727086183310536e-05, + "loss": 2.1066, + "step": 5444 + }, + { + "epoch": 0.47, + "grad_norm": 1.3254252672195435, + "learning_rate": 2.6709986320109443e-05, + "loss": 1.9901, + "step": 5448 + }, + { + "epoch": 0.47, + "grad_norm": 1.5555039644241333, + "learning_rate": 2.6692886456908344e-05, + "loss": 2.1421, + "step": 5452 + }, + { + "epoch": 0.47, + "grad_norm": 1.5434138774871826, + "learning_rate": 2.6675786593707254e-05, + "loss": 2.1533, + "step": 5456 + }, + { + "epoch": 0.47, + "grad_norm": 1.5345412492752075, + "learning_rate": 2.6658686730506155e-05, + "loss": 2.1013, + "step": 5460 + }, + { + "epoch": 0.47, + "grad_norm": 1.5029314756393433, + "learning_rate": 2.6641586867305062e-05, + "loss": 2.0473, + "step": 5464 + }, + { + "epoch": 0.47, + "grad_norm": 1.668525218963623, + "learning_rate": 2.662448700410397e-05, + "loss": 2.0636, + "step": 5468 + }, + { + "epoch": 0.47, + "grad_norm": 1.5454894304275513, + "learning_rate": 2.6607387140902874e-05, + "loss": 2.1219, + "step": 5472 + }, + { + "epoch": 0.47, + "grad_norm": 1.3577351570129395, + "learning_rate": 2.659028727770178e-05, + "loss": 1.9395, + "step": 5476 + }, + { + "epoch": 0.47, + "grad_norm": 1.7034279108047485, + "learning_rate": 2.6573187414500685e-05, + "loss": 2.1734, + "step": 5480 + }, + { + "epoch": 0.47, + "grad_norm": 1.519974708557129, + "learning_rate": 2.6556087551299592e-05, + "loss": 2.0007, + "step": 5484 + }, + { + "epoch": 0.47, + "grad_norm": 1.6780128479003906, + "learning_rate": 2.65389876880985e-05, + "loss": 2.2113, + "step": 5488 + }, + { + "epoch": 0.47, + "grad_norm": 1.5499041080474854, + "learning_rate": 2.65218878248974e-05, + "loss": 2.0985, + "step": 5492 + }, + { + "epoch": 0.47, + "grad_norm": 1.5147311687469482, + "learning_rate": 2.6504787961696307e-05, + "loss": 2.0704, + "step": 5496 + }, + { + "epoch": 0.47, + "grad_norm": 1.4439009428024292, + "learning_rate": 2.648768809849521e-05, + "loss": 1.9784, + "step": 5500 + }, + { + "epoch": 0.47, + "grad_norm": 1.5315637588500977, + "learning_rate": 2.647058823529412e-05, + "loss": 2.0466, + "step": 5504 + }, + { + "epoch": 0.47, + "grad_norm": 1.514778971672058, + "learning_rate": 2.6453488372093026e-05, + "loss": 2.1113, + "step": 5508 + }, + { + "epoch": 0.47, + "grad_norm": 1.898563027381897, + "learning_rate": 2.643638850889193e-05, + "loss": 2.129, + "step": 5512 + }, + { + "epoch": 0.47, + "grad_norm": 1.4862741231918335, + "learning_rate": 2.6419288645690837e-05, + "loss": 2.1319, + "step": 5516 + }, + { + "epoch": 0.47, + "grad_norm": 1.5534049272537231, + "learning_rate": 2.6402188782489738e-05, + "loss": 2.1904, + "step": 5520 + }, + { + "epoch": 0.47, + "grad_norm": 1.6791940927505493, + "learning_rate": 2.638508891928865e-05, + "loss": 2.0653, + "step": 5524 + }, + { + "epoch": 0.47, + "grad_norm": 1.4288556575775146, + "learning_rate": 2.6367989056087556e-05, + "loss": 2.1007, + "step": 5528 + }, + { + "epoch": 0.47, + "grad_norm": 1.4318090677261353, + "learning_rate": 2.6350889192886457e-05, + "loss": 2.0791, + "step": 5532 + }, + { + "epoch": 0.47, + "grad_norm": 1.5377432107925415, + "learning_rate": 2.6333789329685364e-05, + "loss": 2.2141, + "step": 5536 + }, + { + "epoch": 0.47, + "grad_norm": 1.6055461168289185, + "learning_rate": 2.6316689466484268e-05, + "loss": 2.0733, + "step": 5540 + }, + { + "epoch": 0.47, + "grad_norm": 1.4851833581924438, + "learning_rate": 2.6299589603283175e-05, + "loss": 2.1332, + "step": 5544 + }, + { + "epoch": 0.47, + "grad_norm": 1.4060307741165161, + "learning_rate": 2.628248974008208e-05, + "loss": 2.0943, + "step": 5548 + }, + { + "epoch": 0.47, + "grad_norm": 1.5619680881500244, + "learning_rate": 2.6265389876880987e-05, + "loss": 2.1601, + "step": 5552 + }, + { + "epoch": 0.48, + "grad_norm": 1.6302011013031006, + "learning_rate": 2.6248290013679894e-05, + "loss": 2.1288, + "step": 5556 + }, + { + "epoch": 0.48, + "grad_norm": 1.514349102973938, + "learning_rate": 2.6231190150478794e-05, + "loss": 1.9701, + "step": 5560 + }, + { + "epoch": 0.48, + "grad_norm": 1.5871026515960693, + "learning_rate": 2.6214090287277705e-05, + "loss": 2.0344, + "step": 5564 + }, + { + "epoch": 0.48, + "grad_norm": 1.5523772239685059, + "learning_rate": 2.6196990424076606e-05, + "loss": 2.0973, + "step": 5568 + }, + { + "epoch": 0.48, + "grad_norm": 1.4616386890411377, + "learning_rate": 2.6179890560875513e-05, + "loss": 2.1624, + "step": 5572 + }, + { + "epoch": 0.48, + "grad_norm": 1.53379487991333, + "learning_rate": 2.616279069767442e-05, + "loss": 2.0887, + "step": 5576 + }, + { + "epoch": 0.48, + "grad_norm": 1.6133915185928345, + "learning_rate": 2.6145690834473324e-05, + "loss": 2.0548, + "step": 5580 + }, + { + "epoch": 0.48, + "grad_norm": 1.7219460010528564, + "learning_rate": 2.6128590971272232e-05, + "loss": 2.0882, + "step": 5584 + }, + { + "epoch": 0.48, + "grad_norm": 1.4539477825164795, + "learning_rate": 2.6111491108071136e-05, + "loss": 1.9867, + "step": 5588 + }, + { + "epoch": 0.48, + "grad_norm": 1.6804320812225342, + "learning_rate": 2.6094391244870043e-05, + "loss": 2.2284, + "step": 5592 + }, + { + "epoch": 0.48, + "grad_norm": 1.6927332878112793, + "learning_rate": 2.607729138166895e-05, + "loss": 2.0798, + "step": 5596 + }, + { + "epoch": 0.48, + "grad_norm": 1.5225088596343994, + "learning_rate": 2.606019151846785e-05, + "loss": 2.1072, + "step": 5600 + }, + { + "epoch": 0.48, + "grad_norm": 1.4824042320251465, + "learning_rate": 2.604309165526676e-05, + "loss": 2.0789, + "step": 5604 + }, + { + "epoch": 0.48, + "grad_norm": 1.3722364902496338, + "learning_rate": 2.6025991792065662e-05, + "loss": 2.1098, + "step": 5608 + }, + { + "epoch": 0.48, + "grad_norm": 1.4595104455947876, + "learning_rate": 2.600889192886457e-05, + "loss": 2.119, + "step": 5612 + }, + { + "epoch": 0.48, + "grad_norm": 1.418157696723938, + "learning_rate": 2.5991792065663477e-05, + "loss": 2.1059, + "step": 5616 + }, + { + "epoch": 0.48, + "grad_norm": 1.4522316455841064, + "learning_rate": 2.597469220246238e-05, + "loss": 2.2825, + "step": 5620 + }, + { + "epoch": 0.48, + "grad_norm": 1.685712456703186, + "learning_rate": 2.5957592339261288e-05, + "loss": 2.0914, + "step": 5624 + }, + { + "epoch": 0.48, + "grad_norm": 1.5950579643249512, + "learning_rate": 2.5940492476060192e-05, + "loss": 2.0957, + "step": 5628 + }, + { + "epoch": 0.48, + "grad_norm": 1.5742863416671753, + "learning_rate": 2.59233926128591e-05, + "loss": 2.2025, + "step": 5632 + }, + { + "epoch": 0.48, + "grad_norm": 1.5395307540893555, + "learning_rate": 2.5906292749658007e-05, + "loss": 2.0688, + "step": 5636 + }, + { + "epoch": 0.48, + "grad_norm": 1.5343267917633057, + "learning_rate": 2.5889192886456907e-05, + "loss": 2.1783, + "step": 5640 + }, + { + "epoch": 0.48, + "grad_norm": 1.4409688711166382, + "learning_rate": 2.5872093023255818e-05, + "loss": 1.9016, + "step": 5644 + }, + { + "epoch": 0.48, + "grad_norm": 1.5827747583389282, + "learning_rate": 2.585499316005472e-05, + "loss": 2.1501, + "step": 5648 + }, + { + "epoch": 0.48, + "grad_norm": 1.6408090591430664, + "learning_rate": 2.5837893296853626e-05, + "loss": 2.0266, + "step": 5652 + }, + { + "epoch": 0.48, + "grad_norm": 1.454546332359314, + "learning_rate": 2.5820793433652533e-05, + "loss": 1.9665, + "step": 5656 + }, + { + "epoch": 0.48, + "grad_norm": 1.4659438133239746, + "learning_rate": 2.5803693570451437e-05, + "loss": 2.0251, + "step": 5660 + }, + { + "epoch": 0.48, + "grad_norm": 1.6807434558868408, + "learning_rate": 2.5786593707250345e-05, + "loss": 2.0587, + "step": 5664 + }, + { + "epoch": 0.48, + "grad_norm": 1.4312716722488403, + "learning_rate": 2.576949384404925e-05, + "loss": 1.9798, + "step": 5668 + }, + { + "epoch": 0.48, + "grad_norm": 1.5319584608078003, + "learning_rate": 2.5752393980848156e-05, + "loss": 2.0179, + "step": 5672 + }, + { + "epoch": 0.49, + "grad_norm": 1.5227807760238647, + "learning_rate": 2.5735294117647057e-05, + "loss": 2.079, + "step": 5676 + }, + { + "epoch": 0.49, + "grad_norm": 1.5442856550216675, + "learning_rate": 2.5718194254445964e-05, + "loss": 2.0757, + "step": 5680 + }, + { + "epoch": 0.49, + "grad_norm": 1.4400264024734497, + "learning_rate": 2.5701094391244875e-05, + "loss": 2.0712, + "step": 5684 + }, + { + "epoch": 0.49, + "grad_norm": 1.5786032676696777, + "learning_rate": 2.5683994528043775e-05, + "loss": 2.1628, + "step": 5688 + }, + { + "epoch": 0.49, + "grad_norm": 1.5079550743103027, + "learning_rate": 2.5666894664842683e-05, + "loss": 2.0104, + "step": 5692 + }, + { + "epoch": 0.49, + "grad_norm": 1.6693617105484009, + "learning_rate": 2.5649794801641587e-05, + "loss": 1.9668, + "step": 5696 + }, + { + "epoch": 0.49, + "grad_norm": 1.6841495037078857, + "learning_rate": 2.5632694938440494e-05, + "loss": 2.2648, + "step": 5700 + }, + { + "epoch": 0.49, + "grad_norm": 1.4236379861831665, + "learning_rate": 2.56155950752394e-05, + "loss": 2.1775, + "step": 5704 + }, + { + "epoch": 0.49, + "grad_norm": 1.4873919486999512, + "learning_rate": 2.5598495212038305e-05, + "loss": 1.8591, + "step": 5708 + }, + { + "epoch": 0.49, + "grad_norm": 1.5848278999328613, + "learning_rate": 2.5581395348837212e-05, + "loss": 2.0736, + "step": 5712 + }, + { + "epoch": 0.49, + "grad_norm": 1.9348700046539307, + "learning_rate": 2.5564295485636113e-05, + "loss": 2.1204, + "step": 5716 + }, + { + "epoch": 0.49, + "grad_norm": 1.5655988454818726, + "learning_rate": 2.554719562243502e-05, + "loss": 2.0318, + "step": 5720 + }, + { + "epoch": 0.49, + "grad_norm": 1.6480430364608765, + "learning_rate": 2.553009575923393e-05, + "loss": 2.0188, + "step": 5724 + }, + { + "epoch": 0.49, + "grad_norm": 1.4328726530075073, + "learning_rate": 2.5512995896032832e-05, + "loss": 2.1973, + "step": 5728 + }, + { + "epoch": 0.49, + "grad_norm": 1.6577439308166504, + "learning_rate": 2.549589603283174e-05, + "loss": 2.0672, + "step": 5732 + }, + { + "epoch": 0.49, + "grad_norm": 1.3879199028015137, + "learning_rate": 2.5478796169630643e-05, + "loss": 1.9124, + "step": 5736 + }, + { + "epoch": 0.49, + "grad_norm": 1.4111934900283813, + "learning_rate": 2.546169630642955e-05, + "loss": 1.9643, + "step": 5740 + }, + { + "epoch": 0.49, + "grad_norm": 1.6698105335235596, + "learning_rate": 2.5444596443228458e-05, + "loss": 2.167, + "step": 5744 + }, + { + "epoch": 0.49, + "grad_norm": 1.643890380859375, + "learning_rate": 2.542749658002736e-05, + "loss": 2.1449, + "step": 5748 + }, + { + "epoch": 0.49, + "grad_norm": 1.4590214490890503, + "learning_rate": 2.541039671682627e-05, + "loss": 2.2577, + "step": 5752 + }, + { + "epoch": 0.49, + "grad_norm": 1.5629549026489258, + "learning_rate": 2.539329685362517e-05, + "loss": 2.2183, + "step": 5756 + }, + { + "epoch": 0.49, + "grad_norm": 1.4380923509597778, + "learning_rate": 2.5376196990424077e-05, + "loss": 1.9198, + "step": 5760 + }, + { + "epoch": 0.49, + "grad_norm": 1.4581702947616577, + "learning_rate": 2.5359097127222988e-05, + "loss": 2.0363, + "step": 5764 + }, + { + "epoch": 0.49, + "grad_norm": 1.3352808952331543, + "learning_rate": 2.5341997264021888e-05, + "loss": 2.1038, + "step": 5768 + }, + { + "epoch": 0.49, + "grad_norm": 1.604493498802185, + "learning_rate": 2.5324897400820796e-05, + "loss": 2.1413, + "step": 5772 + }, + { + "epoch": 0.49, + "grad_norm": 1.360095739364624, + "learning_rate": 2.53077975376197e-05, + "loss": 2.0967, + "step": 5776 + }, + { + "epoch": 0.49, + "grad_norm": 1.400731086730957, + "learning_rate": 2.5290697674418607e-05, + "loss": 2.0638, + "step": 5780 + }, + { + "epoch": 0.49, + "grad_norm": 1.611741304397583, + "learning_rate": 2.5273597811217514e-05, + "loss": 1.9994, + "step": 5784 + }, + { + "epoch": 0.49, + "grad_norm": 1.614253044128418, + "learning_rate": 2.5256497948016418e-05, + "loss": 2.0905, + "step": 5788 + }, + { + "epoch": 0.5, + "grad_norm": 1.5998769998550415, + "learning_rate": 2.5239398084815325e-05, + "loss": 2.2255, + "step": 5792 + }, + { + "epoch": 0.5, + "grad_norm": 1.5840561389923096, + "learning_rate": 2.5222298221614226e-05, + "loss": 2.0949, + "step": 5796 + }, + { + "epoch": 0.5, + "grad_norm": 1.5957179069519043, + "learning_rate": 2.5205198358413133e-05, + "loss": 1.9565, + "step": 5800 + }, + { + "epoch": 0.5, + "grad_norm": 1.5299445390701294, + "learning_rate": 2.5188098495212037e-05, + "loss": 2.0896, + "step": 5804 + }, + { + "epoch": 0.5, + "grad_norm": 1.5328844785690308, + "learning_rate": 2.5170998632010945e-05, + "loss": 2.0734, + "step": 5808 + }, + { + "epoch": 0.5, + "grad_norm": 1.43681800365448, + "learning_rate": 2.5153898768809852e-05, + "loss": 1.9839, + "step": 5812 + }, + { + "epoch": 0.5, + "grad_norm": 1.5096018314361572, + "learning_rate": 2.5136798905608756e-05, + "loss": 2.1289, + "step": 5816 + }, + { + "epoch": 0.5, + "grad_norm": 1.5648272037506104, + "learning_rate": 2.5119699042407663e-05, + "loss": 1.9608, + "step": 5820 + }, + { + "epoch": 0.5, + "grad_norm": 1.7235368490219116, + "learning_rate": 2.5102599179206564e-05, + "loss": 2.0752, + "step": 5824 + }, + { + "epoch": 0.5, + "grad_norm": 1.6707707643508911, + "learning_rate": 2.5085499316005475e-05, + "loss": 2.2227, + "step": 5828 + }, + { + "epoch": 0.5, + "grad_norm": 1.5506484508514404, + "learning_rate": 2.5068399452804382e-05, + "loss": 1.9812, + "step": 5832 + }, + { + "epoch": 0.5, + "grad_norm": 1.50376296043396, + "learning_rate": 2.5051299589603283e-05, + "loss": 2.0457, + "step": 5836 + }, + { + "epoch": 0.5, + "grad_norm": 1.5945008993148804, + "learning_rate": 2.503419972640219e-05, + "loss": 2.0204, + "step": 5840 + }, + { + "epoch": 0.5, + "grad_norm": 1.6294224262237549, + "learning_rate": 2.5017099863201094e-05, + "loss": 2.0159, + "step": 5844 + }, + { + "epoch": 0.5, + "grad_norm": 1.4806592464447021, + "learning_rate": 2.5e-05, + "loss": 2.0233, + "step": 5848 + }, + { + "epoch": 0.5, + "grad_norm": 1.4078065156936646, + "learning_rate": 2.4982900136798905e-05, + "loss": 1.9797, + "step": 5852 + }, + { + "epoch": 0.5, + "grad_norm": 1.5420806407928467, + "learning_rate": 2.4965800273597812e-05, + "loss": 2.0582, + "step": 5856 + }, + { + "epoch": 0.5, + "grad_norm": 1.4052869081497192, + "learning_rate": 2.494870041039672e-05, + "loss": 2.0785, + "step": 5860 + }, + { + "epoch": 0.5, + "grad_norm": 1.4887535572052002, + "learning_rate": 2.4931600547195624e-05, + "loss": 2.1067, + "step": 5864 + }, + { + "epoch": 0.5, + "grad_norm": 1.723113775253296, + "learning_rate": 2.4914500683994528e-05, + "loss": 1.973, + "step": 5868 + }, + { + "epoch": 0.5, + "grad_norm": 4.109470367431641, + "learning_rate": 2.4897400820793435e-05, + "loss": 2.2463, + "step": 5872 + }, + { + "epoch": 0.5, + "grad_norm": 1.645058274269104, + "learning_rate": 2.488030095759234e-05, + "loss": 2.1643, + "step": 5876 + }, + { + "epoch": 0.5, + "grad_norm": 1.4571024179458618, + "learning_rate": 2.4863201094391243e-05, + "loss": 2.0342, + "step": 5880 + }, + { + "epoch": 0.5, + "grad_norm": 1.738869309425354, + "learning_rate": 2.4846101231190154e-05, + "loss": 2.106, + "step": 5884 + }, + { + "epoch": 0.5, + "grad_norm": 1.58241868019104, + "learning_rate": 2.4829001367989058e-05, + "loss": 2.1393, + "step": 5888 + }, + { + "epoch": 0.5, + "grad_norm": 1.5051425695419312, + "learning_rate": 2.481190150478796e-05, + "loss": 2.1032, + "step": 5892 + }, + { + "epoch": 0.5, + "grad_norm": 1.5358223915100098, + "learning_rate": 2.479480164158687e-05, + "loss": 2.065, + "step": 5896 + }, + { + "epoch": 0.5, + "grad_norm": 1.3522099256515503, + "learning_rate": 2.4777701778385773e-05, + "loss": 1.9355, + "step": 5900 + }, + { + "epoch": 0.5, + "grad_norm": 1.4918060302734375, + "learning_rate": 2.476060191518468e-05, + "loss": 2.0173, + "step": 5904 + }, + { + "epoch": 0.51, + "grad_norm": 1.5328975915908813, + "learning_rate": 2.4743502051983584e-05, + "loss": 2.1863, + "step": 5908 + }, + { + "epoch": 0.51, + "grad_norm": 1.4947617053985596, + "learning_rate": 2.472640218878249e-05, + "loss": 2.1405, + "step": 5912 + }, + { + "epoch": 0.51, + "grad_norm": 1.535042643547058, + "learning_rate": 2.4709302325581396e-05, + "loss": 1.9693, + "step": 5916 + }, + { + "epoch": 0.51, + "grad_norm": 1.497016191482544, + "learning_rate": 2.46922024623803e-05, + "loss": 1.9618, + "step": 5920 + }, + { + "epoch": 0.51, + "grad_norm": 1.490457534790039, + "learning_rate": 2.467510259917921e-05, + "loss": 2.0978, + "step": 5924 + }, + { + "epoch": 0.51, + "grad_norm": 1.4083185195922852, + "learning_rate": 2.4658002735978114e-05, + "loss": 1.9533, + "step": 5928 + }, + { + "epoch": 0.51, + "grad_norm": 1.638238549232483, + "learning_rate": 2.4640902872777018e-05, + "loss": 2.0625, + "step": 5932 + }, + { + "epoch": 0.51, + "grad_norm": 1.6385629177093506, + "learning_rate": 2.4623803009575925e-05, + "loss": 1.9919, + "step": 5936 + }, + { + "epoch": 0.51, + "grad_norm": 1.3852545022964478, + "learning_rate": 2.460670314637483e-05, + "loss": 2.0971, + "step": 5940 + }, + { + "epoch": 0.51, + "grad_norm": 1.4530996084213257, + "learning_rate": 2.4589603283173733e-05, + "loss": 2.0959, + "step": 5944 + }, + { + "epoch": 0.51, + "grad_norm": 1.573773980140686, + "learning_rate": 2.457250341997264e-05, + "loss": 2.1314, + "step": 5948 + }, + { + "epoch": 0.51, + "grad_norm": 1.4911787509918213, + "learning_rate": 2.4555403556771548e-05, + "loss": 1.9789, + "step": 5952 + }, + { + "epoch": 0.51, + "grad_norm": 1.5948853492736816, + "learning_rate": 2.4538303693570452e-05, + "loss": 2.1686, + "step": 5956 + }, + { + "epoch": 0.51, + "grad_norm": 1.3944917917251587, + "learning_rate": 2.4521203830369356e-05, + "loss": 1.9249, + "step": 5960 + }, + { + "epoch": 0.51, + "grad_norm": 1.5396506786346436, + "learning_rate": 2.4504103967168263e-05, + "loss": 2.0486, + "step": 5964 + }, + { + "epoch": 0.51, + "grad_norm": 1.5431126356124878, + "learning_rate": 2.448700410396717e-05, + "loss": 1.9906, + "step": 5968 + }, + { + "epoch": 0.51, + "grad_norm": 1.4996250867843628, + "learning_rate": 2.4469904240766075e-05, + "loss": 2.0529, + "step": 5972 + }, + { + "epoch": 0.51, + "grad_norm": 1.653665542602539, + "learning_rate": 2.4452804377564982e-05, + "loss": 2.0667, + "step": 5976 + }, + { + "epoch": 0.51, + "grad_norm": 1.543585181236267, + "learning_rate": 2.4435704514363886e-05, + "loss": 2.2203, + "step": 5980 + }, + { + "epoch": 0.51, + "grad_norm": 1.4741888046264648, + "learning_rate": 2.441860465116279e-05, + "loss": 2.0958, + "step": 5984 + }, + { + "epoch": 0.51, + "grad_norm": 1.7316616773605347, + "learning_rate": 2.4401504787961697e-05, + "loss": 2.0456, + "step": 5988 + }, + { + "epoch": 0.51, + "grad_norm": 1.6189284324645996, + "learning_rate": 2.4384404924760605e-05, + "loss": 2.0137, + "step": 5992 + }, + { + "epoch": 0.51, + "grad_norm": 1.3436368703842163, + "learning_rate": 2.436730506155951e-05, + "loss": 2.0219, + "step": 5996 + }, + { + "epoch": 0.51, + "grad_norm": 1.45026433467865, + "learning_rate": 2.4350205198358412e-05, + "loss": 1.938, + "step": 6000 + }, + { + "epoch": 0.51, + "grad_norm": 1.4618995189666748, + "learning_rate": 2.433310533515732e-05, + "loss": 2.1091, + "step": 6004 + }, + { + "epoch": 0.51, + "grad_norm": 1.3523361682891846, + "learning_rate": 2.4316005471956224e-05, + "loss": 2.0435, + "step": 6008 + }, + { + "epoch": 0.51, + "grad_norm": 1.414756178855896, + "learning_rate": 2.429890560875513e-05, + "loss": 2.0553, + "step": 6012 + }, + { + "epoch": 0.51, + "grad_norm": 1.8548967838287354, + "learning_rate": 2.428180574555404e-05, + "loss": 2.1002, + "step": 6016 + }, + { + "epoch": 0.51, + "grad_norm": 1.431153416633606, + "learning_rate": 2.4264705882352942e-05, + "loss": 1.9441, + "step": 6020 + }, + { + "epoch": 0.52, + "grad_norm": 1.4049307107925415, + "learning_rate": 2.4247606019151846e-05, + "loss": 2.1948, + "step": 6024 + }, + { + "epoch": 0.52, + "grad_norm": 1.4106427431106567, + "learning_rate": 2.4230506155950754e-05, + "loss": 1.9183, + "step": 6028 + }, + { + "epoch": 0.52, + "grad_norm": 1.8905525207519531, + "learning_rate": 2.421340629274966e-05, + "loss": 1.961, + "step": 6032 + }, + { + "epoch": 0.52, + "grad_norm": 1.5894415378570557, + "learning_rate": 2.4196306429548565e-05, + "loss": 2.1565, + "step": 6036 + }, + { + "epoch": 0.52, + "grad_norm": 1.6033257246017456, + "learning_rate": 2.417920656634747e-05, + "loss": 1.8504, + "step": 6040 + }, + { + "epoch": 0.52, + "grad_norm": 2.1501522064208984, + "learning_rate": 2.4162106703146376e-05, + "loss": 2.0988, + "step": 6044 + }, + { + "epoch": 0.52, + "grad_norm": 1.688828945159912, + "learning_rate": 2.414500683994528e-05, + "loss": 2.0132, + "step": 6048 + }, + { + "epoch": 0.52, + "grad_norm": 1.479693055152893, + "learning_rate": 2.4127906976744188e-05, + "loss": 2.0963, + "step": 6052 + }, + { + "epoch": 0.52, + "grad_norm": 1.6395564079284668, + "learning_rate": 2.4110807113543095e-05, + "loss": 2.1367, + "step": 6056 + }, + { + "epoch": 0.52, + "grad_norm": 1.4523251056671143, + "learning_rate": 2.4093707250342e-05, + "loss": 2.0349, + "step": 6060 + }, + { + "epoch": 0.52, + "grad_norm": 1.580451488494873, + "learning_rate": 2.4076607387140903e-05, + "loss": 1.9535, + "step": 6064 + }, + { + "epoch": 0.52, + "grad_norm": 1.5951658487319946, + "learning_rate": 2.405950752393981e-05, + "loss": 2.1379, + "step": 6068 + }, + { + "epoch": 0.52, + "grad_norm": 1.5413308143615723, + "learning_rate": 2.4042407660738714e-05, + "loss": 2.0473, + "step": 6072 + }, + { + "epoch": 0.52, + "grad_norm": 1.5242167711257935, + "learning_rate": 2.402530779753762e-05, + "loss": 2.026, + "step": 6076 + }, + { + "epoch": 0.52, + "grad_norm": 1.484012484550476, + "learning_rate": 2.4008207934336525e-05, + "loss": 2.0974, + "step": 6080 + }, + { + "epoch": 0.52, + "grad_norm": 1.6217378377914429, + "learning_rate": 2.3991108071135433e-05, + "loss": 2.0572, + "step": 6084 + }, + { + "epoch": 0.52, + "grad_norm": 1.622316837310791, + "learning_rate": 2.3974008207934337e-05, + "loss": 2.111, + "step": 6088 + }, + { + "epoch": 0.52, + "grad_norm": 1.4300661087036133, + "learning_rate": 2.395690834473324e-05, + "loss": 2.0311, + "step": 6092 + }, + { + "epoch": 0.52, + "grad_norm": 1.4880815744400024, + "learning_rate": 2.393980848153215e-05, + "loss": 1.9217, + "step": 6096 + }, + { + "epoch": 0.52, + "grad_norm": 1.5886203050613403, + "learning_rate": 2.3922708618331055e-05, + "loss": 2.1043, + "step": 6100 + }, + { + "epoch": 0.52, + "grad_norm": 1.4203860759735107, + "learning_rate": 2.390560875512996e-05, + "loss": 2.1326, + "step": 6104 + }, + { + "epoch": 0.52, + "grad_norm": 1.777569055557251, + "learning_rate": 2.3888508891928867e-05, + "loss": 2.0363, + "step": 6108 + }, + { + "epoch": 0.52, + "grad_norm": 1.4837594032287598, + "learning_rate": 2.387140902872777e-05, + "loss": 1.8943, + "step": 6112 + }, + { + "epoch": 0.52, + "grad_norm": 1.6649631261825562, + "learning_rate": 2.3854309165526678e-05, + "loss": 2.1792, + "step": 6116 + }, + { + "epoch": 0.52, + "grad_norm": 1.4544529914855957, + "learning_rate": 2.3837209302325582e-05, + "loss": 1.9499, + "step": 6120 + }, + { + "epoch": 0.52, + "grad_norm": 1.4706698656082153, + "learning_rate": 2.382010943912449e-05, + "loss": 1.9453, + "step": 6124 + }, + { + "epoch": 0.52, + "grad_norm": 1.8051691055297852, + "learning_rate": 2.3803009575923393e-05, + "loss": 2.124, + "step": 6128 + }, + { + "epoch": 0.52, + "grad_norm": 1.5130858421325684, + "learning_rate": 2.3785909712722297e-05, + "loss": 2.0967, + "step": 6132 + }, + { + "epoch": 0.52, + "grad_norm": 1.6385159492492676, + "learning_rate": 2.3768809849521205e-05, + "loss": 2.155, + "step": 6136 + }, + { + "epoch": 0.52, + "grad_norm": 1.579830527305603, + "learning_rate": 2.3751709986320112e-05, + "loss": 2.0291, + "step": 6140 + }, + { + "epoch": 0.53, + "grad_norm": 1.6579816341400146, + "learning_rate": 2.3734610123119016e-05, + "loss": 1.9364, + "step": 6144 + }, + { + "epoch": 0.53, + "grad_norm": 1.4621435403823853, + "learning_rate": 2.3717510259917923e-05, + "loss": 2.0041, + "step": 6148 + }, + { + "epoch": 0.53, + "grad_norm": 1.4901877641677856, + "learning_rate": 2.3700410396716827e-05, + "loss": 2.0768, + "step": 6152 + }, + { + "epoch": 0.53, + "grad_norm": 1.5023964643478394, + "learning_rate": 2.368331053351573e-05, + "loss": 1.8666, + "step": 6156 + }, + { + "epoch": 0.53, + "grad_norm": 1.394523024559021, + "learning_rate": 2.366621067031464e-05, + "loss": 2.0315, + "step": 6160 + }, + { + "epoch": 0.53, + "grad_norm": 1.3989911079406738, + "learning_rate": 2.3649110807113546e-05, + "loss": 2.0719, + "step": 6164 + }, + { + "epoch": 0.53, + "grad_norm": 1.6071873903274536, + "learning_rate": 2.363201094391245e-05, + "loss": 2.07, + "step": 6168 + }, + { + "epoch": 0.53, + "grad_norm": 1.4817439317703247, + "learning_rate": 2.3614911080711354e-05, + "loss": 2.0119, + "step": 6172 + }, + { + "epoch": 0.53, + "grad_norm": 1.3561433553695679, + "learning_rate": 2.359781121751026e-05, + "loss": 1.9405, + "step": 6176 + }, + { + "epoch": 0.53, + "grad_norm": 1.5416829586029053, + "learning_rate": 2.358071135430917e-05, + "loss": 2.0001, + "step": 6180 + }, + { + "epoch": 0.53, + "grad_norm": 1.4573452472686768, + "learning_rate": 2.3563611491108072e-05, + "loss": 1.9938, + "step": 6184 + }, + { + "epoch": 0.53, + "grad_norm": 1.6019344329833984, + "learning_rate": 2.354651162790698e-05, + "loss": 1.9446, + "step": 6188 + }, + { + "epoch": 0.53, + "grad_norm": 1.7344564199447632, + "learning_rate": 2.3529411764705884e-05, + "loss": 1.862, + "step": 6192 + }, + { + "epoch": 0.53, + "grad_norm": 1.6940727233886719, + "learning_rate": 2.3512311901504788e-05, + "loss": 1.9843, + "step": 6196 + }, + { + "epoch": 0.53, + "grad_norm": 1.5194576978683472, + "learning_rate": 2.3495212038303695e-05, + "loss": 2.0475, + "step": 6200 + }, + { + "epoch": 0.53, + "grad_norm": 1.7754817008972168, + "learning_rate": 2.3478112175102602e-05, + "loss": 2.0975, + "step": 6204 + }, + { + "epoch": 0.53, + "grad_norm": 1.676499605178833, + "learning_rate": 2.3461012311901506e-05, + "loss": 2.1042, + "step": 6208 + }, + { + "epoch": 0.53, + "grad_norm": 1.5539844036102295, + "learning_rate": 2.344391244870041e-05, + "loss": 2.0322, + "step": 6212 + }, + { + "epoch": 0.53, + "grad_norm": 1.5558497905731201, + "learning_rate": 2.3426812585499318e-05, + "loss": 2.1731, + "step": 6216 + }, + { + "epoch": 0.53, + "grad_norm": 1.4876151084899902, + "learning_rate": 2.340971272229822e-05, + "loss": 1.9864, + "step": 6220 + }, + { + "epoch": 0.53, + "grad_norm": 1.5103055238723755, + "learning_rate": 2.339261285909713e-05, + "loss": 2.0973, + "step": 6224 + }, + { + "epoch": 0.53, + "grad_norm": 1.617957353591919, + "learning_rate": 2.3375512995896033e-05, + "loss": 1.8867, + "step": 6228 + }, + { + "epoch": 0.53, + "grad_norm": 1.417094111442566, + "learning_rate": 2.335841313269494e-05, + "loss": 2.0805, + "step": 6232 + }, + { + "epoch": 0.53, + "grad_norm": 1.5233298540115356, + "learning_rate": 2.3341313269493844e-05, + "loss": 1.9551, + "step": 6236 + }, + { + "epoch": 0.53, + "grad_norm": 1.5211933851242065, + "learning_rate": 2.3324213406292748e-05, + "loss": 2.0242, + "step": 6240 + }, + { + "epoch": 0.53, + "grad_norm": 1.4824392795562744, + "learning_rate": 2.330711354309166e-05, + "loss": 1.9477, + "step": 6244 + }, + { + "epoch": 0.53, + "grad_norm": 1.4873578548431396, + "learning_rate": 2.3290013679890563e-05, + "loss": 2.1701, + "step": 6248 + }, + { + "epoch": 0.53, + "grad_norm": 1.5688248872756958, + "learning_rate": 2.3272913816689467e-05, + "loss": 2.0478, + "step": 6252 + }, + { + "epoch": 0.53, + "grad_norm": 1.4556810855865479, + "learning_rate": 2.3255813953488374e-05, + "loss": 1.9112, + "step": 6256 + }, + { + "epoch": 0.54, + "grad_norm": 1.8388420343399048, + "learning_rate": 2.3238714090287278e-05, + "loss": 2.1023, + "step": 6260 + }, + { + "epoch": 0.54, + "grad_norm": 1.615323543548584, + "learning_rate": 2.3221614227086182e-05, + "loss": 2.0831, + "step": 6264 + }, + { + "epoch": 0.54, + "grad_norm": 1.488853096961975, + "learning_rate": 2.320451436388509e-05, + "loss": 1.9403, + "step": 6268 + }, + { + "epoch": 0.54, + "grad_norm": 1.5217219591140747, + "learning_rate": 2.3187414500683997e-05, + "loss": 2.1556, + "step": 6272 + }, + { + "epoch": 0.54, + "grad_norm": 1.4916918277740479, + "learning_rate": 2.31703146374829e-05, + "loss": 2.0442, + "step": 6276 + }, + { + "epoch": 0.54, + "grad_norm": 1.976369857788086, + "learning_rate": 2.3153214774281805e-05, + "loss": 2.0054, + "step": 6280 + }, + { + "epoch": 0.54, + "grad_norm": 1.6137586832046509, + "learning_rate": 2.3136114911080712e-05, + "loss": 2.0149, + "step": 6284 + }, + { + "epoch": 0.54, + "grad_norm": 1.3688173294067383, + "learning_rate": 2.311901504787962e-05, + "loss": 2.0518, + "step": 6288 + }, + { + "epoch": 0.54, + "grad_norm": 1.489639163017273, + "learning_rate": 2.3101915184678523e-05, + "loss": 2.186, + "step": 6292 + }, + { + "epoch": 0.54, + "grad_norm": 1.4436581134796143, + "learning_rate": 2.308481532147743e-05, + "loss": 1.9726, + "step": 6296 + }, + { + "epoch": 0.54, + "grad_norm": 5.76146125793457, + "learning_rate": 2.3067715458276335e-05, + "loss": 1.9351, + "step": 6300 + }, + { + "epoch": 0.54, + "grad_norm": 1.7009334564208984, + "learning_rate": 2.305061559507524e-05, + "loss": 1.9967, + "step": 6304 + }, + { + "epoch": 0.54, + "grad_norm": 1.611024260520935, + "learning_rate": 2.3033515731874146e-05, + "loss": 2.0911, + "step": 6308 + }, + { + "epoch": 0.54, + "grad_norm": 1.5418471097946167, + "learning_rate": 2.3016415868673053e-05, + "loss": 1.9721, + "step": 6312 + }, + { + "epoch": 0.54, + "grad_norm": 1.5658395290374756, + "learning_rate": 2.2999316005471957e-05, + "loss": 2.0573, + "step": 6316 + }, + { + "epoch": 0.54, + "grad_norm": 1.365360140800476, + "learning_rate": 2.298221614227086e-05, + "loss": 1.9245, + "step": 6320 + }, + { + "epoch": 0.54, + "grad_norm": 1.7127231359481812, + "learning_rate": 2.296511627906977e-05, + "loss": 1.9568, + "step": 6324 + }, + { + "epoch": 0.54, + "grad_norm": 1.572302222251892, + "learning_rate": 2.2948016415868672e-05, + "loss": 2.0137, + "step": 6328 + }, + { + "epoch": 0.54, + "grad_norm": 1.7327531576156616, + "learning_rate": 2.293091655266758e-05, + "loss": 2.0781, + "step": 6332 + }, + { + "epoch": 0.54, + "grad_norm": 1.5620208978652954, + "learning_rate": 2.2913816689466487e-05, + "loss": 2.0689, + "step": 6336 + }, + { + "epoch": 0.54, + "grad_norm": 1.4865524768829346, + "learning_rate": 2.289671682626539e-05, + "loss": 1.9261, + "step": 6340 + }, + { + "epoch": 0.54, + "grad_norm": 1.6262495517730713, + "learning_rate": 2.2879616963064295e-05, + "loss": 2.1261, + "step": 6344 + }, + { + "epoch": 0.54, + "grad_norm": 1.536917805671692, + "learning_rate": 2.2862517099863202e-05, + "loss": 2.0757, + "step": 6348 + }, + { + "epoch": 0.54, + "grad_norm": 1.4335952997207642, + "learning_rate": 2.284541723666211e-05, + "loss": 1.967, + "step": 6352 + }, + { + "epoch": 0.54, + "grad_norm": 2.1377627849578857, + "learning_rate": 2.2828317373461014e-05, + "loss": 1.996, + "step": 6356 + }, + { + "epoch": 0.54, + "grad_norm": 1.705106496810913, + "learning_rate": 2.2811217510259918e-05, + "loss": 2.0619, + "step": 6360 + }, + { + "epoch": 0.54, + "grad_norm": 1.5044035911560059, + "learning_rate": 2.2794117647058825e-05, + "loss": 2.0941, + "step": 6364 + }, + { + "epoch": 0.54, + "grad_norm": 1.485723614692688, + "learning_rate": 2.277701778385773e-05, + "loss": 2.0471, + "step": 6368 + }, + { + "epoch": 0.54, + "grad_norm": 1.3270933628082275, + "learning_rate": 2.2759917920656636e-05, + "loss": 1.9035, + "step": 6372 + }, + { + "epoch": 0.55, + "grad_norm": 1.8408259153366089, + "learning_rate": 2.2742818057455544e-05, + "loss": 2.1103, + "step": 6376 + }, + { + "epoch": 0.55, + "grad_norm": 1.5372594594955444, + "learning_rate": 2.2725718194254448e-05, + "loss": 1.9872, + "step": 6380 + }, + { + "epoch": 0.55, + "grad_norm": 1.5519907474517822, + "learning_rate": 2.270861833105335e-05, + "loss": 2.0445, + "step": 6384 + }, + { + "epoch": 0.55, + "grad_norm": 1.5492377281188965, + "learning_rate": 2.269151846785226e-05, + "loss": 2.1288, + "step": 6388 + }, + { + "epoch": 0.55, + "grad_norm": 1.5490655899047852, + "learning_rate": 2.2674418604651163e-05, + "loss": 2.0827, + "step": 6392 + }, + { + "epoch": 0.55, + "grad_norm": 1.575005292892456, + "learning_rate": 2.265731874145007e-05, + "loss": 2.0462, + "step": 6396 + }, + { + "epoch": 0.55, + "grad_norm": 1.5636953115463257, + "learning_rate": 2.2640218878248974e-05, + "loss": 2.0272, + "step": 6400 + }, + { + "epoch": 0.55, + "grad_norm": 1.4085521697998047, + "learning_rate": 2.262311901504788e-05, + "loss": 1.9316, + "step": 6404 + }, + { + "epoch": 0.55, + "grad_norm": 1.375346064567566, + "learning_rate": 2.2606019151846785e-05, + "loss": 2.1006, + "step": 6408 + }, + { + "epoch": 0.55, + "grad_norm": 1.5177054405212402, + "learning_rate": 2.258891928864569e-05, + "loss": 2.104, + "step": 6412 + }, + { + "epoch": 0.55, + "grad_norm": 1.590615153312683, + "learning_rate": 2.25718194254446e-05, + "loss": 2.054, + "step": 6416 + }, + { + "epoch": 0.55, + "grad_norm": 1.6680008172988892, + "learning_rate": 2.2554719562243504e-05, + "loss": 2.0611, + "step": 6420 + }, + { + "epoch": 0.55, + "grad_norm": 2.7814908027648926, + "learning_rate": 2.2537619699042408e-05, + "loss": 2.0015, + "step": 6424 + }, + { + "epoch": 0.55, + "grad_norm": 1.419728398323059, + "learning_rate": 2.2520519835841315e-05, + "loss": 2.0902, + "step": 6428 + }, + { + "epoch": 0.55, + "grad_norm": 1.4798617362976074, + "learning_rate": 2.250341997264022e-05, + "loss": 2.05, + "step": 6432 + }, + { + "epoch": 0.55, + "grad_norm": 1.506953477859497, + "learning_rate": 2.2486320109439127e-05, + "loss": 2.0153, + "step": 6436 + }, + { + "epoch": 0.55, + "grad_norm": 1.6882991790771484, + "learning_rate": 2.246922024623803e-05, + "loss": 1.9237, + "step": 6440 + }, + { + "epoch": 0.55, + "grad_norm": 1.4662657976150513, + "learning_rate": 2.2452120383036938e-05, + "loss": 2.0524, + "step": 6444 + }, + { + "epoch": 0.55, + "grad_norm": 1.5177686214447021, + "learning_rate": 2.2435020519835842e-05, + "loss": 2.0207, + "step": 6448 + }, + { + "epoch": 0.55, + "grad_norm": 1.7615087032318115, + "learning_rate": 2.2417920656634746e-05, + "loss": 2.07, + "step": 6452 + }, + { + "epoch": 0.55, + "grad_norm": 1.7587693929672241, + "learning_rate": 2.2400820793433653e-05, + "loss": 2.0928, + "step": 6456 + }, + { + "epoch": 0.55, + "grad_norm": 1.6532100439071655, + "learning_rate": 2.238372093023256e-05, + "loss": 2.0425, + "step": 6460 + }, + { + "epoch": 0.55, + "grad_norm": 1.4801387786865234, + "learning_rate": 2.2366621067031464e-05, + "loss": 2.1567, + "step": 6464 + }, + { + "epoch": 0.55, + "grad_norm": 1.6623640060424805, + "learning_rate": 2.2349521203830372e-05, + "loss": 2.0281, + "step": 6468 + }, + { + "epoch": 0.55, + "grad_norm": 1.5329927206039429, + "learning_rate": 2.2332421340629276e-05, + "loss": 1.9157, + "step": 6472 + }, + { + "epoch": 0.55, + "grad_norm": 1.5152140855789185, + "learning_rate": 2.231532147742818e-05, + "loss": 2.0218, + "step": 6476 + }, + { + "epoch": 0.55, + "grad_norm": 1.4606438875198364, + "learning_rate": 2.2298221614227087e-05, + "loss": 1.9652, + "step": 6480 + }, + { + "epoch": 0.55, + "grad_norm": 1.4744431972503662, + "learning_rate": 2.2281121751025994e-05, + "loss": 1.9493, + "step": 6484 + }, + { + "epoch": 0.55, + "grad_norm": 1.527673363685608, + "learning_rate": 2.22640218878249e-05, + "loss": 2.1294, + "step": 6488 + }, + { + "epoch": 0.56, + "grad_norm": 1.5893338918685913, + "learning_rate": 2.2246922024623802e-05, + "loss": 2.0014, + "step": 6492 + }, + { + "epoch": 0.56, + "grad_norm": 1.53531014919281, + "learning_rate": 2.222982216142271e-05, + "loss": 2.1177, + "step": 6496 + }, + { + "epoch": 0.56, + "grad_norm": 1.4468746185302734, + "learning_rate": 2.2212722298221617e-05, + "loss": 1.9305, + "step": 6500 + }, + { + "epoch": 0.56, + "grad_norm": 1.5749852657318115, + "learning_rate": 2.219562243502052e-05, + "loss": 2.0401, + "step": 6504 + }, + { + "epoch": 0.56, + "grad_norm": 1.6165703535079956, + "learning_rate": 2.217852257181943e-05, + "loss": 2.1496, + "step": 6508 + }, + { + "epoch": 0.56, + "grad_norm": 1.5743823051452637, + "learning_rate": 2.2161422708618332e-05, + "loss": 2.0051, + "step": 6512 + }, + { + "epoch": 0.56, + "grad_norm": 1.5725358724594116, + "learning_rate": 2.2144322845417236e-05, + "loss": 2.1795, + "step": 6516 + }, + { + "epoch": 0.56, + "grad_norm": 1.5228146314620972, + "learning_rate": 2.2127222982216144e-05, + "loss": 2.0832, + "step": 6520 + }, + { + "epoch": 0.56, + "grad_norm": 1.584122657775879, + "learning_rate": 2.211012311901505e-05, + "loss": 1.9658, + "step": 6524 + }, + { + "epoch": 0.56, + "grad_norm": 1.6192436218261719, + "learning_rate": 2.2093023255813955e-05, + "loss": 2.047, + "step": 6528 + }, + { + "epoch": 0.56, + "grad_norm": 1.4840277433395386, + "learning_rate": 2.207592339261286e-05, + "loss": 2.0195, + "step": 6532 + }, + { + "epoch": 0.56, + "grad_norm": 1.5676360130310059, + "learning_rate": 2.2058823529411766e-05, + "loss": 2.0233, + "step": 6536 + }, + { + "epoch": 0.56, + "grad_norm": 1.5410796403884888, + "learning_rate": 2.204172366621067e-05, + "loss": 1.9596, + "step": 6540 + }, + { + "epoch": 0.56, + "grad_norm": 1.6324695348739624, + "learning_rate": 2.2024623803009577e-05, + "loss": 1.9401, + "step": 6544 + }, + { + "epoch": 0.56, + "grad_norm": 1.4897451400756836, + "learning_rate": 2.2007523939808485e-05, + "loss": 2.0069, + "step": 6548 + }, + { + "epoch": 0.56, + "grad_norm": 1.5398979187011719, + "learning_rate": 2.199042407660739e-05, + "loss": 2.0448, + "step": 6552 + }, + { + "epoch": 0.56, + "grad_norm": 1.3995730876922607, + "learning_rate": 2.1973324213406293e-05, + "loss": 1.9606, + "step": 6556 + }, + { + "epoch": 0.56, + "grad_norm": 1.6123522520065308, + "learning_rate": 2.19562243502052e-05, + "loss": 1.9598, + "step": 6560 + }, + { + "epoch": 0.56, + "grad_norm": 1.5927785634994507, + "learning_rate": 2.1939124487004107e-05, + "loss": 1.8914, + "step": 6564 + }, + { + "epoch": 0.56, + "grad_norm": 1.3952325582504272, + "learning_rate": 2.192202462380301e-05, + "loss": 1.8675, + "step": 6568 + }, + { + "epoch": 0.56, + "grad_norm": 1.780047059059143, + "learning_rate": 2.1904924760601915e-05, + "loss": 2.037, + "step": 6572 + }, + { + "epoch": 0.56, + "grad_norm": 1.719232439994812, + "learning_rate": 2.1887824897400823e-05, + "loss": 1.9979, + "step": 6576 + }, + { + "epoch": 0.56, + "grad_norm": 1.8734204769134521, + "learning_rate": 2.1870725034199727e-05, + "loss": 2.0388, + "step": 6580 + }, + { + "epoch": 0.56, + "grad_norm": 1.5046261548995972, + "learning_rate": 2.185362517099863e-05, + "loss": 2.0383, + "step": 6584 + }, + { + "epoch": 0.56, + "grad_norm": 1.5430729389190674, + "learning_rate": 2.1836525307797538e-05, + "loss": 2.1083, + "step": 6588 + }, + { + "epoch": 0.56, + "grad_norm": 1.647208571434021, + "learning_rate": 2.1819425444596445e-05, + "loss": 2.0813, + "step": 6592 + }, + { + "epoch": 0.56, + "grad_norm": 1.5349608659744263, + "learning_rate": 2.180232558139535e-05, + "loss": 1.9528, + "step": 6596 + }, + { + "epoch": 0.56, + "grad_norm": 1.6204650402069092, + "learning_rate": 2.1785225718194253e-05, + "loss": 2.1957, + "step": 6600 + }, + { + "epoch": 0.56, + "grad_norm": 1.6103018522262573, + "learning_rate": 2.176812585499316e-05, + "loss": 1.9506, + "step": 6604 + }, + { + "epoch": 0.56, + "grad_norm": 2.1254968643188477, + "learning_rate": 2.1751025991792068e-05, + "loss": 2.1188, + "step": 6608 + }, + { + "epoch": 0.57, + "grad_norm": 1.5318729877471924, + "learning_rate": 2.1733926128590972e-05, + "loss": 2.0858, + "step": 6612 + }, + { + "epoch": 0.57, + "grad_norm": 1.4337228536605835, + "learning_rate": 2.171682626538988e-05, + "loss": 1.9164, + "step": 6616 + }, + { + "epoch": 0.57, + "grad_norm": 1.4926187992095947, + "learning_rate": 2.1699726402188783e-05, + "loss": 1.9784, + "step": 6620 + }, + { + "epoch": 0.57, + "grad_norm": 1.419592022895813, + "learning_rate": 2.1682626538987687e-05, + "loss": 1.9949, + "step": 6624 + }, + { + "epoch": 0.57, + "grad_norm": 1.4496034383773804, + "learning_rate": 2.1665526675786594e-05, + "loss": 2.068, + "step": 6628 + }, + { + "epoch": 0.57, + "grad_norm": 1.4800695180892944, + "learning_rate": 2.1648426812585502e-05, + "loss": 1.9682, + "step": 6632 + }, + { + "epoch": 0.57, + "grad_norm": 1.7047966718673706, + "learning_rate": 2.1631326949384406e-05, + "loss": 2.0647, + "step": 6636 + }, + { + "epoch": 0.57, + "grad_norm": 1.4624961614608765, + "learning_rate": 2.161422708618331e-05, + "loss": 1.9154, + "step": 6640 + }, + { + "epoch": 0.57, + "grad_norm": 1.4518662691116333, + "learning_rate": 2.1597127222982217e-05, + "loss": 2.1044, + "step": 6644 + }, + { + "epoch": 0.57, + "grad_norm": 1.488329529762268, + "learning_rate": 2.158002735978112e-05, + "loss": 2.0274, + "step": 6648 + }, + { + "epoch": 0.57, + "grad_norm": 1.4353028535842896, + "learning_rate": 2.156292749658003e-05, + "loss": 1.9888, + "step": 6652 + }, + { + "epoch": 0.57, + "grad_norm": 1.470017433166504, + "learning_rate": 2.1545827633378936e-05, + "loss": 1.9887, + "step": 6656 + }, + { + "epoch": 0.57, + "grad_norm": 1.5084627866744995, + "learning_rate": 2.152872777017784e-05, + "loss": 1.9513, + "step": 6660 + }, + { + "epoch": 0.57, + "grad_norm": 1.5296393632888794, + "learning_rate": 2.1511627906976744e-05, + "loss": 1.8803, + "step": 6664 + }, + { + "epoch": 0.57, + "grad_norm": 1.597259283065796, + "learning_rate": 2.149452804377565e-05, + "loss": 1.9239, + "step": 6668 + }, + { + "epoch": 0.57, + "grad_norm": 1.476731300354004, + "learning_rate": 2.1477428180574558e-05, + "loss": 2.0592, + "step": 6672 + }, + { + "epoch": 0.57, + "grad_norm": 1.5087945461273193, + "learning_rate": 2.1460328317373462e-05, + "loss": 2.0161, + "step": 6676 + }, + { + "epoch": 0.57, + "grad_norm": 1.6463593244552612, + "learning_rate": 2.1443228454172366e-05, + "loss": 2.0282, + "step": 6680 + }, + { + "epoch": 0.57, + "grad_norm": 1.6463061571121216, + "learning_rate": 2.1426128590971274e-05, + "loss": 2.0664, + "step": 6684 + }, + { + "epoch": 0.57, + "grad_norm": 1.5039496421813965, + "learning_rate": 2.1409028727770177e-05, + "loss": 2.0983, + "step": 6688 + }, + { + "epoch": 0.57, + "grad_norm": 1.4754881858825684, + "learning_rate": 2.1391928864569085e-05, + "loss": 2.0178, + "step": 6692 + }, + { + "epoch": 0.57, + "grad_norm": 1.7630116939544678, + "learning_rate": 2.1374829001367992e-05, + "loss": 2.0397, + "step": 6696 + }, + { + "epoch": 0.57, + "grad_norm": 1.4631503820419312, + "learning_rate": 2.1357729138166896e-05, + "loss": 1.9825, + "step": 6700 + }, + { + "epoch": 0.57, + "grad_norm": 1.5634981393814087, + "learning_rate": 2.13406292749658e-05, + "loss": 1.8953, + "step": 6704 + }, + { + "epoch": 0.57, + "grad_norm": 1.5709517002105713, + "learning_rate": 2.1323529411764707e-05, + "loss": 1.9242, + "step": 6708 + }, + { + "epoch": 0.57, + "grad_norm": 1.5792012214660645, + "learning_rate": 2.130642954856361e-05, + "loss": 2.2404, + "step": 6712 + }, + { + "epoch": 0.57, + "grad_norm": 1.5689932107925415, + "learning_rate": 2.128932968536252e-05, + "loss": 2.1333, + "step": 6716 + }, + { + "epoch": 0.57, + "grad_norm": 1.3986164331436157, + "learning_rate": 2.1272229822161423e-05, + "loss": 1.8481, + "step": 6720 + }, + { + "epoch": 0.57, + "grad_norm": 1.62111496925354, + "learning_rate": 2.125512995896033e-05, + "loss": 2.057, + "step": 6724 + }, + { + "epoch": 0.58, + "grad_norm": 1.6501810550689697, + "learning_rate": 2.1238030095759234e-05, + "loss": 2.0681, + "step": 6728 + }, + { + "epoch": 0.58, + "grad_norm": 1.5174288749694824, + "learning_rate": 2.1220930232558138e-05, + "loss": 2.0297, + "step": 6732 + }, + { + "epoch": 0.58, + "grad_norm": 1.5836937427520752, + "learning_rate": 2.120383036935705e-05, + "loss": 2.0201, + "step": 6736 + }, + { + "epoch": 0.58, + "grad_norm": 1.6528754234313965, + "learning_rate": 2.1186730506155953e-05, + "loss": 2.0832, + "step": 6740 + }, + { + "epoch": 0.58, + "grad_norm": 1.435103416442871, + "learning_rate": 2.1169630642954857e-05, + "loss": 1.9953, + "step": 6744 + }, + { + "epoch": 0.58, + "grad_norm": 1.4534496068954468, + "learning_rate": 2.1152530779753764e-05, + "loss": 2.0343, + "step": 6748 + }, + { + "epoch": 0.58, + "grad_norm": 1.5175713300704956, + "learning_rate": 2.1135430916552668e-05, + "loss": 1.9486, + "step": 6752 + }, + { + "epoch": 0.58, + "grad_norm": 1.3781896829605103, + "learning_rate": 2.1118331053351575e-05, + "loss": 1.8642, + "step": 6756 + }, + { + "epoch": 0.58, + "grad_norm": 1.5847692489624023, + "learning_rate": 2.110123119015048e-05, + "loss": 1.9693, + "step": 6760 + }, + { + "epoch": 0.58, + "grad_norm": 1.9104549884796143, + "learning_rate": 2.1084131326949386e-05, + "loss": 2.0385, + "step": 6764 + }, + { + "epoch": 0.58, + "grad_norm": 1.6398965120315552, + "learning_rate": 2.106703146374829e-05, + "loss": 2.0187, + "step": 6768 + }, + { + "epoch": 0.58, + "grad_norm": 1.4324244260787964, + "learning_rate": 2.1049931600547194e-05, + "loss": 1.8963, + "step": 6772 + }, + { + "epoch": 0.58, + "grad_norm": 1.57370126247406, + "learning_rate": 2.1032831737346102e-05, + "loss": 2.0443, + "step": 6776 + }, + { + "epoch": 0.58, + "grad_norm": 1.39073646068573, + "learning_rate": 2.101573187414501e-05, + "loss": 1.9976, + "step": 6780 + }, + { + "epoch": 0.58, + "grad_norm": 1.6951498985290527, + "learning_rate": 2.0998632010943913e-05, + "loss": 1.7654, + "step": 6784 + }, + { + "epoch": 0.58, + "grad_norm": 1.5796420574188232, + "learning_rate": 2.098153214774282e-05, + "loss": 1.9724, + "step": 6788 + }, + { + "epoch": 0.58, + "grad_norm": 1.536661148071289, + "learning_rate": 2.0964432284541724e-05, + "loss": 2.0424, + "step": 6792 + }, + { + "epoch": 0.58, + "grad_norm": 1.6385177373886108, + "learning_rate": 2.094733242134063e-05, + "loss": 2.0934, + "step": 6796 + }, + { + "epoch": 0.58, + "grad_norm": 1.5548115968704224, + "learning_rate": 2.0930232558139536e-05, + "loss": 1.8169, + "step": 6800 + }, + { + "epoch": 0.58, + "grad_norm": 1.4583497047424316, + "learning_rate": 2.0913132694938443e-05, + "loss": 1.9631, + "step": 6804 + }, + { + "epoch": 0.58, + "grad_norm": 1.6827950477600098, + "learning_rate": 2.0896032831737347e-05, + "loss": 1.8003, + "step": 6808 + }, + { + "epoch": 0.58, + "grad_norm": 1.881792664527893, + "learning_rate": 2.087893296853625e-05, + "loss": 1.9482, + "step": 6812 + }, + { + "epoch": 0.58, + "grad_norm": 1.5031013488769531, + "learning_rate": 2.0861833105335158e-05, + "loss": 1.888, + "step": 6816 + }, + { + "epoch": 0.58, + "grad_norm": 1.7408212423324585, + "learning_rate": 2.0844733242134066e-05, + "loss": 2.1306, + "step": 6820 + }, + { + "epoch": 0.58, + "grad_norm": 1.5273849964141846, + "learning_rate": 2.082763337893297e-05, + "loss": 1.9256, + "step": 6824 + }, + { + "epoch": 0.58, + "grad_norm": 1.6354838609695435, + "learning_rate": 2.0810533515731877e-05, + "loss": 1.9018, + "step": 6828 + }, + { + "epoch": 0.58, + "grad_norm": 1.5808496475219727, + "learning_rate": 2.079343365253078e-05, + "loss": 1.9239, + "step": 6832 + }, + { + "epoch": 0.58, + "grad_norm": 1.5880577564239502, + "learning_rate": 2.0776333789329685e-05, + "loss": 1.9876, + "step": 6836 + }, + { + "epoch": 0.58, + "grad_norm": 1.554107904434204, + "learning_rate": 2.0759233926128592e-05, + "loss": 2.1325, + "step": 6840 + }, + { + "epoch": 0.59, + "grad_norm": 1.7010103464126587, + "learning_rate": 2.07421340629275e-05, + "loss": 2.1108, + "step": 6844 + }, + { + "epoch": 0.59, + "grad_norm": 1.4548836946487427, + "learning_rate": 2.0725034199726403e-05, + "loss": 1.854, + "step": 6848 + }, + { + "epoch": 0.59, + "grad_norm": 1.602513074874878, + "learning_rate": 2.0707934336525307e-05, + "loss": 1.9275, + "step": 6852 + }, + { + "epoch": 0.59, + "grad_norm": 1.379732608795166, + "learning_rate": 2.0690834473324215e-05, + "loss": 1.8298, + "step": 6856 + }, + { + "epoch": 0.59, + "grad_norm": 1.5978758335113525, + "learning_rate": 2.067373461012312e-05, + "loss": 2.0292, + "step": 6860 + }, + { + "epoch": 0.59, + "grad_norm": 1.5919899940490723, + "learning_rate": 2.0656634746922026e-05, + "loss": 1.9795, + "step": 6864 + }, + { + "epoch": 0.59, + "grad_norm": 1.4929299354553223, + "learning_rate": 2.0639534883720933e-05, + "loss": 2.0711, + "step": 6868 + }, + { + "epoch": 0.59, + "grad_norm": 1.5734412670135498, + "learning_rate": 2.0622435020519837e-05, + "loss": 1.9286, + "step": 6872 + }, + { + "epoch": 0.59, + "grad_norm": 1.3680405616760254, + "learning_rate": 2.060533515731874e-05, + "loss": 1.9525, + "step": 6876 + }, + { + "epoch": 0.59, + "grad_norm": 1.4015724658966064, + "learning_rate": 2.058823529411765e-05, + "loss": 1.8651, + "step": 6880 + }, + { + "epoch": 0.59, + "grad_norm": 1.4925768375396729, + "learning_rate": 2.0571135430916556e-05, + "loss": 1.8487, + "step": 6884 + }, + { + "epoch": 0.59, + "grad_norm": 1.5962053537368774, + "learning_rate": 2.055403556771546e-05, + "loss": 2.0726, + "step": 6888 + }, + { + "epoch": 0.59, + "grad_norm": 1.5657358169555664, + "learning_rate": 2.0536935704514364e-05, + "loss": 2.0781, + "step": 6892 + }, + { + "epoch": 0.59, + "grad_norm": 1.4741593599319458, + "learning_rate": 2.051983584131327e-05, + "loss": 1.9721, + "step": 6896 + }, + { + "epoch": 0.59, + "grad_norm": 1.5878019332885742, + "learning_rate": 2.0502735978112175e-05, + "loss": 1.9813, + "step": 6900 + }, + { + "epoch": 0.59, + "grad_norm": 1.4257550239562988, + "learning_rate": 2.048563611491108e-05, + "loss": 1.9342, + "step": 6904 + }, + { + "epoch": 0.59, + "grad_norm": 1.603327751159668, + "learning_rate": 2.046853625170999e-05, + "loss": 2.0587, + "step": 6908 + }, + { + "epoch": 0.59, + "grad_norm": 1.4854941368103027, + "learning_rate": 2.0451436388508894e-05, + "loss": 1.8446, + "step": 6912 + }, + { + "epoch": 0.59, + "grad_norm": 1.5847572088241577, + "learning_rate": 2.0434336525307798e-05, + "loss": 2.1653, + "step": 6916 + }, + { + "epoch": 0.59, + "grad_norm": 1.638177514076233, + "learning_rate": 2.0417236662106705e-05, + "loss": 2.0012, + "step": 6920 + }, + { + "epoch": 0.59, + "grad_norm": 1.627571702003479, + "learning_rate": 2.040013679890561e-05, + "loss": 1.9958, + "step": 6924 + }, + { + "epoch": 0.59, + "grad_norm": 1.5089852809906006, + "learning_rate": 2.0383036935704516e-05, + "loss": 1.9816, + "step": 6928 + }, + { + "epoch": 0.59, + "grad_norm": 1.6622666120529175, + "learning_rate": 2.036593707250342e-05, + "loss": 2.0482, + "step": 6932 + }, + { + "epoch": 0.59, + "grad_norm": 1.5746067762374878, + "learning_rate": 2.0348837209302328e-05, + "loss": 2.0877, + "step": 6936 + }, + { + "epoch": 0.59, + "grad_norm": 1.4563333988189697, + "learning_rate": 2.033173734610123e-05, + "loss": 2.0376, + "step": 6940 + }, + { + "epoch": 0.59, + "grad_norm": 1.417210340499878, + "learning_rate": 2.0314637482900136e-05, + "loss": 1.8537, + "step": 6944 + }, + { + "epoch": 0.59, + "grad_norm": 1.644472360610962, + "learning_rate": 2.0297537619699043e-05, + "loss": 2.0294, + "step": 6948 + }, + { + "epoch": 0.59, + "grad_norm": 1.462584137916565, + "learning_rate": 2.028043775649795e-05, + "loss": 2.0171, + "step": 6952 + }, + { + "epoch": 0.59, + "grad_norm": 1.6051154136657715, + "learning_rate": 2.0263337893296854e-05, + "loss": 2.0095, + "step": 6956 + }, + { + "epoch": 0.6, + "grad_norm": 1.6152054071426392, + "learning_rate": 2.0246238030095758e-05, + "loss": 1.9219, + "step": 6960 + }, + { + "epoch": 0.6, + "grad_norm": 1.3505607843399048, + "learning_rate": 2.0229138166894666e-05, + "loss": 2.0034, + "step": 6964 + }, + { + "epoch": 0.6, + "grad_norm": 1.522525429725647, + "learning_rate": 2.021203830369357e-05, + "loss": 1.8819, + "step": 6968 + }, + { + "epoch": 0.6, + "grad_norm": 1.6121808290481567, + "learning_rate": 2.0194938440492477e-05, + "loss": 2.2193, + "step": 6972 + }, + { + "epoch": 0.6, + "grad_norm": 1.8233622312545776, + "learning_rate": 2.0177838577291384e-05, + "loss": 1.8937, + "step": 6976 + }, + { + "epoch": 0.6, + "grad_norm": 1.6225485801696777, + "learning_rate": 2.0160738714090288e-05, + "loss": 1.9533, + "step": 6980 + }, + { + "epoch": 0.6, + "grad_norm": 1.5252420902252197, + "learning_rate": 2.0143638850889192e-05, + "loss": 1.9956, + "step": 6984 + }, + { + "epoch": 0.6, + "grad_norm": 1.5580767393112183, + "learning_rate": 2.01265389876881e-05, + "loss": 2.0131, + "step": 6988 + }, + { + "epoch": 0.6, + "grad_norm": 1.5372722148895264, + "learning_rate": 2.0109439124487007e-05, + "loss": 1.9964, + "step": 6992 + }, + { + "epoch": 0.6, + "grad_norm": 1.500069499015808, + "learning_rate": 2.009233926128591e-05, + "loss": 2.0251, + "step": 6996 + }, + { + "epoch": 0.6, + "grad_norm": 1.6729307174682617, + "learning_rate": 2.0075239398084815e-05, + "loss": 1.9491, + "step": 7000 + }, + { + "epoch": 0.6, + "grad_norm": 1.8070749044418335, + "learning_rate": 2.0058139534883722e-05, + "loss": 1.9621, + "step": 7004 + }, + { + "epoch": 0.6, + "grad_norm": 1.5209249258041382, + "learning_rate": 2.0041039671682626e-05, + "loss": 1.8494, + "step": 7008 + }, + { + "epoch": 0.6, + "grad_norm": 1.4102156162261963, + "learning_rate": 2.0023939808481533e-05, + "loss": 1.8024, + "step": 7012 + }, + { + "epoch": 0.6, + "grad_norm": 1.6024938821792603, + "learning_rate": 2.000683994528044e-05, + "loss": 2.0584, + "step": 7016 + }, + { + "epoch": 0.6, + "grad_norm": 1.5164461135864258, + "learning_rate": 1.9989740082079345e-05, + "loss": 1.9705, + "step": 7020 + }, + { + "epoch": 0.6, + "grad_norm": 1.4683334827423096, + "learning_rate": 1.997264021887825e-05, + "loss": 1.8526, + "step": 7024 + }, + { + "epoch": 0.6, + "grad_norm": 1.5955207347869873, + "learning_rate": 1.9955540355677156e-05, + "loss": 2.0018, + "step": 7028 + }, + { + "epoch": 0.6, + "grad_norm": 1.6623783111572266, + "learning_rate": 1.993844049247606e-05, + "loss": 2.1312, + "step": 7032 + }, + { + "epoch": 0.6, + "grad_norm": 1.660420298576355, + "learning_rate": 1.9921340629274967e-05, + "loss": 1.8685, + "step": 7036 + }, + { + "epoch": 0.6, + "grad_norm": 1.7066700458526611, + "learning_rate": 1.990424076607387e-05, + "loss": 1.9847, + "step": 7040 + }, + { + "epoch": 0.6, + "grad_norm": 1.5725295543670654, + "learning_rate": 1.988714090287278e-05, + "loss": 1.9324, + "step": 7044 + }, + { + "epoch": 0.6, + "grad_norm": 1.5613770484924316, + "learning_rate": 1.9870041039671683e-05, + "loss": 2.0648, + "step": 7048 + }, + { + "epoch": 0.6, + "grad_norm": 1.4569567441940308, + "learning_rate": 1.9852941176470586e-05, + "loss": 1.8675, + "step": 7052 + }, + { + "epoch": 0.6, + "grad_norm": 1.610832691192627, + "learning_rate": 1.9835841313269497e-05, + "loss": 2.0513, + "step": 7056 + }, + { + "epoch": 0.6, + "grad_norm": 1.7159889936447144, + "learning_rate": 1.98187414500684e-05, + "loss": 1.8884, + "step": 7060 + }, + { + "epoch": 0.6, + "grad_norm": 1.6798243522644043, + "learning_rate": 1.9801641586867305e-05, + "loss": 1.9021, + "step": 7064 + }, + { + "epoch": 0.6, + "grad_norm": 1.87516450881958, + "learning_rate": 1.9784541723666212e-05, + "loss": 2.1314, + "step": 7068 + }, + { + "epoch": 0.6, + "grad_norm": 1.465100646018982, + "learning_rate": 1.9767441860465116e-05, + "loss": 1.9359, + "step": 7072 + }, + { + "epoch": 0.6, + "grad_norm": 1.7637323141098022, + "learning_rate": 1.9750341997264024e-05, + "loss": 2.0628, + "step": 7076 + }, + { + "epoch": 0.61, + "grad_norm": 1.4772495031356812, + "learning_rate": 1.9733242134062928e-05, + "loss": 2.0258, + "step": 7080 + }, + { + "epoch": 0.61, + "grad_norm": 1.5895733833312988, + "learning_rate": 1.9716142270861835e-05, + "loss": 1.8629, + "step": 7084 + }, + { + "epoch": 0.61, + "grad_norm": 1.5556437969207764, + "learning_rate": 1.969904240766074e-05, + "loss": 1.9945, + "step": 7088 + }, + { + "epoch": 0.61, + "grad_norm": 1.502622365951538, + "learning_rate": 1.9681942544459643e-05, + "loss": 2.0398, + "step": 7092 + }, + { + "epoch": 0.61, + "grad_norm": 1.5926116704940796, + "learning_rate": 1.966484268125855e-05, + "loss": 1.8625, + "step": 7096 + }, + { + "epoch": 0.61, + "grad_norm": 1.5113784074783325, + "learning_rate": 1.9647742818057458e-05, + "loss": 2.0816, + "step": 7100 + }, + { + "epoch": 0.61, + "grad_norm": 1.5072494745254517, + "learning_rate": 1.963064295485636e-05, + "loss": 1.9681, + "step": 7104 + }, + { + "epoch": 0.61, + "grad_norm": 1.6104905605316162, + "learning_rate": 1.961354309165527e-05, + "loss": 2.1155, + "step": 7108 + }, + { + "epoch": 0.61, + "grad_norm": 1.6391701698303223, + "learning_rate": 1.9596443228454173e-05, + "loss": 1.9331, + "step": 7112 + }, + { + "epoch": 0.61, + "grad_norm": 1.5969183444976807, + "learning_rate": 1.9579343365253077e-05, + "loss": 1.9331, + "step": 7116 + }, + { + "epoch": 0.61, + "grad_norm": 1.6461735963821411, + "learning_rate": 1.9562243502051984e-05, + "loss": 2.115, + "step": 7120 + }, + { + "epoch": 0.61, + "grad_norm": 1.572561264038086, + "learning_rate": 1.954514363885089e-05, + "loss": 2.0272, + "step": 7124 + }, + { + "epoch": 0.61, + "grad_norm": 1.5880368947982788, + "learning_rate": 1.9528043775649796e-05, + "loss": 2.0262, + "step": 7128 + }, + { + "epoch": 0.61, + "grad_norm": 1.697801947593689, + "learning_rate": 1.95109439124487e-05, + "loss": 2.1504, + "step": 7132 + }, + { + "epoch": 0.61, + "grad_norm": 1.5090667009353638, + "learning_rate": 1.9493844049247607e-05, + "loss": 2.0096, + "step": 7136 + }, + { + "epoch": 0.61, + "grad_norm": 1.6617227792739868, + "learning_rate": 1.9476744186046514e-05, + "loss": 1.9429, + "step": 7140 + }, + { + "epoch": 0.61, + "grad_norm": 1.489587426185608, + "learning_rate": 1.9459644322845418e-05, + "loss": 1.9735, + "step": 7144 + }, + { + "epoch": 0.61, + "grad_norm": 1.4970664978027344, + "learning_rate": 1.9442544459644325e-05, + "loss": 2.0434, + "step": 7148 + }, + { + "epoch": 0.61, + "grad_norm": 1.7819324731826782, + "learning_rate": 1.942544459644323e-05, + "loss": 2.0847, + "step": 7152 + }, + { + "epoch": 0.61, + "grad_norm": 1.572629690170288, + "learning_rate": 1.9408344733242133e-05, + "loss": 2.0716, + "step": 7156 + }, + { + "epoch": 0.61, + "grad_norm": 1.6527976989746094, + "learning_rate": 1.939124487004104e-05, + "loss": 1.9941, + "step": 7160 + }, + { + "epoch": 0.61, + "grad_norm": 1.7278069257736206, + "learning_rate": 1.9374145006839948e-05, + "loss": 1.9261, + "step": 7164 + }, + { + "epoch": 0.61, + "grad_norm": 1.5078670978546143, + "learning_rate": 1.9357045143638852e-05, + "loss": 1.9513, + "step": 7168 + }, + { + "epoch": 0.61, + "grad_norm": 1.6161733865737915, + "learning_rate": 1.9339945280437756e-05, + "loss": 1.9143, + "step": 7172 + }, + { + "epoch": 0.61, + "grad_norm": 1.7204173803329468, + "learning_rate": 1.9322845417236663e-05, + "loss": 1.8862, + "step": 7176 + }, + { + "epoch": 0.61, + "grad_norm": 1.607851266860962, + "learning_rate": 1.9305745554035567e-05, + "loss": 1.98, + "step": 7180 + }, + { + "epoch": 0.61, + "grad_norm": 1.5279322862625122, + "learning_rate": 1.9288645690834475e-05, + "loss": 1.8538, + "step": 7184 + }, + { + "epoch": 0.61, + "grad_norm": 1.5147992372512817, + "learning_rate": 1.9271545827633382e-05, + "loss": 1.9975, + "step": 7188 + }, + { + "epoch": 0.61, + "grad_norm": 1.6696971654891968, + "learning_rate": 1.9254445964432286e-05, + "loss": 1.8899, + "step": 7192 + }, + { + "epoch": 0.62, + "grad_norm": 1.3951395750045776, + "learning_rate": 1.923734610123119e-05, + "loss": 1.9392, + "step": 7196 + }, + { + "epoch": 0.62, + "grad_norm": 1.5910269021987915, + "learning_rate": 1.9220246238030097e-05, + "loss": 2.082, + "step": 7200 + }, + { + "epoch": 0.62, + "grad_norm": 1.5352290868759155, + "learning_rate": 1.9203146374829005e-05, + "loss": 2.0371, + "step": 7204 + }, + { + "epoch": 0.62, + "grad_norm": 1.521520972251892, + "learning_rate": 1.918604651162791e-05, + "loss": 1.9586, + "step": 7208 + }, + { + "epoch": 0.62, + "grad_norm": 1.501772403717041, + "learning_rate": 1.9168946648426812e-05, + "loss": 1.982, + "step": 7212 + }, + { + "epoch": 0.62, + "grad_norm": 1.548953890800476, + "learning_rate": 1.915184678522572e-05, + "loss": 2.0191, + "step": 7216 + }, + { + "epoch": 0.62, + "grad_norm": 1.6001508235931396, + "learning_rate": 1.9134746922024624e-05, + "loss": 2.0142, + "step": 7220 + }, + { + "epoch": 0.62, + "grad_norm": 1.6897025108337402, + "learning_rate": 1.9117647058823528e-05, + "loss": 1.9539, + "step": 7224 + }, + { + "epoch": 0.62, + "grad_norm": 1.6484875679016113, + "learning_rate": 1.910054719562244e-05, + "loss": 1.9652, + "step": 7228 + }, + { + "epoch": 0.62, + "grad_norm": 1.504267692565918, + "learning_rate": 1.9083447332421342e-05, + "loss": 1.8661, + "step": 7232 + }, + { + "epoch": 0.62, + "grad_norm": 1.442720890045166, + "learning_rate": 1.9066347469220246e-05, + "loss": 1.849, + "step": 7236 + }, + { + "epoch": 0.62, + "grad_norm": 1.3908931016921997, + "learning_rate": 1.9049247606019154e-05, + "loss": 2.0333, + "step": 7240 + }, + { + "epoch": 0.62, + "grad_norm": 1.7459537982940674, + "learning_rate": 1.9032147742818058e-05, + "loss": 1.98, + "step": 7244 + }, + { + "epoch": 0.62, + "grad_norm": 1.5512782335281372, + "learning_rate": 1.9015047879616965e-05, + "loss": 2.1063, + "step": 7248 + }, + { + "epoch": 0.62, + "grad_norm": 1.5178790092468262, + "learning_rate": 1.899794801641587e-05, + "loss": 2.0102, + "step": 7252 + }, + { + "epoch": 0.62, + "grad_norm": 1.7210770845413208, + "learning_rate": 1.8980848153214776e-05, + "loss": 1.9342, + "step": 7256 + }, + { + "epoch": 0.62, + "grad_norm": 1.472294569015503, + "learning_rate": 1.896374829001368e-05, + "loss": 2.0667, + "step": 7260 + }, + { + "epoch": 0.62, + "grad_norm": 1.5506442785263062, + "learning_rate": 1.8946648426812584e-05, + "loss": 2.2188, + "step": 7264 + }, + { + "epoch": 0.62, + "grad_norm": 1.6018651723861694, + "learning_rate": 1.8929548563611495e-05, + "loss": 1.9151, + "step": 7268 + }, + { + "epoch": 0.62, + "grad_norm": 1.6608229875564575, + "learning_rate": 1.89124487004104e-05, + "loss": 1.8974, + "step": 7272 + }, + { + "epoch": 0.62, + "grad_norm": 1.580942153930664, + "learning_rate": 1.8895348837209303e-05, + "loss": 1.9818, + "step": 7276 + }, + { + "epoch": 0.62, + "grad_norm": 1.5641908645629883, + "learning_rate": 1.887824897400821e-05, + "loss": 1.8878, + "step": 7280 + }, + { + "epoch": 0.62, + "grad_norm": 1.6174907684326172, + "learning_rate": 1.8861149110807114e-05, + "loss": 2.0124, + "step": 7284 + }, + { + "epoch": 0.62, + "grad_norm": 1.6148037910461426, + "learning_rate": 1.8844049247606018e-05, + "loss": 1.902, + "step": 7288 + }, + { + "epoch": 0.62, + "grad_norm": 1.6356844902038574, + "learning_rate": 1.8826949384404925e-05, + "loss": 2.0167, + "step": 7292 + }, + { + "epoch": 0.62, + "grad_norm": 1.441463589668274, + "learning_rate": 1.8809849521203833e-05, + "loss": 1.8633, + "step": 7296 + }, + { + "epoch": 0.62, + "grad_norm": 1.5125188827514648, + "learning_rate": 1.8792749658002737e-05, + "loss": 1.7871, + "step": 7300 + }, + { + "epoch": 0.62, + "grad_norm": 1.4316954612731934, + "learning_rate": 1.877564979480164e-05, + "loss": 2.0366, + "step": 7304 + }, + { + "epoch": 0.62, + "grad_norm": 1.7826207876205444, + "learning_rate": 1.8758549931600548e-05, + "loss": 1.9861, + "step": 7308 + }, + { + "epoch": 0.63, + "grad_norm": 1.7452281713485718, + "learning_rate": 1.8741450068399455e-05, + "loss": 2.1054, + "step": 7312 + }, + { + "epoch": 0.63, + "grad_norm": 1.5087822675704956, + "learning_rate": 1.872435020519836e-05, + "loss": 1.9194, + "step": 7316 + }, + { + "epoch": 0.63, + "grad_norm": 1.560201644897461, + "learning_rate": 1.8707250341997263e-05, + "loss": 1.983, + "step": 7320 + }, + { + "epoch": 0.63, + "grad_norm": 1.5828901529312134, + "learning_rate": 1.869015047879617e-05, + "loss": 2.0051, + "step": 7324 + }, + { + "epoch": 0.63, + "grad_norm": 1.5497353076934814, + "learning_rate": 1.8673050615595075e-05, + "loss": 2.0197, + "step": 7328 + }, + { + "epoch": 0.63, + "grad_norm": 1.5281920433044434, + "learning_rate": 1.8655950752393982e-05, + "loss": 1.9502, + "step": 7332 + }, + { + "epoch": 0.63, + "grad_norm": 1.6356514692306519, + "learning_rate": 1.863885088919289e-05, + "loss": 2.069, + "step": 7336 + }, + { + "epoch": 0.63, + "grad_norm": 1.4687514305114746, + "learning_rate": 1.8621751025991793e-05, + "loss": 1.931, + "step": 7340 + }, + { + "epoch": 0.63, + "grad_norm": 1.604845643043518, + "learning_rate": 1.8604651162790697e-05, + "loss": 2.0305, + "step": 7344 + }, + { + "epoch": 0.63, + "grad_norm": 1.5669852495193481, + "learning_rate": 1.8587551299589605e-05, + "loss": 2.0094, + "step": 7348 + }, + { + "epoch": 0.63, + "grad_norm": 1.586909532546997, + "learning_rate": 1.857045143638851e-05, + "loss": 2.0053, + "step": 7352 + }, + { + "epoch": 0.63, + "grad_norm": 1.5457959175109863, + "learning_rate": 1.8553351573187416e-05, + "loss": 1.9575, + "step": 7356 + }, + { + "epoch": 0.63, + "grad_norm": 1.4828368425369263, + "learning_rate": 1.853625170998632e-05, + "loss": 1.9047, + "step": 7360 + }, + { + "epoch": 0.63, + "grad_norm": 1.5736109018325806, + "learning_rate": 1.8519151846785227e-05, + "loss": 1.93, + "step": 7364 + }, + { + "epoch": 0.63, + "grad_norm": 1.7825120687484741, + "learning_rate": 1.850205198358413e-05, + "loss": 1.9667, + "step": 7368 + }, + { + "epoch": 0.63, + "grad_norm": 1.529508113861084, + "learning_rate": 1.8484952120383035e-05, + "loss": 1.8791, + "step": 7372 + }, + { + "epoch": 0.63, + "grad_norm": 1.5138580799102783, + "learning_rate": 1.8467852257181946e-05, + "loss": 1.932, + "step": 7376 + }, + { + "epoch": 0.63, + "grad_norm": 1.619393229484558, + "learning_rate": 1.845075239398085e-05, + "loss": 1.9461, + "step": 7380 + }, + { + "epoch": 0.63, + "grad_norm": 1.4028980731964111, + "learning_rate": 1.8433652530779754e-05, + "loss": 1.9428, + "step": 7384 + }, + { + "epoch": 0.63, + "grad_norm": 1.631604552268982, + "learning_rate": 1.841655266757866e-05, + "loss": 1.9728, + "step": 7388 + }, + { + "epoch": 0.63, + "grad_norm": 1.4649015665054321, + "learning_rate": 1.8399452804377565e-05, + "loss": 1.8762, + "step": 7392 + }, + { + "epoch": 0.63, + "grad_norm": 1.48709237575531, + "learning_rate": 1.8382352941176472e-05, + "loss": 1.9312, + "step": 7396 + }, + { + "epoch": 0.63, + "grad_norm": 1.6089003086090088, + "learning_rate": 1.8365253077975376e-05, + "loss": 1.8671, + "step": 7400 + }, + { + "epoch": 0.63, + "grad_norm": 1.723889708518982, + "learning_rate": 1.8348153214774284e-05, + "loss": 1.9107, + "step": 7404 + }, + { + "epoch": 0.63, + "grad_norm": 1.5013092756271362, + "learning_rate": 1.8331053351573188e-05, + "loss": 1.9322, + "step": 7408 + }, + { + "epoch": 0.63, + "grad_norm": 1.8294460773468018, + "learning_rate": 1.831395348837209e-05, + "loss": 2.1004, + "step": 7412 + }, + { + "epoch": 0.63, + "grad_norm": 1.4940433502197266, + "learning_rate": 1.8296853625171e-05, + "loss": 1.882, + "step": 7416 + }, + { + "epoch": 0.63, + "grad_norm": 1.6243213415145874, + "learning_rate": 1.8279753761969906e-05, + "loss": 2.1548, + "step": 7420 + }, + { + "epoch": 0.63, + "grad_norm": 1.934673547744751, + "learning_rate": 1.826265389876881e-05, + "loss": 1.9928, + "step": 7424 + }, + { + "epoch": 0.64, + "grad_norm": 1.7047098875045776, + "learning_rate": 1.8245554035567718e-05, + "loss": 1.9881, + "step": 7428 + }, + { + "epoch": 0.64, + "grad_norm": 1.5831687450408936, + "learning_rate": 1.822845417236662e-05, + "loss": 1.9889, + "step": 7432 + }, + { + "epoch": 0.64, + "grad_norm": 1.6739925146102905, + "learning_rate": 1.8211354309165525e-05, + "loss": 1.9835, + "step": 7436 + }, + { + "epoch": 0.64, + "grad_norm": 1.4848778247833252, + "learning_rate": 1.8194254445964433e-05, + "loss": 1.9711, + "step": 7440 + }, + { + "epoch": 0.64, + "grad_norm": 1.499522089958191, + "learning_rate": 1.817715458276334e-05, + "loss": 1.9454, + "step": 7444 + }, + { + "epoch": 0.64, + "grad_norm": 1.6421815156936646, + "learning_rate": 1.8160054719562244e-05, + "loss": 1.9402, + "step": 7448 + }, + { + "epoch": 0.64, + "grad_norm": 1.7279685735702515, + "learning_rate": 1.8142954856361148e-05, + "loss": 1.7985, + "step": 7452 + }, + { + "epoch": 0.64, + "grad_norm": 1.6793206930160522, + "learning_rate": 1.8125854993160055e-05, + "loss": 2.0187, + "step": 7456 + }, + { + "epoch": 0.64, + "grad_norm": 1.4948447942733765, + "learning_rate": 1.8108755129958963e-05, + "loss": 1.9569, + "step": 7460 + }, + { + "epoch": 0.64, + "grad_norm": 1.5078566074371338, + "learning_rate": 1.8091655266757867e-05, + "loss": 1.8296, + "step": 7464 + }, + { + "epoch": 0.64, + "grad_norm": 1.7886958122253418, + "learning_rate": 1.8074555403556774e-05, + "loss": 2.0134, + "step": 7468 + }, + { + "epoch": 0.64, + "grad_norm": 1.4695848226547241, + "learning_rate": 1.8057455540355678e-05, + "loss": 1.9271, + "step": 7472 + }, + { + "epoch": 0.64, + "grad_norm": 1.4936541318893433, + "learning_rate": 1.8040355677154582e-05, + "loss": 1.9308, + "step": 7476 + }, + { + "epoch": 0.64, + "grad_norm": 1.5521645545959473, + "learning_rate": 1.802325581395349e-05, + "loss": 1.8857, + "step": 7480 + }, + { + "epoch": 0.64, + "grad_norm": 1.5674141645431519, + "learning_rate": 1.8006155950752397e-05, + "loss": 1.9064, + "step": 7484 + }, + { + "epoch": 0.64, + "grad_norm": 1.6228454113006592, + "learning_rate": 1.79890560875513e-05, + "loss": 1.8739, + "step": 7488 + }, + { + "epoch": 0.64, + "grad_norm": 1.4367411136627197, + "learning_rate": 1.7971956224350205e-05, + "loss": 1.9576, + "step": 7492 + }, + { + "epoch": 0.64, + "grad_norm": 1.5149922370910645, + "learning_rate": 1.7954856361149112e-05, + "loss": 1.9571, + "step": 7496 + }, + { + "epoch": 0.64, + "grad_norm": 1.6180402040481567, + "learning_rate": 1.7937756497948016e-05, + "loss": 1.8322, + "step": 7500 + }, + { + "epoch": 0.64, + "grad_norm": 1.410872459411621, + "learning_rate": 1.7920656634746923e-05, + "loss": 1.8944, + "step": 7504 + }, + { + "epoch": 0.64, + "grad_norm": 1.4997954368591309, + "learning_rate": 1.790355677154583e-05, + "loss": 2.014, + "step": 7508 + }, + { + "epoch": 0.64, + "grad_norm": 1.4796793460845947, + "learning_rate": 1.7886456908344735e-05, + "loss": 1.908, + "step": 7512 + }, + { + "epoch": 0.64, + "grad_norm": 1.6083449125289917, + "learning_rate": 1.786935704514364e-05, + "loss": 1.9159, + "step": 7516 + }, + { + "epoch": 0.64, + "grad_norm": 1.6810011863708496, + "learning_rate": 1.7852257181942546e-05, + "loss": 1.9426, + "step": 7520 + }, + { + "epoch": 0.64, + "grad_norm": 1.5123205184936523, + "learning_rate": 1.7835157318741453e-05, + "loss": 1.8627, + "step": 7524 + }, + { + "epoch": 0.64, + "grad_norm": 1.4853988885879517, + "learning_rate": 1.7818057455540357e-05, + "loss": 1.9814, + "step": 7528 + }, + { + "epoch": 0.64, + "grad_norm": 1.720802903175354, + "learning_rate": 1.780095759233926e-05, + "loss": 1.9168, + "step": 7532 + }, + { + "epoch": 0.64, + "grad_norm": 1.6009372472763062, + "learning_rate": 1.778385772913817e-05, + "loss": 1.9071, + "step": 7536 + }, + { + "epoch": 0.64, + "grad_norm": 2.1640713214874268, + "learning_rate": 1.7766757865937072e-05, + "loss": 1.9403, + "step": 7540 + }, + { + "epoch": 0.65, + "grad_norm": 1.9134138822555542, + "learning_rate": 1.7749658002735976e-05, + "loss": 2.013, + "step": 7544 + }, + { + "epoch": 0.65, + "grad_norm": 1.6833550930023193, + "learning_rate": 1.7732558139534887e-05, + "loss": 2.0307, + "step": 7548 + }, + { + "epoch": 0.65, + "grad_norm": 1.643018364906311, + "learning_rate": 1.771545827633379e-05, + "loss": 1.8817, + "step": 7552 + }, + { + "epoch": 0.65, + "grad_norm": 1.5238628387451172, + "learning_rate": 1.7698358413132695e-05, + "loss": 2.0012, + "step": 7556 + }, + { + "epoch": 0.65, + "grad_norm": 1.612910270690918, + "learning_rate": 1.7681258549931602e-05, + "loss": 1.9054, + "step": 7560 + }, + { + "epoch": 0.65, + "grad_norm": 1.4849001169204712, + "learning_rate": 1.7664158686730506e-05, + "loss": 1.8482, + "step": 7564 + }, + { + "epoch": 0.65, + "grad_norm": 1.7270807027816772, + "learning_rate": 1.7647058823529414e-05, + "loss": 2.0145, + "step": 7568 + }, + { + "epoch": 0.65, + "grad_norm": 1.676148533821106, + "learning_rate": 1.7629958960328318e-05, + "loss": 2.0066, + "step": 7572 + }, + { + "epoch": 0.65, + "grad_norm": 1.3805028200149536, + "learning_rate": 1.7612859097127225e-05, + "loss": 1.813, + "step": 7576 + }, + { + "epoch": 0.65, + "grad_norm": 1.528660774230957, + "learning_rate": 1.759575923392613e-05, + "loss": 1.9291, + "step": 7580 + }, + { + "epoch": 0.65, + "grad_norm": 1.623838186264038, + "learning_rate": 1.7578659370725033e-05, + "loss": 1.9315, + "step": 7584 + }, + { + "epoch": 0.65, + "grad_norm": 1.4332454204559326, + "learning_rate": 1.7561559507523944e-05, + "loss": 1.9888, + "step": 7588 + }, + { + "epoch": 0.65, + "grad_norm": 1.5631705522537231, + "learning_rate": 1.7544459644322848e-05, + "loss": 2.1324, + "step": 7592 + }, + { + "epoch": 0.65, + "grad_norm": 2.10516357421875, + "learning_rate": 1.752735978112175e-05, + "loss": 1.9798, + "step": 7596 + }, + { + "epoch": 0.65, + "grad_norm": 1.6901365518569946, + "learning_rate": 1.751025991792066e-05, + "loss": 1.9679, + "step": 7600 + }, + { + "epoch": 0.65, + "grad_norm": 1.9404395818710327, + "learning_rate": 1.7493160054719563e-05, + "loss": 2.0198, + "step": 7604 + }, + { + "epoch": 0.65, + "grad_norm": 1.5954433679580688, + "learning_rate": 1.7476060191518467e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 0.65, + "grad_norm": 1.4495494365692139, + "learning_rate": 1.7458960328317374e-05, + "loss": 2.0086, + "step": 7612 + }, + { + "epoch": 0.65, + "grad_norm": 1.513266682624817, + "learning_rate": 1.744186046511628e-05, + "loss": 1.9934, + "step": 7616 + }, + { + "epoch": 0.65, + "grad_norm": 1.6928620338439941, + "learning_rate": 1.7424760601915185e-05, + "loss": 1.9103, + "step": 7620 + }, + { + "epoch": 0.65, + "grad_norm": 1.7382714748382568, + "learning_rate": 1.740766073871409e-05, + "loss": 2.015, + "step": 7624 + }, + { + "epoch": 0.65, + "grad_norm": 1.4485249519348145, + "learning_rate": 1.7390560875512997e-05, + "loss": 2.076, + "step": 7628 + }, + { + "epoch": 0.65, + "grad_norm": 1.5897313356399536, + "learning_rate": 1.7373461012311904e-05, + "loss": 1.8503, + "step": 7632 + }, + { + "epoch": 0.65, + "grad_norm": 1.6106144189834595, + "learning_rate": 1.7356361149110808e-05, + "loss": 1.7611, + "step": 7636 + }, + { + "epoch": 0.65, + "grad_norm": 1.6251461505889893, + "learning_rate": 1.7339261285909715e-05, + "loss": 1.8426, + "step": 7640 + }, + { + "epoch": 0.65, + "grad_norm": 1.6952441930770874, + "learning_rate": 1.732216142270862e-05, + "loss": 1.9492, + "step": 7644 + }, + { + "epoch": 0.65, + "grad_norm": 1.6333529949188232, + "learning_rate": 1.7305061559507523e-05, + "loss": 1.9824, + "step": 7648 + }, + { + "epoch": 0.65, + "grad_norm": 1.5729711055755615, + "learning_rate": 1.728796169630643e-05, + "loss": 1.9888, + "step": 7652 + }, + { + "epoch": 0.65, + "grad_norm": 1.5947141647338867, + "learning_rate": 1.7270861833105338e-05, + "loss": 1.8368, + "step": 7656 + }, + { + "epoch": 0.65, + "grad_norm": 1.5854895114898682, + "learning_rate": 1.7253761969904242e-05, + "loss": 1.9155, + "step": 7660 + }, + { + "epoch": 0.66, + "grad_norm": 1.6690934896469116, + "learning_rate": 1.7236662106703146e-05, + "loss": 1.9139, + "step": 7664 + }, + { + "epoch": 0.66, + "grad_norm": 1.5118485689163208, + "learning_rate": 1.7219562243502053e-05, + "loss": 1.8872, + "step": 7668 + }, + { + "epoch": 0.66, + "grad_norm": 1.6435084342956543, + "learning_rate": 1.7202462380300957e-05, + "loss": 2.1278, + "step": 7672 + }, + { + "epoch": 0.66, + "grad_norm": 1.6225100755691528, + "learning_rate": 1.7185362517099864e-05, + "loss": 1.9057, + "step": 7676 + }, + { + "epoch": 0.66, + "grad_norm": 1.603485345840454, + "learning_rate": 1.716826265389877e-05, + "loss": 1.7631, + "step": 7680 + }, + { + "epoch": 0.66, + "grad_norm": 1.588488221168518, + "learning_rate": 1.7151162790697676e-05, + "loss": 2.0738, + "step": 7684 + }, + { + "epoch": 0.66, + "grad_norm": 1.5990886688232422, + "learning_rate": 1.713406292749658e-05, + "loss": 1.9249, + "step": 7688 + }, + { + "epoch": 0.66, + "grad_norm": 1.5695997476577759, + "learning_rate": 1.7116963064295484e-05, + "loss": 1.8807, + "step": 7692 + }, + { + "epoch": 0.66, + "grad_norm": 1.8457928895950317, + "learning_rate": 1.7099863201094394e-05, + "loss": 2.0667, + "step": 7696 + }, + { + "epoch": 0.66, + "grad_norm": 1.7506129741668701, + "learning_rate": 1.70827633378933e-05, + "loss": 1.9462, + "step": 7700 + }, + { + "epoch": 0.66, + "grad_norm": 1.5219964981079102, + "learning_rate": 1.7065663474692202e-05, + "loss": 1.8061, + "step": 7704 + }, + { + "epoch": 0.66, + "grad_norm": 1.5406177043914795, + "learning_rate": 1.704856361149111e-05, + "loss": 1.9219, + "step": 7708 + }, + { + "epoch": 0.66, + "grad_norm": 1.5064541101455688, + "learning_rate": 1.7031463748290014e-05, + "loss": 1.9149, + "step": 7712 + }, + { + "epoch": 0.66, + "grad_norm": 1.7326499223709106, + "learning_rate": 1.701436388508892e-05, + "loss": 1.8593, + "step": 7716 + }, + { + "epoch": 0.66, + "grad_norm": 1.6578881740570068, + "learning_rate": 1.6997264021887825e-05, + "loss": 2.0094, + "step": 7720 + }, + { + "epoch": 0.66, + "grad_norm": 1.724426031112671, + "learning_rate": 1.6980164158686732e-05, + "loss": 2.0086, + "step": 7724 + }, + { + "epoch": 0.66, + "grad_norm": 1.7272109985351562, + "learning_rate": 1.6963064295485636e-05, + "loss": 2.0135, + "step": 7728 + }, + { + "epoch": 0.66, + "grad_norm": 1.8231357336044312, + "learning_rate": 1.694596443228454e-05, + "loss": 1.9537, + "step": 7732 + }, + { + "epoch": 0.66, + "grad_norm": 1.6103249788284302, + "learning_rate": 1.6928864569083448e-05, + "loss": 2.0054, + "step": 7736 + }, + { + "epoch": 0.66, + "grad_norm": 1.507234811782837, + "learning_rate": 1.6911764705882355e-05, + "loss": 2.0489, + "step": 7740 + }, + { + "epoch": 0.66, + "grad_norm": 1.49418306350708, + "learning_rate": 1.689466484268126e-05, + "loss": 1.8426, + "step": 7744 + }, + { + "epoch": 0.66, + "grad_norm": 1.828412652015686, + "learning_rate": 1.6877564979480166e-05, + "loss": 2.0645, + "step": 7748 + }, + { + "epoch": 0.66, + "grad_norm": 1.5563526153564453, + "learning_rate": 1.686046511627907e-05, + "loss": 1.9228, + "step": 7752 + }, + { + "epoch": 0.66, + "grad_norm": 1.680085301399231, + "learning_rate": 1.6843365253077974e-05, + "loss": 2.0503, + "step": 7756 + }, + { + "epoch": 0.66, + "grad_norm": 1.8583743572235107, + "learning_rate": 1.682626538987688e-05, + "loss": 1.9134, + "step": 7760 + }, + { + "epoch": 0.66, + "grad_norm": 1.5803444385528564, + "learning_rate": 1.680916552667579e-05, + "loss": 1.8787, + "step": 7764 + }, + { + "epoch": 0.66, + "grad_norm": 1.4421805143356323, + "learning_rate": 1.6792065663474693e-05, + "loss": 1.7833, + "step": 7768 + }, + { + "epoch": 0.66, + "grad_norm": 1.6548757553100586, + "learning_rate": 1.6774965800273597e-05, + "loss": 1.9397, + "step": 7772 + }, + { + "epoch": 0.66, + "grad_norm": 1.6442538499832153, + "learning_rate": 1.6757865937072504e-05, + "loss": 2.1046, + "step": 7776 + }, + { + "epoch": 0.67, + "grad_norm": 1.643537163734436, + "learning_rate": 1.674076607387141e-05, + "loss": 1.9088, + "step": 7780 + }, + { + "epoch": 0.67, + "grad_norm": 1.5146218538284302, + "learning_rate": 1.6723666210670315e-05, + "loss": 1.9828, + "step": 7784 + }, + { + "epoch": 0.67, + "grad_norm": 1.689788579940796, + "learning_rate": 1.6706566347469223e-05, + "loss": 1.9587, + "step": 7788 + }, + { + "epoch": 0.67, + "grad_norm": 1.498406171798706, + "learning_rate": 1.6689466484268127e-05, + "loss": 1.8098, + "step": 7792 + }, + { + "epoch": 0.67, + "grad_norm": 1.6559585332870483, + "learning_rate": 1.667236662106703e-05, + "loss": 2.0373, + "step": 7796 + }, + { + "epoch": 0.67, + "grad_norm": 1.675614595413208, + "learning_rate": 1.6655266757865938e-05, + "loss": 1.9298, + "step": 7800 + }, + { + "epoch": 0.67, + "grad_norm": 1.6751221418380737, + "learning_rate": 1.6638166894664845e-05, + "loss": 1.9871, + "step": 7804 + }, + { + "epoch": 0.67, + "grad_norm": 1.4255478382110596, + "learning_rate": 1.662106703146375e-05, + "loss": 1.821, + "step": 7808 + }, + { + "epoch": 0.67, + "grad_norm": 1.5127853155136108, + "learning_rate": 1.6603967168262653e-05, + "loss": 2.0976, + "step": 7812 + }, + { + "epoch": 0.67, + "grad_norm": 1.5522574186325073, + "learning_rate": 1.658686730506156e-05, + "loss": 2.027, + "step": 7816 + }, + { + "epoch": 0.67, + "grad_norm": 1.492023229598999, + "learning_rate": 1.6569767441860464e-05, + "loss": 1.9444, + "step": 7820 + }, + { + "epoch": 0.67, + "grad_norm": 1.5379337072372437, + "learning_rate": 1.6552667578659372e-05, + "loss": 1.9637, + "step": 7824 + }, + { + "epoch": 0.67, + "grad_norm": 1.615531086921692, + "learning_rate": 1.653556771545828e-05, + "loss": 1.9079, + "step": 7828 + }, + { + "epoch": 0.67, + "grad_norm": 1.5299206972122192, + "learning_rate": 1.6518467852257183e-05, + "loss": 1.8934, + "step": 7832 + }, + { + "epoch": 0.67, + "grad_norm": 1.5533653497695923, + "learning_rate": 1.6501367989056087e-05, + "loss": 1.8824, + "step": 7836 + }, + { + "epoch": 0.67, + "grad_norm": 1.5589085817337036, + "learning_rate": 1.6484268125854994e-05, + "loss": 1.8289, + "step": 7840 + }, + { + "epoch": 0.67, + "grad_norm": 1.5232523679733276, + "learning_rate": 1.6467168262653902e-05, + "loss": 1.9622, + "step": 7844 + }, + { + "epoch": 0.67, + "grad_norm": 1.4827030897140503, + "learning_rate": 1.6450068399452806e-05, + "loss": 2.0214, + "step": 7848 + }, + { + "epoch": 0.67, + "grad_norm": 1.5622025728225708, + "learning_rate": 1.643296853625171e-05, + "loss": 1.9528, + "step": 7852 + }, + { + "epoch": 0.67, + "grad_norm": 1.578583836555481, + "learning_rate": 1.6415868673050617e-05, + "loss": 1.9292, + "step": 7856 + }, + { + "epoch": 0.67, + "grad_norm": 1.5761486291885376, + "learning_rate": 1.639876880984952e-05, + "loss": 1.8721, + "step": 7860 + }, + { + "epoch": 0.67, + "grad_norm": 1.434203028678894, + "learning_rate": 1.6381668946648425e-05, + "loss": 1.882, + "step": 7864 + }, + { + "epoch": 0.67, + "grad_norm": 1.7951371669769287, + "learning_rate": 1.6364569083447336e-05, + "loss": 1.8724, + "step": 7868 + }, + { + "epoch": 0.67, + "grad_norm": 1.5747816562652588, + "learning_rate": 1.634746922024624e-05, + "loss": 1.9838, + "step": 7872 + }, + { + "epoch": 0.67, + "grad_norm": 1.6416630744934082, + "learning_rate": 1.6330369357045144e-05, + "loss": 2.0448, + "step": 7876 + }, + { + "epoch": 0.67, + "grad_norm": 1.5873196125030518, + "learning_rate": 1.631326949384405e-05, + "loss": 1.8606, + "step": 7880 + }, + { + "epoch": 0.67, + "grad_norm": 1.708465576171875, + "learning_rate": 1.6296169630642955e-05, + "loss": 2.0646, + "step": 7884 + }, + { + "epoch": 0.67, + "grad_norm": 1.764533281326294, + "learning_rate": 1.6279069767441862e-05, + "loss": 1.9685, + "step": 7888 + }, + { + "epoch": 0.67, + "grad_norm": 1.6329145431518555, + "learning_rate": 1.6261969904240766e-05, + "loss": 1.9665, + "step": 7892 + }, + { + "epoch": 0.68, + "grad_norm": 1.5862176418304443, + "learning_rate": 1.6244870041039673e-05, + "loss": 1.9459, + "step": 7896 + }, + { + "epoch": 0.68, + "grad_norm": 1.66335129737854, + "learning_rate": 1.6227770177838577e-05, + "loss": 1.9046, + "step": 7900 + }, + { + "epoch": 0.68, + "grad_norm": 1.854282259941101, + "learning_rate": 1.621067031463748e-05, + "loss": 1.8697, + "step": 7904 + }, + { + "epoch": 0.68, + "grad_norm": 1.5997343063354492, + "learning_rate": 1.6193570451436392e-05, + "loss": 1.8564, + "step": 7908 + }, + { + "epoch": 0.68, + "grad_norm": 1.5500624179840088, + "learning_rate": 1.6176470588235296e-05, + "loss": 1.9197, + "step": 7912 + }, + { + "epoch": 0.68, + "grad_norm": 1.7503950595855713, + "learning_rate": 1.61593707250342e-05, + "loss": 1.8817, + "step": 7916 + }, + { + "epoch": 0.68, + "grad_norm": 1.5188136100769043, + "learning_rate": 1.6142270861833107e-05, + "loss": 1.9714, + "step": 7920 + }, + { + "epoch": 0.68, + "grad_norm": 1.632093906402588, + "learning_rate": 1.612517099863201e-05, + "loss": 1.9189, + "step": 7924 + }, + { + "epoch": 0.68, + "grad_norm": 2.079958915710449, + "learning_rate": 1.6108071135430915e-05, + "loss": 1.9399, + "step": 7928 + }, + { + "epoch": 0.68, + "grad_norm": 1.5415632724761963, + "learning_rate": 1.6090971272229823e-05, + "loss": 1.977, + "step": 7932 + }, + { + "epoch": 0.68, + "grad_norm": 1.6359429359436035, + "learning_rate": 1.607387140902873e-05, + "loss": 1.9225, + "step": 7936 + }, + { + "epoch": 0.68, + "grad_norm": 1.6793197393417358, + "learning_rate": 1.6056771545827634e-05, + "loss": 1.891, + "step": 7940 + }, + { + "epoch": 0.68, + "grad_norm": 1.5686537027359009, + "learning_rate": 1.6039671682626538e-05, + "loss": 1.8904, + "step": 7944 + }, + { + "epoch": 0.68, + "grad_norm": 1.8018839359283447, + "learning_rate": 1.6022571819425445e-05, + "loss": 1.8233, + "step": 7948 + }, + { + "epoch": 0.68, + "grad_norm": 1.547400951385498, + "learning_rate": 1.6005471956224353e-05, + "loss": 1.9372, + "step": 7952 + }, + { + "epoch": 0.68, + "grad_norm": 1.5246180295944214, + "learning_rate": 1.5988372093023257e-05, + "loss": 1.9839, + "step": 7956 + }, + { + "epoch": 0.68, + "grad_norm": 1.4722802639007568, + "learning_rate": 1.5971272229822164e-05, + "loss": 1.7585, + "step": 7960 + }, + { + "epoch": 0.68, + "grad_norm": 1.5736894607543945, + "learning_rate": 1.5954172366621068e-05, + "loss": 1.9647, + "step": 7964 + }, + { + "epoch": 0.68, + "grad_norm": 1.6675376892089844, + "learning_rate": 1.5937072503419972e-05, + "loss": 1.9783, + "step": 7968 + }, + { + "epoch": 0.68, + "grad_norm": 1.5381276607513428, + "learning_rate": 1.591997264021888e-05, + "loss": 1.8318, + "step": 7972 + }, + { + "epoch": 0.68, + "grad_norm": 1.672789454460144, + "learning_rate": 1.5902872777017786e-05, + "loss": 1.9432, + "step": 7976 + }, + { + "epoch": 0.68, + "grad_norm": 1.5092620849609375, + "learning_rate": 1.588577291381669e-05, + "loss": 1.8645, + "step": 7980 + }, + { + "epoch": 0.68, + "grad_norm": 1.7272820472717285, + "learning_rate": 1.5868673050615594e-05, + "loss": 2.0409, + "step": 7984 + }, + { + "epoch": 0.68, + "grad_norm": 1.5391128063201904, + "learning_rate": 1.5851573187414502e-05, + "loss": 1.7384, + "step": 7988 + }, + { + "epoch": 0.68, + "grad_norm": 1.4302643537521362, + "learning_rate": 1.5834473324213406e-05, + "loss": 1.891, + "step": 7992 + }, + { + "epoch": 0.68, + "grad_norm": 1.5676170587539673, + "learning_rate": 1.5817373461012313e-05, + "loss": 1.7741, + "step": 7996 + }, + { + "epoch": 0.68, + "grad_norm": 1.7254034280776978, + "learning_rate": 1.580027359781122e-05, + "loss": 1.9008, + "step": 8000 + }, + { + "epoch": 0.68, + "grad_norm": 1.5930966138839722, + "learning_rate": 1.5783173734610124e-05, + "loss": 1.9161, + "step": 8004 + }, + { + "epoch": 0.68, + "grad_norm": 2.011584758758545, + "learning_rate": 1.5766073871409028e-05, + "loss": 1.9344, + "step": 8008 + }, + { + "epoch": 0.69, + "grad_norm": 1.5066322088241577, + "learning_rate": 1.5748974008207936e-05, + "loss": 1.8778, + "step": 8012 + }, + { + "epoch": 0.69, + "grad_norm": 1.775755763053894, + "learning_rate": 1.5731874145006843e-05, + "loss": 1.8985, + "step": 8016 + }, + { + "epoch": 0.69, + "grad_norm": 1.5539259910583496, + "learning_rate": 1.5714774281805747e-05, + "loss": 1.8531, + "step": 8020 + }, + { + "epoch": 0.69, + "grad_norm": 1.7370346784591675, + "learning_rate": 1.569767441860465e-05, + "loss": 1.9098, + "step": 8024 + }, + { + "epoch": 0.69, + "grad_norm": 1.4785212278366089, + "learning_rate": 1.5680574555403558e-05, + "loss": 2.0605, + "step": 8028 + }, + { + "epoch": 0.69, + "grad_norm": 1.6083650588989258, + "learning_rate": 1.5663474692202462e-05, + "loss": 2.0839, + "step": 8032 + }, + { + "epoch": 0.69, + "grad_norm": 1.6726301908493042, + "learning_rate": 1.564637482900137e-05, + "loss": 1.8263, + "step": 8036 + }, + { + "epoch": 0.69, + "grad_norm": 4.202913761138916, + "learning_rate": 1.5629274965800273e-05, + "loss": 1.8298, + "step": 8040 + }, + { + "epoch": 0.69, + "grad_norm": 1.6206969022750854, + "learning_rate": 1.561217510259918e-05, + "loss": 1.9383, + "step": 8044 + }, + { + "epoch": 0.69, + "grad_norm": 1.6992504596710205, + "learning_rate": 1.5595075239398085e-05, + "loss": 1.9817, + "step": 8048 + }, + { + "epoch": 0.69, + "grad_norm": 1.685895323753357, + "learning_rate": 1.557797537619699e-05, + "loss": 2.0378, + "step": 8052 + }, + { + "epoch": 0.69, + "grad_norm": 1.6659513711929321, + "learning_rate": 1.5560875512995896e-05, + "loss": 1.923, + "step": 8056 + }, + { + "epoch": 0.69, + "grad_norm": 1.6059319972991943, + "learning_rate": 1.5543775649794803e-05, + "loss": 1.9304, + "step": 8060 + }, + { + "epoch": 0.69, + "grad_norm": 1.725585699081421, + "learning_rate": 1.5526675786593707e-05, + "loss": 2.0079, + "step": 8064 + }, + { + "epoch": 0.69, + "grad_norm": 1.7117788791656494, + "learning_rate": 1.5509575923392615e-05, + "loss": 2.1715, + "step": 8068 + }, + { + "epoch": 0.69, + "grad_norm": 1.5837129354476929, + "learning_rate": 1.549247606019152e-05, + "loss": 1.9936, + "step": 8072 + }, + { + "epoch": 0.69, + "grad_norm": 1.9159271717071533, + "learning_rate": 1.5475376196990423e-05, + "loss": 1.9387, + "step": 8076 + }, + { + "epoch": 0.69, + "grad_norm": 1.7202599048614502, + "learning_rate": 1.545827633378933e-05, + "loss": 1.9975, + "step": 8080 + }, + { + "epoch": 0.69, + "grad_norm": 1.5703368186950684, + "learning_rate": 1.5441176470588237e-05, + "loss": 1.8077, + "step": 8084 + }, + { + "epoch": 0.69, + "grad_norm": 1.520732045173645, + "learning_rate": 1.542407660738714e-05, + "loss": 2.0593, + "step": 8088 + }, + { + "epoch": 0.69, + "grad_norm": 1.8129254579544067, + "learning_rate": 1.5406976744186045e-05, + "loss": 1.8897, + "step": 8092 + }, + { + "epoch": 0.69, + "grad_norm": 1.4720978736877441, + "learning_rate": 1.5389876880984953e-05, + "loss": 1.9366, + "step": 8096 + }, + { + "epoch": 0.69, + "grad_norm": 1.5979335308074951, + "learning_rate": 1.537277701778386e-05, + "loss": 1.843, + "step": 8100 + }, + { + "epoch": 0.69, + "grad_norm": 1.4189525842666626, + "learning_rate": 1.5355677154582764e-05, + "loss": 1.7472, + "step": 8104 + }, + { + "epoch": 0.69, + "grad_norm": 1.724085807800293, + "learning_rate": 1.533857729138167e-05, + "loss": 2.108, + "step": 8108 + }, + { + "epoch": 0.69, + "grad_norm": 1.5960993766784668, + "learning_rate": 1.5321477428180575e-05, + "loss": 1.8707, + "step": 8112 + }, + { + "epoch": 0.69, + "grad_norm": 1.624754786491394, + "learning_rate": 1.530437756497948e-05, + "loss": 1.9318, + "step": 8116 + }, + { + "epoch": 0.69, + "grad_norm": 1.758505940437317, + "learning_rate": 1.5287277701778386e-05, + "loss": 1.8996, + "step": 8120 + }, + { + "epoch": 0.69, + "grad_norm": 1.633908987045288, + "learning_rate": 1.5270177838577294e-05, + "loss": 1.8455, + "step": 8124 + }, + { + "epoch": 0.69, + "grad_norm": 1.578237533569336, + "learning_rate": 1.5253077975376198e-05, + "loss": 2.071, + "step": 8128 + }, + { + "epoch": 0.7, + "grad_norm": 1.5181132555007935, + "learning_rate": 1.5235978112175103e-05, + "loss": 1.766, + "step": 8132 + }, + { + "epoch": 0.7, + "grad_norm": 1.6433082818984985, + "learning_rate": 1.5218878248974009e-05, + "loss": 2.0446, + "step": 8136 + }, + { + "epoch": 0.7, + "grad_norm": 1.6200603246688843, + "learning_rate": 1.5201778385772913e-05, + "loss": 1.7629, + "step": 8140 + }, + { + "epoch": 0.7, + "grad_norm": 1.7067992687225342, + "learning_rate": 1.5184678522571822e-05, + "loss": 1.8624, + "step": 8144 + }, + { + "epoch": 0.7, + "grad_norm": 1.515853762626648, + "learning_rate": 1.5167578659370726e-05, + "loss": 1.8187, + "step": 8148 + }, + { + "epoch": 0.7, + "grad_norm": 1.7134236097335815, + "learning_rate": 1.5150478796169632e-05, + "loss": 1.9738, + "step": 8152 + }, + { + "epoch": 0.7, + "grad_norm": 1.5560367107391357, + "learning_rate": 1.5133378932968537e-05, + "loss": 1.9252, + "step": 8156 + }, + { + "epoch": 0.7, + "grad_norm": 1.476830005645752, + "learning_rate": 1.5116279069767441e-05, + "loss": 1.7564, + "step": 8160 + }, + { + "epoch": 0.7, + "grad_norm": 1.6756997108459473, + "learning_rate": 1.509917920656635e-05, + "loss": 1.8457, + "step": 8164 + }, + { + "epoch": 0.7, + "grad_norm": 1.6191574335098267, + "learning_rate": 1.5082079343365254e-05, + "loss": 1.9078, + "step": 8168 + }, + { + "epoch": 0.7, + "grad_norm": 1.7781707048416138, + "learning_rate": 1.506497948016416e-05, + "loss": 1.9178, + "step": 8172 + }, + { + "epoch": 0.7, + "grad_norm": 1.361154556274414, + "learning_rate": 1.5047879616963066e-05, + "loss": 1.9068, + "step": 8176 + }, + { + "epoch": 0.7, + "grad_norm": 1.6916831731796265, + "learning_rate": 1.503077975376197e-05, + "loss": 1.9089, + "step": 8180 + }, + { + "epoch": 0.7, + "grad_norm": 1.682424783706665, + "learning_rate": 1.5013679890560875e-05, + "loss": 2.0016, + "step": 8184 + }, + { + "epoch": 0.7, + "grad_norm": 1.4187464714050293, + "learning_rate": 1.4996580027359783e-05, + "loss": 1.811, + "step": 8188 + }, + { + "epoch": 0.7, + "grad_norm": 1.5043885707855225, + "learning_rate": 1.4979480164158688e-05, + "loss": 1.8448, + "step": 8192 + }, + { + "epoch": 0.7, + "grad_norm": 1.6151576042175293, + "learning_rate": 1.4962380300957594e-05, + "loss": 1.9776, + "step": 8196 + }, + { + "epoch": 0.7, + "grad_norm": 1.5769206285476685, + "learning_rate": 1.4945280437756498e-05, + "loss": 1.937, + "step": 8200 + }, + { + "epoch": 0.7, + "grad_norm": 1.5044043064117432, + "learning_rate": 1.4928180574555403e-05, + "loss": 1.852, + "step": 8204 + }, + { + "epoch": 0.7, + "grad_norm": 1.6411508321762085, + "learning_rate": 1.491108071135431e-05, + "loss": 1.9599, + "step": 8208 + }, + { + "epoch": 0.7, + "grad_norm": 1.6469234228134155, + "learning_rate": 1.4893980848153216e-05, + "loss": 1.9749, + "step": 8212 + }, + { + "epoch": 0.7, + "grad_norm": 1.64458167552948, + "learning_rate": 1.4876880984952122e-05, + "loss": 1.728, + "step": 8216 + }, + { + "epoch": 0.7, + "grad_norm": 1.5000782012939453, + "learning_rate": 1.4859781121751026e-05, + "loss": 1.8083, + "step": 8220 + }, + { + "epoch": 0.7, + "grad_norm": 1.521461844444275, + "learning_rate": 1.4842681258549932e-05, + "loss": 1.9935, + "step": 8224 + }, + { + "epoch": 0.7, + "grad_norm": 1.5517728328704834, + "learning_rate": 1.4825581395348839e-05, + "loss": 2.0673, + "step": 8228 + }, + { + "epoch": 0.7, + "grad_norm": 1.566142201423645, + "learning_rate": 1.4808481532147745e-05, + "loss": 1.8942, + "step": 8232 + }, + { + "epoch": 0.7, + "grad_norm": 1.5397378206253052, + "learning_rate": 1.4791381668946649e-05, + "loss": 1.963, + "step": 8236 + }, + { + "epoch": 0.7, + "grad_norm": 1.3787508010864258, + "learning_rate": 1.4774281805745554e-05, + "loss": 1.9094, + "step": 8240 + }, + { + "epoch": 0.7, + "grad_norm": 1.4250317811965942, + "learning_rate": 1.475718194254446e-05, + "loss": 1.8181, + "step": 8244 + }, + { + "epoch": 0.71, + "grad_norm": 1.868123173713684, + "learning_rate": 1.4740082079343364e-05, + "loss": 1.975, + "step": 8248 + }, + { + "epoch": 0.71, + "grad_norm": 1.6380820274353027, + "learning_rate": 1.4722982216142273e-05, + "loss": 1.8019, + "step": 8252 + }, + { + "epoch": 0.71, + "grad_norm": 1.7641537189483643, + "learning_rate": 1.4705882352941177e-05, + "loss": 1.8762, + "step": 8256 + }, + { + "epoch": 0.71, + "grad_norm": 1.704601526260376, + "learning_rate": 1.4688782489740083e-05, + "loss": 1.938, + "step": 8260 + }, + { + "epoch": 0.71, + "grad_norm": 1.5377604961395264, + "learning_rate": 1.4671682626538988e-05, + "loss": 1.837, + "step": 8264 + }, + { + "epoch": 0.71, + "grad_norm": 1.5826222896575928, + "learning_rate": 1.4654582763337892e-05, + "loss": 1.9301, + "step": 8268 + }, + { + "epoch": 0.71, + "grad_norm": 1.691020131111145, + "learning_rate": 1.4637482900136801e-05, + "loss": 1.9719, + "step": 8272 + }, + { + "epoch": 0.71, + "grad_norm": 1.5105916261672974, + "learning_rate": 1.4620383036935705e-05, + "loss": 1.9823, + "step": 8276 + }, + { + "epoch": 0.71, + "grad_norm": 1.5825799703598022, + "learning_rate": 1.460328317373461e-05, + "loss": 1.8319, + "step": 8280 + }, + { + "epoch": 0.71, + "grad_norm": 1.7014378309249878, + "learning_rate": 1.4586183310533516e-05, + "loss": 1.85, + "step": 8284 + }, + { + "epoch": 0.71, + "grad_norm": 1.5736719369888306, + "learning_rate": 1.456908344733242e-05, + "loss": 1.7308, + "step": 8288 + }, + { + "epoch": 0.71, + "grad_norm": 1.4500672817230225, + "learning_rate": 1.455198358413133e-05, + "loss": 1.8872, + "step": 8292 + }, + { + "epoch": 0.71, + "grad_norm": 1.5889211893081665, + "learning_rate": 1.4534883720930233e-05, + "loss": 1.9407, + "step": 8296 + }, + { + "epoch": 0.71, + "grad_norm": 1.7505712509155273, + "learning_rate": 1.4517783857729139e-05, + "loss": 1.9576, + "step": 8300 + }, + { + "epoch": 0.71, + "grad_norm": 1.6860861778259277, + "learning_rate": 1.4500683994528045e-05, + "loss": 2.0681, + "step": 8304 + }, + { + "epoch": 0.71, + "grad_norm": 1.6268845796585083, + "learning_rate": 1.4483584131326949e-05, + "loss": 2.0465, + "step": 8308 + }, + { + "epoch": 0.71, + "grad_norm": 1.5005725622177124, + "learning_rate": 1.4466484268125854e-05, + "loss": 1.8097, + "step": 8312 + }, + { + "epoch": 0.71, + "grad_norm": 1.6980187892913818, + "learning_rate": 1.4449384404924762e-05, + "loss": 1.8862, + "step": 8316 + }, + { + "epoch": 0.71, + "grad_norm": 1.7522437572479248, + "learning_rate": 1.4432284541723667e-05, + "loss": 1.8532, + "step": 8320 + }, + { + "epoch": 0.71, + "grad_norm": 1.590436577796936, + "learning_rate": 1.4415184678522573e-05, + "loss": 1.8276, + "step": 8324 + }, + { + "epoch": 0.71, + "grad_norm": 1.487074613571167, + "learning_rate": 1.4398084815321477e-05, + "loss": 1.9804, + "step": 8328 + }, + { + "epoch": 0.71, + "grad_norm": 1.9625794887542725, + "learning_rate": 1.4380984952120383e-05, + "loss": 1.9065, + "step": 8332 + }, + { + "epoch": 0.71, + "grad_norm": 1.6127842664718628, + "learning_rate": 1.436388508891929e-05, + "loss": 1.8961, + "step": 8336 + }, + { + "epoch": 0.71, + "grad_norm": 1.7317554950714111, + "learning_rate": 1.4346785225718196e-05, + "loss": 1.9341, + "step": 8340 + }, + { + "epoch": 0.71, + "grad_norm": 1.5197057723999023, + "learning_rate": 1.4329685362517101e-05, + "loss": 1.9849, + "step": 8344 + }, + { + "epoch": 0.71, + "grad_norm": 1.5553900003433228, + "learning_rate": 1.4312585499316005e-05, + "loss": 1.7892, + "step": 8348 + }, + { + "epoch": 0.71, + "grad_norm": 1.52109956741333, + "learning_rate": 1.429548563611491e-05, + "loss": 1.8928, + "step": 8352 + }, + { + "epoch": 0.71, + "grad_norm": 1.6558705568313599, + "learning_rate": 1.4278385772913818e-05, + "loss": 1.799, + "step": 8356 + }, + { + "epoch": 0.71, + "grad_norm": 1.612926959991455, + "learning_rate": 1.4261285909712724e-05, + "loss": 2.0211, + "step": 8360 + }, + { + "epoch": 0.72, + "grad_norm": 1.5769792795181274, + "learning_rate": 1.424418604651163e-05, + "loss": 1.9197, + "step": 8364 + }, + { + "epoch": 0.72, + "grad_norm": 1.557539939880371, + "learning_rate": 1.4227086183310533e-05, + "loss": 1.973, + "step": 8368 + }, + { + "epoch": 0.72, + "grad_norm": 1.6096733808517456, + "learning_rate": 1.4209986320109439e-05, + "loss": 1.9608, + "step": 8372 + }, + { + "epoch": 0.72, + "grad_norm": 1.759639024734497, + "learning_rate": 1.4192886456908345e-05, + "loss": 2.0181, + "step": 8376 + }, + { + "epoch": 0.72, + "grad_norm": 1.7144453525543213, + "learning_rate": 1.4175786593707252e-05, + "loss": 1.9385, + "step": 8380 + }, + { + "epoch": 0.72, + "grad_norm": 1.6760494709014893, + "learning_rate": 1.4158686730506158e-05, + "loss": 2.1005, + "step": 8384 + }, + { + "epoch": 0.72, + "grad_norm": 1.7116944789886475, + "learning_rate": 1.4141586867305062e-05, + "loss": 1.9031, + "step": 8388 + }, + { + "epoch": 0.72, + "grad_norm": 1.5736141204833984, + "learning_rate": 1.4124487004103967e-05, + "loss": 2.0717, + "step": 8392 + }, + { + "epoch": 0.72, + "grad_norm": 1.5009993314743042, + "learning_rate": 1.4107387140902873e-05, + "loss": 1.9062, + "step": 8396 + }, + { + "epoch": 0.72, + "grad_norm": 1.734379529953003, + "learning_rate": 1.409028727770178e-05, + "loss": 1.9607, + "step": 8400 + }, + { + "epoch": 0.72, + "grad_norm": 1.7106738090515137, + "learning_rate": 1.4073187414500686e-05, + "loss": 1.9597, + "step": 8404 + }, + { + "epoch": 0.72, + "grad_norm": 1.503051996231079, + "learning_rate": 1.405608755129959e-05, + "loss": 1.8636, + "step": 8408 + }, + { + "epoch": 0.72, + "grad_norm": 1.544773817062378, + "learning_rate": 1.4038987688098496e-05, + "loss": 1.903, + "step": 8412 + }, + { + "epoch": 0.72, + "grad_norm": 1.6054282188415527, + "learning_rate": 1.4021887824897401e-05, + "loss": 2.01, + "step": 8416 + }, + { + "epoch": 0.72, + "grad_norm": 1.5877810716629028, + "learning_rate": 1.4004787961696309e-05, + "loss": 1.8562, + "step": 8420 + }, + { + "epoch": 0.72, + "grad_norm": 1.669568419456482, + "learning_rate": 1.3987688098495214e-05, + "loss": 1.8486, + "step": 8424 + }, + { + "epoch": 0.72, + "grad_norm": 1.780972957611084, + "learning_rate": 1.3970588235294118e-05, + "loss": 1.9747, + "step": 8428 + }, + { + "epoch": 0.72, + "grad_norm": 1.4789724349975586, + "learning_rate": 1.3953488372093024e-05, + "loss": 1.9874, + "step": 8432 + }, + { + "epoch": 0.72, + "grad_norm": 1.5372753143310547, + "learning_rate": 1.393638850889193e-05, + "loss": 1.6921, + "step": 8436 + }, + { + "epoch": 0.72, + "grad_norm": 1.5397859811782837, + "learning_rate": 1.3919288645690833e-05, + "loss": 1.9584, + "step": 8440 + }, + { + "epoch": 0.72, + "grad_norm": 1.774103045463562, + "learning_rate": 1.3902188782489742e-05, + "loss": 1.9538, + "step": 8444 + }, + { + "epoch": 0.72, + "grad_norm": 1.6088874340057373, + "learning_rate": 1.3885088919288646e-05, + "loss": 1.9065, + "step": 8448 + }, + { + "epoch": 0.72, + "grad_norm": 1.5402274131774902, + "learning_rate": 1.3867989056087552e-05, + "loss": 1.8515, + "step": 8452 + }, + { + "epoch": 0.72, + "grad_norm": 1.530392050743103, + "learning_rate": 1.3850889192886458e-05, + "loss": 1.6816, + "step": 8456 + }, + { + "epoch": 0.72, + "grad_norm": 1.4472659826278687, + "learning_rate": 1.3833789329685362e-05, + "loss": 1.8966, + "step": 8460 + }, + { + "epoch": 0.72, + "grad_norm": 1.587443470954895, + "learning_rate": 1.381668946648427e-05, + "loss": 1.7889, + "step": 8464 + }, + { + "epoch": 0.72, + "grad_norm": 1.7226911783218384, + "learning_rate": 1.3799589603283175e-05, + "loss": 1.9593, + "step": 8468 + }, + { + "epoch": 0.72, + "grad_norm": 1.587898850440979, + "learning_rate": 1.378248974008208e-05, + "loss": 1.8951, + "step": 8472 + }, + { + "epoch": 0.72, + "grad_norm": 1.9148885011672974, + "learning_rate": 1.3765389876880986e-05, + "loss": 1.9351, + "step": 8476 + }, + { + "epoch": 0.73, + "grad_norm": 1.8292063474655151, + "learning_rate": 1.374829001367989e-05, + "loss": 1.8462, + "step": 8480 + }, + { + "epoch": 0.73, + "grad_norm": 1.73773992061615, + "learning_rate": 1.3731190150478799e-05, + "loss": 1.8633, + "step": 8484 + }, + { + "epoch": 0.73, + "grad_norm": 1.4449963569641113, + "learning_rate": 1.3714090287277703e-05, + "loss": 1.924, + "step": 8488 + }, + { + "epoch": 0.73, + "grad_norm": 1.5996627807617188, + "learning_rate": 1.3696990424076609e-05, + "loss": 1.8774, + "step": 8492 + }, + { + "epoch": 0.73, + "grad_norm": 1.6882178783416748, + "learning_rate": 1.3679890560875514e-05, + "loss": 1.9221, + "step": 8496 + }, + { + "epoch": 0.73, + "grad_norm": 1.6781047582626343, + "learning_rate": 1.3662790697674418e-05, + "loss": 1.7734, + "step": 8500 + }, + { + "epoch": 0.73, + "grad_norm": 1.6286660432815552, + "learning_rate": 1.3645690834473324e-05, + "loss": 1.8426, + "step": 8504 + }, + { + "epoch": 0.73, + "grad_norm": 1.4449224472045898, + "learning_rate": 1.3628590971272231e-05, + "loss": 1.8119, + "step": 8508 + }, + { + "epoch": 0.73, + "grad_norm": 1.6157493591308594, + "learning_rate": 1.3611491108071137e-05, + "loss": 1.7127, + "step": 8512 + }, + { + "epoch": 0.73, + "grad_norm": 1.8456206321716309, + "learning_rate": 1.3594391244870042e-05, + "loss": 1.8209, + "step": 8516 + }, + { + "epoch": 0.73, + "grad_norm": 1.5280355215072632, + "learning_rate": 1.3577291381668946e-05, + "loss": 2.0005, + "step": 8520 + }, + { + "epoch": 0.73, + "grad_norm": 1.8175022602081299, + "learning_rate": 1.3560191518467852e-05, + "loss": 1.9212, + "step": 8524 + }, + { + "epoch": 0.73, + "grad_norm": 1.5971537828445435, + "learning_rate": 1.354309165526676e-05, + "loss": 1.8606, + "step": 8528 + }, + { + "epoch": 0.73, + "grad_norm": 1.7127560377120972, + "learning_rate": 1.3525991792065665e-05, + "loss": 1.8265, + "step": 8532 + }, + { + "epoch": 0.73, + "grad_norm": 1.723050832748413, + "learning_rate": 1.350889192886457e-05, + "loss": 1.9374, + "step": 8536 + }, + { + "epoch": 0.73, + "grad_norm": 1.4820556640625, + "learning_rate": 1.3491792065663475e-05, + "loss": 1.8634, + "step": 8540 + }, + { + "epoch": 0.73, + "grad_norm": 1.731012225151062, + "learning_rate": 1.347469220246238e-05, + "loss": 1.9204, + "step": 8544 + }, + { + "epoch": 0.73, + "grad_norm": 1.586348295211792, + "learning_rate": 1.3457592339261288e-05, + "loss": 1.8462, + "step": 8548 + }, + { + "epoch": 0.73, + "grad_norm": 1.7355362176895142, + "learning_rate": 1.3440492476060193e-05, + "loss": 2.0345, + "step": 8552 + }, + { + "epoch": 0.73, + "grad_norm": 1.4833712577819824, + "learning_rate": 1.3423392612859099e-05, + "loss": 1.9132, + "step": 8556 + }, + { + "epoch": 0.73, + "grad_norm": 1.999471664428711, + "learning_rate": 1.3406292749658003e-05, + "loss": 2.0444, + "step": 8560 + }, + { + "epoch": 0.73, + "grad_norm": 1.6343189477920532, + "learning_rate": 1.3389192886456909e-05, + "loss": 1.8392, + "step": 8564 + }, + { + "epoch": 0.73, + "grad_norm": 1.5069808959960938, + "learning_rate": 1.3372093023255814e-05, + "loss": 1.8399, + "step": 8568 + }, + { + "epoch": 0.73, + "grad_norm": 1.521087884902954, + "learning_rate": 1.3354993160054722e-05, + "loss": 1.9186, + "step": 8572 + }, + { + "epoch": 0.73, + "grad_norm": 1.4739211797714233, + "learning_rate": 1.3337893296853627e-05, + "loss": 1.9674, + "step": 8576 + }, + { + "epoch": 0.73, + "grad_norm": 1.5574839115142822, + "learning_rate": 1.3320793433652531e-05, + "loss": 1.8879, + "step": 8580 + }, + { + "epoch": 0.73, + "grad_norm": 1.7786009311676025, + "learning_rate": 1.3303693570451437e-05, + "loss": 1.9308, + "step": 8584 + }, + { + "epoch": 0.73, + "grad_norm": 1.627382516860962, + "learning_rate": 1.3286593707250342e-05, + "loss": 1.8588, + "step": 8588 + }, + { + "epoch": 0.73, + "grad_norm": 1.5507442951202393, + "learning_rate": 1.326949384404925e-05, + "loss": 2.0934, + "step": 8592 + }, + { + "epoch": 0.73, + "grad_norm": 1.486575722694397, + "learning_rate": 1.3252393980848154e-05, + "loss": 2.1199, + "step": 8596 + }, + { + "epoch": 0.74, + "grad_norm": 1.3391540050506592, + "learning_rate": 1.323529411764706e-05, + "loss": 1.9324, + "step": 8600 + }, + { + "epoch": 0.74, + "grad_norm": 1.69169020652771, + "learning_rate": 1.3218194254445965e-05, + "loss": 1.906, + "step": 8604 + }, + { + "epoch": 0.74, + "grad_norm": 1.7022899389266968, + "learning_rate": 1.3201094391244869e-05, + "loss": 1.9422, + "step": 8608 + }, + { + "epoch": 0.74, + "grad_norm": 1.4805599451065063, + "learning_rate": 1.3183994528043778e-05, + "loss": 1.8661, + "step": 8612 + }, + { + "epoch": 0.74, + "grad_norm": 1.5892281532287598, + "learning_rate": 1.3166894664842682e-05, + "loss": 1.8542, + "step": 8616 + }, + { + "epoch": 0.74, + "grad_norm": 1.6527740955352783, + "learning_rate": 1.3149794801641588e-05, + "loss": 1.8107, + "step": 8620 + }, + { + "epoch": 0.74, + "grad_norm": 1.5607056617736816, + "learning_rate": 1.3132694938440493e-05, + "loss": 1.8057, + "step": 8624 + }, + { + "epoch": 0.74, + "grad_norm": 1.5519297122955322, + "learning_rate": 1.3115595075239397e-05, + "loss": 1.8115, + "step": 8628 + }, + { + "epoch": 0.74, + "grad_norm": 1.5834949016571045, + "learning_rate": 1.3098495212038303e-05, + "loss": 1.8907, + "step": 8632 + }, + { + "epoch": 0.74, + "grad_norm": 1.6823513507843018, + "learning_rate": 1.308139534883721e-05, + "loss": 1.7454, + "step": 8636 + }, + { + "epoch": 0.74, + "grad_norm": 1.6757451295852661, + "learning_rate": 1.3064295485636116e-05, + "loss": 1.7607, + "step": 8640 + }, + { + "epoch": 0.74, + "grad_norm": 1.6643555164337158, + "learning_rate": 1.3047195622435022e-05, + "loss": 1.9063, + "step": 8644 + }, + { + "epoch": 0.74, + "grad_norm": 1.7445098161697388, + "learning_rate": 1.3030095759233925e-05, + "loss": 1.7924, + "step": 8648 + }, + { + "epoch": 0.74, + "grad_norm": 1.6503303050994873, + "learning_rate": 1.3012995896032831e-05, + "loss": 2.072, + "step": 8652 + }, + { + "epoch": 0.74, + "grad_norm": 1.5941767692565918, + "learning_rate": 1.2995896032831738e-05, + "loss": 2.0103, + "step": 8656 + }, + { + "epoch": 0.74, + "grad_norm": 1.6478708982467651, + "learning_rate": 1.2978796169630644e-05, + "loss": 1.7316, + "step": 8660 + }, + { + "epoch": 0.74, + "grad_norm": 1.5611132383346558, + "learning_rate": 1.296169630642955e-05, + "loss": 1.822, + "step": 8664 + }, + { + "epoch": 0.74, + "grad_norm": 1.5451310873031616, + "learning_rate": 1.2944596443228454e-05, + "loss": 2.0152, + "step": 8668 + }, + { + "epoch": 0.74, + "grad_norm": 1.7043241262435913, + "learning_rate": 1.292749658002736e-05, + "loss": 1.9034, + "step": 8672 + }, + { + "epoch": 0.74, + "grad_norm": 1.7091752290725708, + "learning_rate": 1.2910396716826267e-05, + "loss": 1.8423, + "step": 8676 + }, + { + "epoch": 0.74, + "grad_norm": 1.6127060651779175, + "learning_rate": 1.2893296853625172e-05, + "loss": 1.6937, + "step": 8680 + }, + { + "epoch": 0.74, + "grad_norm": 1.7381840944290161, + "learning_rate": 1.2876196990424078e-05, + "loss": 1.9543, + "step": 8684 + }, + { + "epoch": 0.74, + "grad_norm": 1.5597513914108276, + "learning_rate": 1.2859097127222982e-05, + "loss": 1.8262, + "step": 8688 + }, + { + "epoch": 0.74, + "grad_norm": 1.6478689908981323, + "learning_rate": 1.2841997264021888e-05, + "loss": 2.0061, + "step": 8692 + }, + { + "epoch": 0.74, + "grad_norm": 1.6866868734359741, + "learning_rate": 1.2824897400820793e-05, + "loss": 2.0661, + "step": 8696 + }, + { + "epoch": 0.74, + "grad_norm": 1.6700700521469116, + "learning_rate": 1.28077975376197e-05, + "loss": 1.8317, + "step": 8700 + }, + { + "epoch": 0.74, + "grad_norm": 1.6782305240631104, + "learning_rate": 1.2790697674418606e-05, + "loss": 1.7571, + "step": 8704 + }, + { + "epoch": 0.74, + "grad_norm": 1.5588642358779907, + "learning_rate": 1.277359781121751e-05, + "loss": 1.8465, + "step": 8708 + }, + { + "epoch": 0.74, + "grad_norm": 1.5376290082931519, + "learning_rate": 1.2756497948016416e-05, + "loss": 1.8062, + "step": 8712 + }, + { + "epoch": 0.75, + "grad_norm": 1.6274547576904297, + "learning_rate": 1.2739398084815322e-05, + "loss": 1.8891, + "step": 8716 + }, + { + "epoch": 0.75, + "grad_norm": 1.577243447303772, + "learning_rate": 1.2722298221614229e-05, + "loss": 2.0104, + "step": 8720 + }, + { + "epoch": 0.75, + "grad_norm": 1.848371148109436, + "learning_rate": 1.2705198358413134e-05, + "loss": 1.7878, + "step": 8724 + }, + { + "epoch": 0.75, + "grad_norm": 1.5511852502822876, + "learning_rate": 1.2688098495212038e-05, + "loss": 1.765, + "step": 8728 + }, + { + "epoch": 0.75, + "grad_norm": 1.580466389656067, + "learning_rate": 1.2670998632010944e-05, + "loss": 1.9052, + "step": 8732 + }, + { + "epoch": 0.75, + "grad_norm": 1.6786214113235474, + "learning_rate": 1.265389876880985e-05, + "loss": 2.0108, + "step": 8736 + }, + { + "epoch": 0.75, + "grad_norm": 1.574545979499817, + "learning_rate": 1.2636798905608757e-05, + "loss": 1.9098, + "step": 8740 + }, + { + "epoch": 0.75, + "grad_norm": 1.7882972955703735, + "learning_rate": 1.2619699042407663e-05, + "loss": 1.9742, + "step": 8744 + }, + { + "epoch": 0.75, + "grad_norm": 1.6370630264282227, + "learning_rate": 1.2602599179206567e-05, + "loss": 1.8861, + "step": 8748 + }, + { + "epoch": 0.75, + "grad_norm": 2.044381618499756, + "learning_rate": 1.2585499316005472e-05, + "loss": 1.7537, + "step": 8752 + }, + { + "epoch": 0.75, + "grad_norm": 1.6843827962875366, + "learning_rate": 1.2568399452804378e-05, + "loss": 1.9561, + "step": 8756 + }, + { + "epoch": 0.75, + "grad_norm": 1.605976939201355, + "learning_rate": 1.2551299589603282e-05, + "loss": 1.8912, + "step": 8760 + }, + { + "epoch": 0.75, + "grad_norm": 1.6944398880004883, + "learning_rate": 1.2534199726402191e-05, + "loss": 1.988, + "step": 8764 + }, + { + "epoch": 0.75, + "grad_norm": 1.5753448009490967, + "learning_rate": 1.2517099863201095e-05, + "loss": 1.9168, + "step": 8768 + }, + { + "epoch": 0.75, + "grad_norm": 1.6419624090194702, + "learning_rate": 1.25e-05, + "loss": 1.9642, + "step": 8772 + }, + { + "epoch": 0.75, + "grad_norm": 1.6650840044021606, + "learning_rate": 1.2482900136798906e-05, + "loss": 1.9391, + "step": 8776 + }, + { + "epoch": 0.75, + "grad_norm": 1.5701370239257812, + "learning_rate": 1.2465800273597812e-05, + "loss": 1.7449, + "step": 8780 + }, + { + "epoch": 0.75, + "grad_norm": 1.6118509769439697, + "learning_rate": 1.2448700410396718e-05, + "loss": 1.9024, + "step": 8784 + }, + { + "epoch": 0.75, + "grad_norm": 1.4989128112792969, + "learning_rate": 1.2431600547195622e-05, + "loss": 1.8511, + "step": 8788 + }, + { + "epoch": 0.75, + "grad_norm": 1.8597005605697632, + "learning_rate": 1.2414500683994529e-05, + "loss": 2.0558, + "step": 8792 + }, + { + "epoch": 0.75, + "grad_norm": 1.6900529861450195, + "learning_rate": 1.2397400820793434e-05, + "loss": 1.9513, + "step": 8796 + }, + { + "epoch": 0.75, + "grad_norm": 1.8187960386276245, + "learning_rate": 1.238030095759234e-05, + "loss": 2.057, + "step": 8800 + }, + { + "epoch": 0.75, + "grad_norm": 1.5813332796096802, + "learning_rate": 1.2363201094391246e-05, + "loss": 1.8454, + "step": 8804 + }, + { + "epoch": 0.75, + "grad_norm": 1.655596137046814, + "learning_rate": 1.234610123119015e-05, + "loss": 1.8531, + "step": 8808 + }, + { + "epoch": 0.75, + "grad_norm": 1.681291937828064, + "learning_rate": 1.2329001367989057e-05, + "loss": 1.8698, + "step": 8812 + }, + { + "epoch": 0.75, + "grad_norm": 1.6871446371078491, + "learning_rate": 1.2311901504787963e-05, + "loss": 1.7355, + "step": 8816 + }, + { + "epoch": 0.75, + "grad_norm": 1.62662935256958, + "learning_rate": 1.2294801641586867e-05, + "loss": 2.067, + "step": 8820 + }, + { + "epoch": 0.75, + "grad_norm": 1.6432703733444214, + "learning_rate": 1.2277701778385774e-05, + "loss": 1.8635, + "step": 8824 + }, + { + "epoch": 0.75, + "grad_norm": 1.410051941871643, + "learning_rate": 1.2260601915184678e-05, + "loss": 1.7336, + "step": 8828 + }, + { + "epoch": 0.76, + "grad_norm": 1.6481599807739258, + "learning_rate": 1.2243502051983585e-05, + "loss": 2.025, + "step": 8832 + }, + { + "epoch": 0.76, + "grad_norm": 1.7318426370620728, + "learning_rate": 1.2226402188782491e-05, + "loss": 1.9353, + "step": 8836 + }, + { + "epoch": 0.76, + "grad_norm": 1.6890826225280762, + "learning_rate": 1.2209302325581395e-05, + "loss": 1.8659, + "step": 8840 + }, + { + "epoch": 0.76, + "grad_norm": 1.5665086507797241, + "learning_rate": 1.2192202462380302e-05, + "loss": 1.9448, + "step": 8844 + }, + { + "epoch": 0.76, + "grad_norm": 1.7020827531814575, + "learning_rate": 1.2175102599179206e-05, + "loss": 2.0202, + "step": 8848 + }, + { + "epoch": 0.76, + "grad_norm": 1.6117048263549805, + "learning_rate": 1.2158002735978112e-05, + "loss": 1.9785, + "step": 8852 + }, + { + "epoch": 0.76, + "grad_norm": 1.5090599060058594, + "learning_rate": 1.214090287277702e-05, + "loss": 1.929, + "step": 8856 + }, + { + "epoch": 0.76, + "grad_norm": 1.5808215141296387, + "learning_rate": 1.2123803009575923e-05, + "loss": 1.9534, + "step": 8860 + }, + { + "epoch": 0.76, + "grad_norm": 1.5516841411590576, + "learning_rate": 1.210670314637483e-05, + "loss": 1.9717, + "step": 8864 + }, + { + "epoch": 0.76, + "grad_norm": 1.79678475856781, + "learning_rate": 1.2089603283173734e-05, + "loss": 1.8398, + "step": 8868 + }, + { + "epoch": 0.76, + "grad_norm": 1.5560812950134277, + "learning_rate": 1.207250341997264e-05, + "loss": 1.9108, + "step": 8872 + }, + { + "epoch": 0.76, + "grad_norm": 1.660593032836914, + "learning_rate": 1.2055403556771547e-05, + "loss": 1.9138, + "step": 8876 + }, + { + "epoch": 0.76, + "grad_norm": 1.6071677207946777, + "learning_rate": 1.2038303693570451e-05, + "loss": 1.7974, + "step": 8880 + }, + { + "epoch": 0.76, + "grad_norm": 1.5675511360168457, + "learning_rate": 1.2021203830369357e-05, + "loss": 1.7748, + "step": 8884 + }, + { + "epoch": 0.76, + "grad_norm": 1.5390551090240479, + "learning_rate": 1.2004103967168263e-05, + "loss": 1.8618, + "step": 8888 + }, + { + "epoch": 0.76, + "grad_norm": 1.4867161512374878, + "learning_rate": 1.1987004103967168e-05, + "loss": 1.7825, + "step": 8892 + }, + { + "epoch": 0.76, + "grad_norm": 1.6912176609039307, + "learning_rate": 1.1969904240766076e-05, + "loss": 1.9014, + "step": 8896 + }, + { + "epoch": 0.76, + "grad_norm": 1.5211007595062256, + "learning_rate": 1.195280437756498e-05, + "loss": 1.8937, + "step": 8900 + }, + { + "epoch": 0.76, + "grad_norm": 1.582693099975586, + "learning_rate": 1.1935704514363885e-05, + "loss": 1.8975, + "step": 8904 + }, + { + "epoch": 0.76, + "grad_norm": 1.5599603652954102, + "learning_rate": 1.1918604651162791e-05, + "loss": 1.8864, + "step": 8908 + }, + { + "epoch": 0.76, + "grad_norm": 1.5778361558914185, + "learning_rate": 1.1901504787961697e-05, + "loss": 2.0265, + "step": 8912 + }, + { + "epoch": 0.76, + "grad_norm": 1.5679864883422852, + "learning_rate": 1.1884404924760602e-05, + "loss": 1.8832, + "step": 8916 + }, + { + "epoch": 0.76, + "grad_norm": 1.741749882698059, + "learning_rate": 1.1867305061559508e-05, + "loss": 1.9069, + "step": 8920 + }, + { + "epoch": 0.76, + "grad_norm": 1.6839423179626465, + "learning_rate": 1.1850205198358414e-05, + "loss": 1.8291, + "step": 8924 + }, + { + "epoch": 0.76, + "grad_norm": 1.638995885848999, + "learning_rate": 1.183310533515732e-05, + "loss": 1.8998, + "step": 8928 + }, + { + "epoch": 0.76, + "grad_norm": 1.6037232875823975, + "learning_rate": 1.1816005471956225e-05, + "loss": 1.8874, + "step": 8932 + }, + { + "epoch": 0.76, + "grad_norm": 2.105273962020874, + "learning_rate": 1.179890560875513e-05, + "loss": 1.9178, + "step": 8936 + }, + { + "epoch": 0.76, + "grad_norm": 1.7534767389297485, + "learning_rate": 1.1781805745554036e-05, + "loss": 1.893, + "step": 8940 + }, + { + "epoch": 0.76, + "grad_norm": 1.6804143190383911, + "learning_rate": 1.1764705882352942e-05, + "loss": 1.9738, + "step": 8944 + }, + { + "epoch": 0.77, + "grad_norm": 1.6752735376358032, + "learning_rate": 1.1747606019151847e-05, + "loss": 1.774, + "step": 8948 + }, + { + "epoch": 0.77, + "grad_norm": 1.6710683107376099, + "learning_rate": 1.1730506155950753e-05, + "loss": 1.9189, + "step": 8952 + }, + { + "epoch": 0.77, + "grad_norm": 1.5679121017456055, + "learning_rate": 1.1713406292749659e-05, + "loss": 1.7889, + "step": 8956 + }, + { + "epoch": 0.77, + "grad_norm": 1.6607458591461182, + "learning_rate": 1.1696306429548564e-05, + "loss": 1.7928, + "step": 8960 + }, + { + "epoch": 0.77, + "grad_norm": 1.458150029182434, + "learning_rate": 1.167920656634747e-05, + "loss": 1.8047, + "step": 8964 + }, + { + "epoch": 0.77, + "grad_norm": 1.8078303337097168, + "learning_rate": 1.1662106703146374e-05, + "loss": 2.096, + "step": 8968 + }, + { + "epoch": 0.77, + "grad_norm": 1.7263890504837036, + "learning_rate": 1.1645006839945281e-05, + "loss": 2.0121, + "step": 8972 + }, + { + "epoch": 0.77, + "grad_norm": 1.5924429893493652, + "learning_rate": 1.1627906976744187e-05, + "loss": 1.9003, + "step": 8976 + }, + { + "epoch": 0.77, + "grad_norm": 1.7311723232269287, + "learning_rate": 1.1610807113543091e-05, + "loss": 1.8617, + "step": 8980 + }, + { + "epoch": 0.77, + "grad_norm": 1.6257747411727905, + "learning_rate": 1.1593707250341998e-05, + "loss": 1.7866, + "step": 8984 + }, + { + "epoch": 0.77, + "grad_norm": 1.6690551042556763, + "learning_rate": 1.1576607387140902e-05, + "loss": 1.9114, + "step": 8988 + }, + { + "epoch": 0.77, + "grad_norm": 1.6894851922988892, + "learning_rate": 1.155950752393981e-05, + "loss": 1.9979, + "step": 8992 + }, + { + "epoch": 0.77, + "grad_norm": 1.7198370695114136, + "learning_rate": 1.1542407660738715e-05, + "loss": 2.0374, + "step": 8996 + }, + { + "epoch": 0.77, + "grad_norm": 1.5919808149337769, + "learning_rate": 1.152530779753762e-05, + "loss": 1.7399, + "step": 9000 + }, + { + "epoch": 0.77, + "grad_norm": 1.6823852062225342, + "learning_rate": 1.1508207934336527e-05, + "loss": 1.8478, + "step": 9004 + }, + { + "epoch": 0.77, + "grad_norm": 1.7140239477157593, + "learning_rate": 1.149110807113543e-05, + "loss": 1.9611, + "step": 9008 + }, + { + "epoch": 0.77, + "grad_norm": 1.5392545461654663, + "learning_rate": 1.1474008207934336e-05, + "loss": 1.7857, + "step": 9012 + }, + { + "epoch": 0.77, + "grad_norm": 1.8829190731048584, + "learning_rate": 1.1456908344733244e-05, + "loss": 1.8474, + "step": 9016 + }, + { + "epoch": 0.77, + "grad_norm": 1.5163602828979492, + "learning_rate": 1.1439808481532147e-05, + "loss": 1.9006, + "step": 9020 + }, + { + "epoch": 0.77, + "grad_norm": 1.5375906229019165, + "learning_rate": 1.1422708618331055e-05, + "loss": 2.0124, + "step": 9024 + }, + { + "epoch": 0.77, + "grad_norm": 1.5933159589767456, + "learning_rate": 1.1405608755129959e-05, + "loss": 1.8251, + "step": 9028 + }, + { + "epoch": 0.77, + "grad_norm": 1.8997576236724854, + "learning_rate": 1.1388508891928864e-05, + "loss": 1.9688, + "step": 9032 + }, + { + "epoch": 0.77, + "grad_norm": 1.5310240983963013, + "learning_rate": 1.1371409028727772e-05, + "loss": 1.9101, + "step": 9036 + }, + { + "epoch": 0.77, + "grad_norm": 1.7153500318527222, + "learning_rate": 1.1354309165526676e-05, + "loss": 1.7864, + "step": 9040 + }, + { + "epoch": 0.77, + "grad_norm": 1.7716693878173828, + "learning_rate": 1.1337209302325581e-05, + "loss": 1.9119, + "step": 9044 + }, + { + "epoch": 0.77, + "grad_norm": 1.8320080041885376, + "learning_rate": 1.1320109439124487e-05, + "loss": 2.0897, + "step": 9048 + }, + { + "epoch": 0.77, + "grad_norm": 1.6159136295318604, + "learning_rate": 1.1303009575923393e-05, + "loss": 2.0392, + "step": 9052 + }, + { + "epoch": 0.77, + "grad_norm": 1.5778326988220215, + "learning_rate": 1.12859097127223e-05, + "loss": 1.9324, + "step": 9056 + }, + { + "epoch": 0.77, + "grad_norm": 1.5392941236495972, + "learning_rate": 1.1268809849521204e-05, + "loss": 1.8133, + "step": 9060 + }, + { + "epoch": 0.77, + "grad_norm": 1.6904296875, + "learning_rate": 1.125170998632011e-05, + "loss": 1.9265, + "step": 9064 + }, + { + "epoch": 0.78, + "grad_norm": 1.7861131429672241, + "learning_rate": 1.1234610123119015e-05, + "loss": 1.7405, + "step": 9068 + }, + { + "epoch": 0.78, + "grad_norm": 1.6475653648376465, + "learning_rate": 1.1217510259917921e-05, + "loss": 1.8417, + "step": 9072 + }, + { + "epoch": 0.78, + "grad_norm": 1.4819046258926392, + "learning_rate": 1.1200410396716827e-05, + "loss": 1.7927, + "step": 9076 + }, + { + "epoch": 0.78, + "grad_norm": 1.5303575992584229, + "learning_rate": 1.1183310533515732e-05, + "loss": 1.735, + "step": 9080 + }, + { + "epoch": 0.78, + "grad_norm": 1.5807466506958008, + "learning_rate": 1.1166210670314638e-05, + "loss": 1.7733, + "step": 9084 + }, + { + "epoch": 0.78, + "grad_norm": 1.6154811382293701, + "learning_rate": 1.1149110807113544e-05, + "loss": 1.985, + "step": 9088 + }, + { + "epoch": 0.78, + "grad_norm": 1.588836908340454, + "learning_rate": 1.113201094391245e-05, + "loss": 1.8861, + "step": 9092 + }, + { + "epoch": 0.78, + "grad_norm": 1.6571130752563477, + "learning_rate": 1.1114911080711355e-05, + "loss": 1.9013, + "step": 9096 + }, + { + "epoch": 0.78, + "grad_norm": 1.7480363845825195, + "learning_rate": 1.109781121751026e-05, + "loss": 2.0298, + "step": 9100 + }, + { + "epoch": 0.78, + "grad_norm": 1.53179931640625, + "learning_rate": 1.1080711354309166e-05, + "loss": 1.8883, + "step": 9104 + }, + { + "epoch": 0.78, + "grad_norm": 1.7249871492385864, + "learning_rate": 1.1063611491108072e-05, + "loss": 1.888, + "step": 9108 + }, + { + "epoch": 0.78, + "grad_norm": 1.771964192390442, + "learning_rate": 1.1046511627906977e-05, + "loss": 1.7898, + "step": 9112 + }, + { + "epoch": 0.78, + "grad_norm": 1.8706704378128052, + "learning_rate": 1.1029411764705883e-05, + "loss": 1.8936, + "step": 9116 + }, + { + "epoch": 0.78, + "grad_norm": 1.6252784729003906, + "learning_rate": 1.1012311901504789e-05, + "loss": 1.909, + "step": 9120 + }, + { + "epoch": 0.78, + "grad_norm": 1.5766993761062622, + "learning_rate": 1.0995212038303694e-05, + "loss": 1.7883, + "step": 9124 + }, + { + "epoch": 0.78, + "grad_norm": 1.5931488275527954, + "learning_rate": 1.09781121751026e-05, + "loss": 1.8009, + "step": 9128 + }, + { + "epoch": 0.78, + "grad_norm": 1.652861475944519, + "learning_rate": 1.0961012311901506e-05, + "loss": 1.9441, + "step": 9132 + }, + { + "epoch": 0.78, + "grad_norm": 1.604109764099121, + "learning_rate": 1.0943912448700411e-05, + "loss": 1.944, + "step": 9136 + }, + { + "epoch": 0.78, + "grad_norm": 1.648501992225647, + "learning_rate": 1.0926812585499315e-05, + "loss": 1.7164, + "step": 9140 + }, + { + "epoch": 0.78, + "grad_norm": 1.697082757949829, + "learning_rate": 1.0909712722298223e-05, + "loss": 1.9967, + "step": 9144 + }, + { + "epoch": 0.78, + "grad_norm": 1.5939291715621948, + "learning_rate": 1.0892612859097127e-05, + "loss": 1.8479, + "step": 9148 + }, + { + "epoch": 0.78, + "grad_norm": 1.4296135902404785, + "learning_rate": 1.0875512995896034e-05, + "loss": 1.7912, + "step": 9152 + }, + { + "epoch": 0.78, + "grad_norm": 1.5867764949798584, + "learning_rate": 1.085841313269494e-05, + "loss": 2.0543, + "step": 9156 + }, + { + "epoch": 0.78, + "grad_norm": 1.877381443977356, + "learning_rate": 1.0841313269493844e-05, + "loss": 1.9605, + "step": 9160 + }, + { + "epoch": 0.78, + "grad_norm": 1.5704667568206787, + "learning_rate": 1.0824213406292751e-05, + "loss": 1.8716, + "step": 9164 + }, + { + "epoch": 0.78, + "grad_norm": 1.7016427516937256, + "learning_rate": 1.0807113543091655e-05, + "loss": 1.8002, + "step": 9168 + }, + { + "epoch": 0.78, + "grad_norm": 1.5268316268920898, + "learning_rate": 1.079001367989056e-05, + "loss": 1.8328, + "step": 9172 + }, + { + "epoch": 0.78, + "grad_norm": 1.773216962814331, + "learning_rate": 1.0772913816689468e-05, + "loss": 2.0852, + "step": 9176 + }, + { + "epoch": 0.78, + "grad_norm": 1.8278284072875977, + "learning_rate": 1.0755813953488372e-05, + "loss": 1.7678, + "step": 9180 + }, + { + "epoch": 0.79, + "grad_norm": 1.9138497114181519, + "learning_rate": 1.0738714090287279e-05, + "loss": 1.8216, + "step": 9184 + }, + { + "epoch": 0.79, + "grad_norm": 1.6149177551269531, + "learning_rate": 1.0721614227086183e-05, + "loss": 2.0147, + "step": 9188 + }, + { + "epoch": 0.79, + "grad_norm": 1.5738332271575928, + "learning_rate": 1.0704514363885089e-05, + "loss": 1.9592, + "step": 9192 + }, + { + "epoch": 0.79, + "grad_norm": 1.617006778717041, + "learning_rate": 1.0687414500683996e-05, + "loss": 1.9349, + "step": 9196 + }, + { + "epoch": 0.79, + "grad_norm": 1.6684281826019287, + "learning_rate": 1.06703146374829e-05, + "loss": 1.9102, + "step": 9200 + }, + { + "epoch": 0.79, + "grad_norm": 1.5881692171096802, + "learning_rate": 1.0653214774281806e-05, + "loss": 1.7868, + "step": 9204 + }, + { + "epoch": 0.79, + "grad_norm": 1.6087418794631958, + "learning_rate": 1.0636114911080711e-05, + "loss": 1.8317, + "step": 9208 + }, + { + "epoch": 0.79, + "grad_norm": 1.5474470853805542, + "learning_rate": 1.0619015047879617e-05, + "loss": 1.8203, + "step": 9212 + }, + { + "epoch": 0.79, + "grad_norm": 1.6170072555541992, + "learning_rate": 1.0601915184678524e-05, + "loss": 1.9812, + "step": 9216 + }, + { + "epoch": 0.79, + "grad_norm": 1.6150282621383667, + "learning_rate": 1.0584815321477428e-05, + "loss": 1.831, + "step": 9220 + }, + { + "epoch": 0.79, + "grad_norm": 1.888481855392456, + "learning_rate": 1.0567715458276334e-05, + "loss": 1.9179, + "step": 9224 + }, + { + "epoch": 0.79, + "grad_norm": 1.499454379081726, + "learning_rate": 1.055061559507524e-05, + "loss": 1.9117, + "step": 9228 + }, + { + "epoch": 0.79, + "grad_norm": 1.8907907009124756, + "learning_rate": 1.0533515731874145e-05, + "loss": 1.9377, + "step": 9232 + }, + { + "epoch": 0.79, + "grad_norm": 1.6337422132492065, + "learning_rate": 1.0516415868673051e-05, + "loss": 1.946, + "step": 9236 + }, + { + "epoch": 0.79, + "grad_norm": 1.692440390586853, + "learning_rate": 1.0499316005471957e-05, + "loss": 2.0171, + "step": 9240 + }, + { + "epoch": 0.79, + "grad_norm": 1.8192511796951294, + "learning_rate": 1.0482216142270862e-05, + "loss": 2.0665, + "step": 9244 + }, + { + "epoch": 0.79, + "grad_norm": 1.7105309963226318, + "learning_rate": 1.0465116279069768e-05, + "loss": 1.8517, + "step": 9248 + }, + { + "epoch": 0.79, + "grad_norm": 1.570191740989685, + "learning_rate": 1.0448016415868673e-05, + "loss": 1.879, + "step": 9252 + }, + { + "epoch": 0.79, + "grad_norm": 1.5352414846420288, + "learning_rate": 1.0430916552667579e-05, + "loss": 1.9844, + "step": 9256 + }, + { + "epoch": 0.79, + "grad_norm": 1.6077239513397217, + "learning_rate": 1.0413816689466485e-05, + "loss": 1.8066, + "step": 9260 + }, + { + "epoch": 0.79, + "grad_norm": 1.6216827630996704, + "learning_rate": 1.039671682626539e-05, + "loss": 1.7569, + "step": 9264 + }, + { + "epoch": 0.79, + "grad_norm": 1.5977295637130737, + "learning_rate": 1.0379616963064296e-05, + "loss": 1.841, + "step": 9268 + }, + { + "epoch": 0.79, + "grad_norm": 1.7415016889572144, + "learning_rate": 1.0362517099863202e-05, + "loss": 2.0001, + "step": 9272 + }, + { + "epoch": 0.79, + "grad_norm": 1.6122945547103882, + "learning_rate": 1.0345417236662107e-05, + "loss": 1.7887, + "step": 9276 + }, + { + "epoch": 0.79, + "grad_norm": 1.7112343311309814, + "learning_rate": 1.0328317373461013e-05, + "loss": 1.9709, + "step": 9280 + }, + { + "epoch": 0.79, + "grad_norm": 1.583895206451416, + "learning_rate": 1.0311217510259919e-05, + "loss": 2.0006, + "step": 9284 + }, + { + "epoch": 0.79, + "grad_norm": 1.5309836864471436, + "learning_rate": 1.0294117647058824e-05, + "loss": 1.8978, + "step": 9288 + }, + { + "epoch": 0.79, + "grad_norm": 1.6894787549972534, + "learning_rate": 1.027701778385773e-05, + "loss": 1.856, + "step": 9292 + }, + { + "epoch": 0.79, + "grad_norm": 1.5949068069458008, + "learning_rate": 1.0259917920656636e-05, + "loss": 1.84, + "step": 9296 + }, + { + "epoch": 0.8, + "grad_norm": 1.5734741687774658, + "learning_rate": 1.024281805745554e-05, + "loss": 1.9617, + "step": 9300 + }, + { + "epoch": 0.8, + "grad_norm": 1.7839090824127197, + "learning_rate": 1.0225718194254447e-05, + "loss": 1.7957, + "step": 9304 + }, + { + "epoch": 0.8, + "grad_norm": 1.9882410764694214, + "learning_rate": 1.0208618331053353e-05, + "loss": 2.0439, + "step": 9308 + }, + { + "epoch": 0.8, + "grad_norm": 1.6469732522964478, + "learning_rate": 1.0191518467852258e-05, + "loss": 1.82, + "step": 9312 + }, + { + "epoch": 0.8, + "grad_norm": 1.597993016242981, + "learning_rate": 1.0174418604651164e-05, + "loss": 2.0186, + "step": 9316 + }, + { + "epoch": 0.8, + "grad_norm": 1.5456162691116333, + "learning_rate": 1.0157318741450068e-05, + "loss": 1.9057, + "step": 9320 + }, + { + "epoch": 0.8, + "grad_norm": 1.589836597442627, + "learning_rate": 1.0140218878248975e-05, + "loss": 1.8665, + "step": 9324 + }, + { + "epoch": 0.8, + "grad_norm": 1.5683108568191528, + "learning_rate": 1.0123119015047879e-05, + "loss": 1.7532, + "step": 9328 + }, + { + "epoch": 0.8, + "grad_norm": 1.5445048809051514, + "learning_rate": 1.0106019151846785e-05, + "loss": 1.9741, + "step": 9332 + }, + { + "epoch": 0.8, + "grad_norm": 1.6579010486602783, + "learning_rate": 1.0088919288645692e-05, + "loss": 1.8469, + "step": 9336 + }, + { + "epoch": 0.8, + "grad_norm": 1.813675045967102, + "learning_rate": 1.0071819425444596e-05, + "loss": 1.9811, + "step": 9340 + }, + { + "epoch": 0.8, + "grad_norm": 1.69435453414917, + "learning_rate": 1.0054719562243503e-05, + "loss": 1.6031, + "step": 9344 + }, + { + "epoch": 0.8, + "grad_norm": 1.9789328575134277, + "learning_rate": 1.0037619699042407e-05, + "loss": 2.0603, + "step": 9348 + }, + { + "epoch": 0.8, + "grad_norm": 1.7569694519042969, + "learning_rate": 1.0020519835841313e-05, + "loss": 1.8548, + "step": 9352 + }, + { + "epoch": 0.8, + "grad_norm": 1.7058336734771729, + "learning_rate": 1.000341997264022e-05, + "loss": 1.8937, + "step": 9356 + }, + { + "epoch": 0.8, + "grad_norm": 1.7999751567840576, + "learning_rate": 9.986320109439124e-06, + "loss": 2.0138, + "step": 9360 + }, + { + "epoch": 0.8, + "grad_norm": 1.6166166067123413, + "learning_rate": 9.96922024623803e-06, + "loss": 1.8514, + "step": 9364 + }, + { + "epoch": 0.8, + "grad_norm": 1.5393308401107788, + "learning_rate": 9.952120383036936e-06, + "loss": 1.8538, + "step": 9368 + }, + { + "epoch": 0.8, + "grad_norm": 1.7060655355453491, + "learning_rate": 9.935020519835841e-06, + "loss": 1.8204, + "step": 9372 + }, + { + "epoch": 0.8, + "grad_norm": 1.5741777420043945, + "learning_rate": 9.917920656634749e-06, + "loss": 1.8723, + "step": 9376 + }, + { + "epoch": 0.8, + "grad_norm": 1.7236156463623047, + "learning_rate": 9.900820793433653e-06, + "loss": 2.0826, + "step": 9380 + }, + { + "epoch": 0.8, + "grad_norm": 1.6071019172668457, + "learning_rate": 9.883720930232558e-06, + "loss": 1.8699, + "step": 9384 + }, + { + "epoch": 0.8, + "grad_norm": 1.5609468221664429, + "learning_rate": 9.866621067031464e-06, + "loss": 1.7908, + "step": 9388 + }, + { + "epoch": 0.8, + "grad_norm": 1.5828368663787842, + "learning_rate": 9.84952120383037e-06, + "loss": 1.9035, + "step": 9392 + }, + { + "epoch": 0.8, + "grad_norm": 1.6804313659667969, + "learning_rate": 9.832421340629275e-06, + "loss": 1.873, + "step": 9396 + }, + { + "epoch": 0.8, + "grad_norm": 1.7444369792938232, + "learning_rate": 9.81532147742818e-06, + "loss": 1.9714, + "step": 9400 + }, + { + "epoch": 0.8, + "grad_norm": 1.478379487991333, + "learning_rate": 9.798221614227086e-06, + "loss": 1.825, + "step": 9404 + }, + { + "epoch": 0.8, + "grad_norm": 1.8659464120864868, + "learning_rate": 9.781121751025992e-06, + "loss": 2.0924, + "step": 9408 + }, + { + "epoch": 0.8, + "grad_norm": 1.6995452642440796, + "learning_rate": 9.764021887824898e-06, + "loss": 1.7909, + "step": 9412 + }, + { + "epoch": 0.81, + "grad_norm": 1.7288949489593506, + "learning_rate": 9.746922024623803e-06, + "loss": 1.9683, + "step": 9416 + }, + { + "epoch": 0.81, + "grad_norm": 1.728184461593628, + "learning_rate": 9.729822161422709e-06, + "loss": 1.867, + "step": 9420 + }, + { + "epoch": 0.81, + "grad_norm": 1.6516950130462646, + "learning_rate": 9.712722298221615e-06, + "loss": 1.847, + "step": 9424 + }, + { + "epoch": 0.81, + "grad_norm": 1.6084563732147217, + "learning_rate": 9.69562243502052e-06, + "loss": 1.8446, + "step": 9428 + }, + { + "epoch": 0.81, + "grad_norm": 1.6345360279083252, + "learning_rate": 9.678522571819426e-06, + "loss": 1.8856, + "step": 9432 + }, + { + "epoch": 0.81, + "grad_norm": 1.6293641328811646, + "learning_rate": 9.661422708618332e-06, + "loss": 1.8721, + "step": 9436 + }, + { + "epoch": 0.81, + "grad_norm": 1.7901579141616821, + "learning_rate": 9.644322845417237e-06, + "loss": 1.8333, + "step": 9440 + }, + { + "epoch": 0.81, + "grad_norm": 1.8444390296936035, + "learning_rate": 9.627222982216143e-06, + "loss": 1.8012, + "step": 9444 + }, + { + "epoch": 0.81, + "grad_norm": 1.9100682735443115, + "learning_rate": 9.610123119015049e-06, + "loss": 1.8989, + "step": 9448 + }, + { + "epoch": 0.81, + "grad_norm": 1.5829551219940186, + "learning_rate": 9.593023255813954e-06, + "loss": 1.692, + "step": 9452 + }, + { + "epoch": 0.81, + "grad_norm": 1.7920702695846558, + "learning_rate": 9.57592339261286e-06, + "loss": 1.9008, + "step": 9456 + }, + { + "epoch": 0.81, + "grad_norm": 1.8126853704452515, + "learning_rate": 9.558823529411764e-06, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 0.81, + "grad_norm": 1.6333680152893066, + "learning_rate": 9.541723666210671e-06, + "loss": 1.8486, + "step": 9464 + }, + { + "epoch": 0.81, + "grad_norm": 1.6809686422348022, + "learning_rate": 9.524623803009577e-06, + "loss": 1.8182, + "step": 9468 + }, + { + "epoch": 0.81, + "grad_norm": 1.5512781143188477, + "learning_rate": 9.507523939808483e-06, + "loss": 1.782, + "step": 9472 + }, + { + "epoch": 0.81, + "grad_norm": 1.4863585233688354, + "learning_rate": 9.490424076607388e-06, + "loss": 1.9, + "step": 9476 + }, + { + "epoch": 0.81, + "grad_norm": 1.8519527912139893, + "learning_rate": 9.473324213406292e-06, + "loss": 1.8792, + "step": 9480 + }, + { + "epoch": 0.81, + "grad_norm": 1.68942391872406, + "learning_rate": 9.4562243502052e-06, + "loss": 1.81, + "step": 9484 + }, + { + "epoch": 0.81, + "grad_norm": 1.6430591344833374, + "learning_rate": 9.439124487004105e-06, + "loss": 1.7057, + "step": 9488 + }, + { + "epoch": 0.81, + "grad_norm": 1.7153677940368652, + "learning_rate": 9.422024623803009e-06, + "loss": 1.7675, + "step": 9492 + }, + { + "epoch": 0.81, + "grad_norm": 2.2399628162384033, + "learning_rate": 9.404924760601916e-06, + "loss": 1.9985, + "step": 9496 + }, + { + "epoch": 0.81, + "grad_norm": 1.6189675331115723, + "learning_rate": 9.38782489740082e-06, + "loss": 1.8922, + "step": 9500 + }, + { + "epoch": 0.81, + "grad_norm": 1.6087392568588257, + "learning_rate": 9.370725034199728e-06, + "loss": 1.8978, + "step": 9504 + }, + { + "epoch": 0.81, + "grad_norm": 1.7213088274002075, + "learning_rate": 9.353625170998632e-06, + "loss": 1.9047, + "step": 9508 + }, + { + "epoch": 0.81, + "grad_norm": 1.6473731994628906, + "learning_rate": 9.336525307797537e-06, + "loss": 1.922, + "step": 9512 + }, + { + "epoch": 0.81, + "grad_norm": 1.6384577751159668, + "learning_rate": 9.319425444596445e-06, + "loss": 1.9051, + "step": 9516 + }, + { + "epoch": 0.81, + "grad_norm": 1.6715625524520874, + "learning_rate": 9.302325581395349e-06, + "loss": 1.7335, + "step": 9520 + }, + { + "epoch": 0.81, + "grad_norm": 1.7468408346176147, + "learning_rate": 9.285225718194254e-06, + "loss": 1.8317, + "step": 9524 + }, + { + "epoch": 0.81, + "grad_norm": 1.6021628379821777, + "learning_rate": 9.26812585499316e-06, + "loss": 1.7717, + "step": 9528 + }, + { + "epoch": 0.81, + "grad_norm": 1.4728972911834717, + "learning_rate": 9.251025991792066e-06, + "loss": 1.8383, + "step": 9532 + }, + { + "epoch": 0.82, + "grad_norm": 1.641257643699646, + "learning_rate": 9.233926128590973e-06, + "loss": 1.9203, + "step": 9536 + }, + { + "epoch": 0.82, + "grad_norm": 1.5959044694900513, + "learning_rate": 9.216826265389877e-06, + "loss": 1.8186, + "step": 9540 + }, + { + "epoch": 0.82, + "grad_norm": 1.8256573677062988, + "learning_rate": 9.199726402188783e-06, + "loss": 1.9279, + "step": 9544 + }, + { + "epoch": 0.82, + "grad_norm": 1.6256192922592163, + "learning_rate": 9.182626538987688e-06, + "loss": 1.8773, + "step": 9548 + }, + { + "epoch": 0.82, + "grad_norm": 1.7054941654205322, + "learning_rate": 9.165526675786594e-06, + "loss": 1.8826, + "step": 9552 + }, + { + "epoch": 0.82, + "grad_norm": 1.7333673238754272, + "learning_rate": 9.1484268125855e-06, + "loss": 1.8855, + "step": 9556 + }, + { + "epoch": 0.82, + "grad_norm": 1.9579733610153198, + "learning_rate": 9.131326949384405e-06, + "loss": 1.7222, + "step": 9560 + }, + { + "epoch": 0.82, + "grad_norm": 1.6406586170196533, + "learning_rate": 9.11422708618331e-06, + "loss": 1.8063, + "step": 9564 + }, + { + "epoch": 0.82, + "grad_norm": 1.8166844844818115, + "learning_rate": 9.097127222982216e-06, + "loss": 1.8802, + "step": 9568 + }, + { + "epoch": 0.82, + "grad_norm": 1.815399169921875, + "learning_rate": 9.080027359781122e-06, + "loss": 1.9574, + "step": 9572 + }, + { + "epoch": 0.82, + "grad_norm": 1.7717112302780151, + "learning_rate": 9.062927496580028e-06, + "loss": 1.8687, + "step": 9576 + }, + { + "epoch": 0.82, + "grad_norm": 1.8283933401107788, + "learning_rate": 9.045827633378933e-06, + "loss": 1.7794, + "step": 9580 + }, + { + "epoch": 0.82, + "grad_norm": 1.5563534498214722, + "learning_rate": 9.028727770177839e-06, + "loss": 1.9697, + "step": 9584 + }, + { + "epoch": 0.82, + "grad_norm": 1.7888858318328857, + "learning_rate": 9.011627906976745e-06, + "loss": 1.8549, + "step": 9588 + }, + { + "epoch": 0.82, + "grad_norm": 1.6961920261383057, + "learning_rate": 8.99452804377565e-06, + "loss": 1.8044, + "step": 9592 + }, + { + "epoch": 0.82, + "grad_norm": 1.6775288581848145, + "learning_rate": 8.977428180574556e-06, + "loss": 1.7884, + "step": 9596 + }, + { + "epoch": 0.82, + "grad_norm": 1.5376509428024292, + "learning_rate": 8.960328317373462e-06, + "loss": 1.9472, + "step": 9600 + }, + { + "epoch": 0.82, + "grad_norm": 1.5770347118377686, + "learning_rate": 8.943228454172367e-06, + "loss": 1.7587, + "step": 9604 + }, + { + "epoch": 0.82, + "grad_norm": 1.5645045042037964, + "learning_rate": 8.926128590971273e-06, + "loss": 1.9036, + "step": 9608 + }, + { + "epoch": 0.82, + "grad_norm": 1.6209347248077393, + "learning_rate": 8.909028727770179e-06, + "loss": 1.9395, + "step": 9612 + }, + { + "epoch": 0.82, + "grad_norm": 1.5157575607299805, + "learning_rate": 8.891928864569084e-06, + "loss": 1.9096, + "step": 9616 + }, + { + "epoch": 0.82, + "grad_norm": 1.6890482902526855, + "learning_rate": 8.874829001367988e-06, + "loss": 1.8541, + "step": 9620 + }, + { + "epoch": 0.82, + "grad_norm": 1.6792246103286743, + "learning_rate": 8.857729138166896e-06, + "loss": 1.9734, + "step": 9624 + }, + { + "epoch": 0.82, + "grad_norm": 1.5776066780090332, + "learning_rate": 8.840629274965801e-06, + "loss": 1.9166, + "step": 9628 + }, + { + "epoch": 0.82, + "grad_norm": 1.5487701892852783, + "learning_rate": 8.823529411764707e-06, + "loss": 1.7777, + "step": 9632 + }, + { + "epoch": 0.82, + "grad_norm": 1.6531468629837036, + "learning_rate": 8.806429548563612e-06, + "loss": 1.6488, + "step": 9636 + }, + { + "epoch": 0.82, + "grad_norm": 1.6902531385421753, + "learning_rate": 8.789329685362516e-06, + "loss": 1.8287, + "step": 9640 + }, + { + "epoch": 0.82, + "grad_norm": 1.6954971551895142, + "learning_rate": 8.772229822161424e-06, + "loss": 1.896, + "step": 9644 + }, + { + "epoch": 0.82, + "grad_norm": 1.5442079305648804, + "learning_rate": 8.75512995896033e-06, + "loss": 1.8335, + "step": 9648 + }, + { + "epoch": 0.83, + "grad_norm": 1.739211916923523, + "learning_rate": 8.738030095759233e-06, + "loss": 1.8265, + "step": 9652 + }, + { + "epoch": 0.83, + "grad_norm": 1.6437937021255493, + "learning_rate": 8.72093023255814e-06, + "loss": 1.7487, + "step": 9656 + }, + { + "epoch": 0.83, + "grad_norm": 1.568172812461853, + "learning_rate": 8.703830369357045e-06, + "loss": 1.8618, + "step": 9660 + }, + { + "epoch": 0.83, + "grad_norm": 1.84135103225708, + "learning_rate": 8.686730506155952e-06, + "loss": 1.8962, + "step": 9664 + }, + { + "epoch": 0.83, + "grad_norm": 1.599940538406372, + "learning_rate": 8.669630642954858e-06, + "loss": 1.7502, + "step": 9668 + }, + { + "epoch": 0.83, + "grad_norm": 1.9855149984359741, + "learning_rate": 8.652530779753762e-06, + "loss": 2.0228, + "step": 9672 + }, + { + "epoch": 0.83, + "grad_norm": 1.641782283782959, + "learning_rate": 8.635430916552669e-06, + "loss": 2.0184, + "step": 9676 + }, + { + "epoch": 0.83, + "grad_norm": 1.9988453388214111, + "learning_rate": 8.618331053351573e-06, + "loss": 2.0865, + "step": 9680 + }, + { + "epoch": 0.83, + "grad_norm": 1.7934141159057617, + "learning_rate": 8.601231190150479e-06, + "loss": 1.8933, + "step": 9684 + }, + { + "epoch": 0.83, + "grad_norm": 1.5691959857940674, + "learning_rate": 8.584131326949384e-06, + "loss": 1.8344, + "step": 9688 + }, + { + "epoch": 0.83, + "grad_norm": 1.7705130577087402, + "learning_rate": 8.56703146374829e-06, + "loss": 1.646, + "step": 9692 + }, + { + "epoch": 0.83, + "grad_norm": 1.6504237651824951, + "learning_rate": 8.549931600547197e-06, + "loss": 1.7268, + "step": 9696 + }, + { + "epoch": 0.83, + "grad_norm": 1.4613388776779175, + "learning_rate": 8.532831737346101e-06, + "loss": 1.7871, + "step": 9700 + }, + { + "epoch": 0.83, + "grad_norm": 1.7658060789108276, + "learning_rate": 8.515731874145007e-06, + "loss": 1.8519, + "step": 9704 + }, + { + "epoch": 0.83, + "grad_norm": 1.84966242313385, + "learning_rate": 8.498632010943912e-06, + "loss": 1.8277, + "step": 9708 + }, + { + "epoch": 0.83, + "grad_norm": 1.674241542816162, + "learning_rate": 8.481532147742818e-06, + "loss": 1.7632, + "step": 9712 + }, + { + "epoch": 0.83, + "grad_norm": 1.7811789512634277, + "learning_rate": 8.464432284541724e-06, + "loss": 1.899, + "step": 9716 + }, + { + "epoch": 0.83, + "grad_norm": 1.7027822732925415, + "learning_rate": 8.44733242134063e-06, + "loss": 1.9129, + "step": 9720 + }, + { + "epoch": 0.83, + "grad_norm": 1.7912241220474243, + "learning_rate": 8.430232558139535e-06, + "loss": 1.8546, + "step": 9724 + }, + { + "epoch": 0.83, + "grad_norm": 1.5889939069747925, + "learning_rate": 8.41313269493844e-06, + "loss": 2.0574, + "step": 9728 + }, + { + "epoch": 0.83, + "grad_norm": 1.55410635471344, + "learning_rate": 8.396032831737346e-06, + "loss": 1.8547, + "step": 9732 + }, + { + "epoch": 0.83, + "grad_norm": 1.6358247995376587, + "learning_rate": 8.378932968536252e-06, + "loss": 1.7924, + "step": 9736 + }, + { + "epoch": 0.83, + "grad_norm": 1.698111891746521, + "learning_rate": 8.361833105335158e-06, + "loss": 1.8366, + "step": 9740 + }, + { + "epoch": 0.83, + "grad_norm": 1.5512586832046509, + "learning_rate": 8.344733242134063e-06, + "loss": 1.7097, + "step": 9744 + }, + { + "epoch": 0.83, + "grad_norm": 1.5253231525421143, + "learning_rate": 8.327633378932969e-06, + "loss": 1.9076, + "step": 9748 + }, + { + "epoch": 0.83, + "grad_norm": 1.6231822967529297, + "learning_rate": 8.310533515731875e-06, + "loss": 1.9026, + "step": 9752 + }, + { + "epoch": 0.83, + "grad_norm": 1.726797103881836, + "learning_rate": 8.29343365253078e-06, + "loss": 1.9518, + "step": 9756 + }, + { + "epoch": 0.83, + "grad_norm": 1.7323880195617676, + "learning_rate": 8.276333789329686e-06, + "loss": 1.9352, + "step": 9760 + }, + { + "epoch": 0.83, + "grad_norm": 1.7740181684494019, + "learning_rate": 8.259233926128592e-06, + "loss": 1.8697, + "step": 9764 + }, + { + "epoch": 0.84, + "grad_norm": 1.6174265146255493, + "learning_rate": 8.242134062927497e-06, + "loss": 1.8365, + "step": 9768 + }, + { + "epoch": 0.84, + "grad_norm": 1.7219985723495483, + "learning_rate": 8.225034199726403e-06, + "loss": 1.8343, + "step": 9772 + }, + { + "epoch": 0.84, + "grad_norm": 1.618703007698059, + "learning_rate": 8.207934336525308e-06, + "loss": 1.914, + "step": 9776 + }, + { + "epoch": 0.84, + "grad_norm": 1.5896012783050537, + "learning_rate": 8.190834473324212e-06, + "loss": 1.8311, + "step": 9780 + }, + { + "epoch": 0.84, + "grad_norm": 1.614791989326477, + "learning_rate": 8.17373461012312e-06, + "loss": 1.8189, + "step": 9784 + }, + { + "epoch": 0.84, + "grad_norm": 1.6827095746994019, + "learning_rate": 8.156634746922025e-06, + "loss": 1.7639, + "step": 9788 + }, + { + "epoch": 0.84, + "grad_norm": 1.563234806060791, + "learning_rate": 8.139534883720931e-06, + "loss": 1.8255, + "step": 9792 + }, + { + "epoch": 0.84, + "grad_norm": 1.871492624282837, + "learning_rate": 8.122435020519837e-06, + "loss": 1.7558, + "step": 9796 + }, + { + "epoch": 0.84, + "grad_norm": 1.827107548713684, + "learning_rate": 8.10533515731874e-06, + "loss": 1.93, + "step": 9800 + }, + { + "epoch": 0.84, + "grad_norm": 1.7132670879364014, + "learning_rate": 8.088235294117648e-06, + "loss": 1.9702, + "step": 9804 + }, + { + "epoch": 0.84, + "grad_norm": 1.71363365650177, + "learning_rate": 8.071135430916554e-06, + "loss": 1.9638, + "step": 9808 + }, + { + "epoch": 0.84, + "grad_norm": 1.6538909673690796, + "learning_rate": 8.054035567715458e-06, + "loss": 1.9329, + "step": 9812 + }, + { + "epoch": 0.84, + "grad_norm": 1.4397317171096802, + "learning_rate": 8.036935704514365e-06, + "loss": 1.7031, + "step": 9816 + }, + { + "epoch": 0.84, + "grad_norm": 1.7500611543655396, + "learning_rate": 8.019835841313269e-06, + "loss": 1.7927, + "step": 9820 + }, + { + "epoch": 0.84, + "grad_norm": 1.7213584184646606, + "learning_rate": 8.002735978112176e-06, + "loss": 1.8121, + "step": 9824 + }, + { + "epoch": 0.84, + "grad_norm": 1.8555186986923218, + "learning_rate": 7.985636114911082e-06, + "loss": 1.9388, + "step": 9828 + }, + { + "epoch": 0.84, + "grad_norm": 1.5887397527694702, + "learning_rate": 7.968536251709986e-06, + "loss": 1.8344, + "step": 9832 + }, + { + "epoch": 0.84, + "grad_norm": 1.5652546882629395, + "learning_rate": 7.951436388508893e-06, + "loss": 1.9696, + "step": 9836 + }, + { + "epoch": 0.84, + "grad_norm": 1.5642329454421997, + "learning_rate": 7.934336525307797e-06, + "loss": 1.6744, + "step": 9840 + }, + { + "epoch": 0.84, + "grad_norm": 1.631618618965149, + "learning_rate": 7.917236662106703e-06, + "loss": 1.8742, + "step": 9844 + }, + { + "epoch": 0.84, + "grad_norm": 1.6112003326416016, + "learning_rate": 7.90013679890561e-06, + "loss": 1.9281, + "step": 9848 + }, + { + "epoch": 0.84, + "grad_norm": 1.612426996231079, + "learning_rate": 7.883036935704514e-06, + "loss": 1.9576, + "step": 9852 + }, + { + "epoch": 0.84, + "grad_norm": 1.7601630687713623, + "learning_rate": 7.865937072503421e-06, + "loss": 1.7478, + "step": 9856 + }, + { + "epoch": 0.84, + "grad_norm": 1.782696008682251, + "learning_rate": 7.848837209302325e-06, + "loss": 1.8667, + "step": 9860 + }, + { + "epoch": 0.84, + "grad_norm": 1.716011881828308, + "learning_rate": 7.831737346101231e-06, + "loss": 1.9523, + "step": 9864 + }, + { + "epoch": 0.84, + "grad_norm": 1.5538578033447266, + "learning_rate": 7.814637482900137e-06, + "loss": 1.9909, + "step": 9868 + }, + { + "epoch": 0.84, + "grad_norm": 1.6781466007232666, + "learning_rate": 7.797537619699042e-06, + "loss": 1.7684, + "step": 9872 + }, + { + "epoch": 0.84, + "grad_norm": 1.6765227317810059, + "learning_rate": 7.780437756497948e-06, + "loss": 1.8663, + "step": 9876 + }, + { + "epoch": 0.84, + "grad_norm": 1.5819069147109985, + "learning_rate": 7.763337893296854e-06, + "loss": 1.8592, + "step": 9880 + }, + { + "epoch": 0.85, + "grad_norm": 1.6805880069732666, + "learning_rate": 7.74623803009576e-06, + "loss": 1.8612, + "step": 9884 + }, + { + "epoch": 0.85, + "grad_norm": 1.6755802631378174, + "learning_rate": 7.729138166894665e-06, + "loss": 1.8602, + "step": 9888 + }, + { + "epoch": 0.85, + "grad_norm": 1.7085371017456055, + "learning_rate": 7.71203830369357e-06, + "loss": 1.6679, + "step": 9892 + }, + { + "epoch": 0.85, + "grad_norm": 1.5899639129638672, + "learning_rate": 7.694938440492476e-06, + "loss": 1.7626, + "step": 9896 + }, + { + "epoch": 0.85, + "grad_norm": 1.740966796875, + "learning_rate": 7.677838577291382e-06, + "loss": 1.7984, + "step": 9900 + }, + { + "epoch": 0.85, + "grad_norm": 1.8925389051437378, + "learning_rate": 7.660738714090288e-06, + "loss": 1.8242, + "step": 9904 + }, + { + "epoch": 0.85, + "grad_norm": 1.7259442806243896, + "learning_rate": 7.643638850889193e-06, + "loss": 1.7816, + "step": 9908 + }, + { + "epoch": 0.85, + "grad_norm": 1.5764368772506714, + "learning_rate": 7.626538987688099e-06, + "loss": 1.7718, + "step": 9912 + }, + { + "epoch": 0.85, + "grad_norm": 1.7654069662094116, + "learning_rate": 7.6094391244870045e-06, + "loss": 1.7728, + "step": 9916 + }, + { + "epoch": 0.85, + "grad_norm": 1.5519089698791504, + "learning_rate": 7.592339261285911e-06, + "loss": 1.9182, + "step": 9920 + }, + { + "epoch": 0.85, + "grad_norm": 1.690704584121704, + "learning_rate": 7.575239398084816e-06, + "loss": 1.9885, + "step": 9924 + }, + { + "epoch": 0.85, + "grad_norm": 1.5261144638061523, + "learning_rate": 7.558139534883721e-06, + "loss": 1.7169, + "step": 9928 + }, + { + "epoch": 0.85, + "grad_norm": 1.5968029499053955, + "learning_rate": 7.541039671682627e-06, + "loss": 1.7766, + "step": 9932 + }, + { + "epoch": 0.85, + "grad_norm": 1.5991588830947876, + "learning_rate": 7.523939808481533e-06, + "loss": 1.8359, + "step": 9936 + }, + { + "epoch": 0.85, + "grad_norm": 1.591503620147705, + "learning_rate": 7.506839945280438e-06, + "loss": 1.7582, + "step": 9940 + }, + { + "epoch": 0.85, + "grad_norm": 1.5666191577911377, + "learning_rate": 7.489740082079344e-06, + "loss": 1.8947, + "step": 9944 + }, + { + "epoch": 0.85, + "grad_norm": 1.7336504459381104, + "learning_rate": 7.472640218878249e-06, + "loss": 1.8194, + "step": 9948 + }, + { + "epoch": 0.85, + "grad_norm": 1.7414416074752808, + "learning_rate": 7.455540355677155e-06, + "loss": 1.8543, + "step": 9952 + }, + { + "epoch": 0.85, + "grad_norm": 1.6664258241653442, + "learning_rate": 7.438440492476061e-06, + "loss": 1.786, + "step": 9956 + }, + { + "epoch": 0.85, + "grad_norm": 1.594941258430481, + "learning_rate": 7.421340629274966e-06, + "loss": 1.773, + "step": 9960 + }, + { + "epoch": 0.85, + "grad_norm": 1.71213698387146, + "learning_rate": 7.404240766073872e-06, + "loss": 1.9381, + "step": 9964 + }, + { + "epoch": 0.85, + "grad_norm": 1.6945880651474, + "learning_rate": 7.387140902872777e-06, + "loss": 1.7854, + "step": 9968 + }, + { + "epoch": 0.85, + "grad_norm": 1.71058189868927, + "learning_rate": 7.370041039671682e-06, + "loss": 1.8552, + "step": 9972 + }, + { + "epoch": 0.85, + "grad_norm": 1.8152309656143188, + "learning_rate": 7.3529411764705884e-06, + "loss": 1.8759, + "step": 9976 + }, + { + "epoch": 0.85, + "grad_norm": 1.6043202877044678, + "learning_rate": 7.335841313269494e-06, + "loss": 1.7945, + "step": 9980 + }, + { + "epoch": 0.85, + "grad_norm": 1.7119457721710205, + "learning_rate": 7.318741450068401e-06, + "loss": 1.9138, + "step": 9984 + }, + { + "epoch": 0.85, + "grad_norm": 1.9094306230545044, + "learning_rate": 7.301641586867305e-06, + "loss": 1.8234, + "step": 9988 + }, + { + "epoch": 0.85, + "grad_norm": 1.6789647340774536, + "learning_rate": 7.28454172366621e-06, + "loss": 1.8883, + "step": 9992 + }, + { + "epoch": 0.85, + "grad_norm": 1.6013123989105225, + "learning_rate": 7.267441860465117e-06, + "loss": 1.6778, + "step": 9996 + }, + { + "epoch": 0.85, + "grad_norm": 1.7987964153289795, + "learning_rate": 7.250341997264022e-06, + "loss": 1.6727, + "step": 10000 + }, + { + "epoch": 0.86, + "grad_norm": 1.6856164932250977, + "learning_rate": 7.233242134062927e-06, + "loss": 1.8289, + "step": 10004 + }, + { + "epoch": 0.86, + "grad_norm": 1.7940067052841187, + "learning_rate": 7.216142270861834e-06, + "loss": 1.917, + "step": 10008 + }, + { + "epoch": 0.86, + "grad_norm": 1.806894063949585, + "learning_rate": 7.1990424076607384e-06, + "loss": 1.9523, + "step": 10012 + }, + { + "epoch": 0.86, + "grad_norm": 1.6045538187026978, + "learning_rate": 7.181942544459645e-06, + "loss": 1.8155, + "step": 10016 + }, + { + "epoch": 0.86, + "grad_norm": 1.6840627193450928, + "learning_rate": 7.164842681258551e-06, + "loss": 1.87, + "step": 10020 + }, + { + "epoch": 0.86, + "grad_norm": 1.5683917999267578, + "learning_rate": 7.147742818057455e-06, + "loss": 1.9501, + "step": 10024 + }, + { + "epoch": 0.86, + "grad_norm": 1.6646571159362793, + "learning_rate": 7.130642954856362e-06, + "loss": 1.862, + "step": 10028 + }, + { + "epoch": 0.86, + "grad_norm": 1.6929519176483154, + "learning_rate": 7.113543091655267e-06, + "loss": 1.9485, + "step": 10032 + }, + { + "epoch": 0.86, + "grad_norm": 1.600963830947876, + "learning_rate": 7.096443228454172e-06, + "loss": 1.7745, + "step": 10036 + }, + { + "epoch": 0.86, + "grad_norm": 1.6967799663543701, + "learning_rate": 7.079343365253079e-06, + "loss": 1.9167, + "step": 10040 + }, + { + "epoch": 0.86, + "grad_norm": 1.6387646198272705, + "learning_rate": 7.062243502051984e-06, + "loss": 1.9051, + "step": 10044 + }, + { + "epoch": 0.86, + "grad_norm": 1.5275384187698364, + "learning_rate": 7.04514363885089e-06, + "loss": 1.9997, + "step": 10048 + }, + { + "epoch": 0.86, + "grad_norm": 1.677001953125, + "learning_rate": 7.028043775649795e-06, + "loss": 1.9359, + "step": 10052 + }, + { + "epoch": 0.86, + "grad_norm": 1.5836267471313477, + "learning_rate": 7.010943912448701e-06, + "loss": 1.7359, + "step": 10056 + }, + { + "epoch": 0.86, + "grad_norm": 1.7847533226013184, + "learning_rate": 6.993844049247607e-06, + "loss": 1.9568, + "step": 10060 + }, + { + "epoch": 0.86, + "grad_norm": 1.6765036582946777, + "learning_rate": 6.976744186046512e-06, + "loss": 1.8862, + "step": 10064 + }, + { + "epoch": 0.86, + "grad_norm": 1.8817429542541504, + "learning_rate": 6.959644322845417e-06, + "loss": 1.8935, + "step": 10068 + }, + { + "epoch": 0.86, + "grad_norm": 1.765724778175354, + "learning_rate": 6.942544459644323e-06, + "loss": 1.7135, + "step": 10072 + }, + { + "epoch": 0.86, + "grad_norm": 1.5911288261413574, + "learning_rate": 6.925444596443229e-06, + "loss": 1.7035, + "step": 10076 + }, + { + "epoch": 0.86, + "grad_norm": 1.738471508026123, + "learning_rate": 6.908344733242135e-06, + "loss": 1.7544, + "step": 10080 + }, + { + "epoch": 0.86, + "grad_norm": 1.719092845916748, + "learning_rate": 6.89124487004104e-06, + "loss": 1.8507, + "step": 10084 + }, + { + "epoch": 0.86, + "grad_norm": 1.786017656326294, + "learning_rate": 6.874145006839945e-06, + "loss": 1.9009, + "step": 10088 + }, + { + "epoch": 0.86, + "grad_norm": 2.0530200004577637, + "learning_rate": 6.8570451436388514e-06, + "loss": 1.8839, + "step": 10092 + }, + { + "epoch": 0.86, + "grad_norm": 1.6444154977798462, + "learning_rate": 6.839945280437757e-06, + "loss": 1.8756, + "step": 10096 + }, + { + "epoch": 0.86, + "grad_norm": 1.6261224746704102, + "learning_rate": 6.822845417236662e-06, + "loss": 1.6744, + "step": 10100 + }, + { + "epoch": 0.86, + "grad_norm": 1.6390396356582642, + "learning_rate": 6.805745554035568e-06, + "loss": 1.8752, + "step": 10104 + }, + { + "epoch": 0.86, + "grad_norm": 1.9215726852416992, + "learning_rate": 6.788645690834473e-06, + "loss": 1.9022, + "step": 10108 + }, + { + "epoch": 0.86, + "grad_norm": 1.839656949043274, + "learning_rate": 6.77154582763338e-06, + "loss": 2.1024, + "step": 10112 + }, + { + "epoch": 0.86, + "grad_norm": 1.6141258478164673, + "learning_rate": 6.754445964432285e-06, + "loss": 1.7609, + "step": 10116 + }, + { + "epoch": 0.87, + "grad_norm": 1.6340337991714478, + "learning_rate": 6.73734610123119e-06, + "loss": 1.8939, + "step": 10120 + }, + { + "epoch": 0.87, + "grad_norm": 1.529555082321167, + "learning_rate": 6.720246238030097e-06, + "loss": 1.7863, + "step": 10124 + }, + { + "epoch": 0.87, + "grad_norm": 1.670279860496521, + "learning_rate": 6.7031463748290014e-06, + "loss": 1.8213, + "step": 10128 + }, + { + "epoch": 0.87, + "grad_norm": 1.7106720209121704, + "learning_rate": 6.686046511627907e-06, + "loss": 1.6828, + "step": 10132 + }, + { + "epoch": 0.87, + "grad_norm": 1.8096644878387451, + "learning_rate": 6.668946648426814e-06, + "loss": 1.8081, + "step": 10136 + }, + { + "epoch": 0.87, + "grad_norm": 1.636301040649414, + "learning_rate": 6.651846785225718e-06, + "loss": 1.8518, + "step": 10140 + }, + { + "epoch": 0.87, + "grad_norm": 1.5952695608139038, + "learning_rate": 6.634746922024625e-06, + "loss": 1.7897, + "step": 10144 + }, + { + "epoch": 0.87, + "grad_norm": 1.5805269479751587, + "learning_rate": 6.61764705882353e-06, + "loss": 1.9542, + "step": 10148 + }, + { + "epoch": 0.87, + "grad_norm": 1.8776960372924805, + "learning_rate": 6.6005471956224345e-06, + "loss": 1.8719, + "step": 10152 + }, + { + "epoch": 0.87, + "grad_norm": 1.5122414827346802, + "learning_rate": 6.583447332421341e-06, + "loss": 1.7259, + "step": 10156 + }, + { + "epoch": 0.87, + "grad_norm": 1.7734562158584595, + "learning_rate": 6.566347469220247e-06, + "loss": 1.8812, + "step": 10160 + }, + { + "epoch": 0.87, + "grad_norm": 1.511674165725708, + "learning_rate": 6.5492476060191514e-06, + "loss": 1.841, + "step": 10164 + }, + { + "epoch": 0.87, + "grad_norm": 1.6405954360961914, + "learning_rate": 6.532147742818058e-06, + "loss": 1.8166, + "step": 10168 + }, + { + "epoch": 0.87, + "grad_norm": 1.822506070137024, + "learning_rate": 6.515047879616963e-06, + "loss": 1.8154, + "step": 10172 + }, + { + "epoch": 0.87, + "grad_norm": 1.6633646488189697, + "learning_rate": 6.497948016415869e-06, + "loss": 1.8579, + "step": 10176 + }, + { + "epoch": 0.87, + "grad_norm": 1.8290987014770508, + "learning_rate": 6.480848153214775e-06, + "loss": 1.8602, + "step": 10180 + }, + { + "epoch": 0.87, + "grad_norm": 1.6676056385040283, + "learning_rate": 6.46374829001368e-06, + "loss": 1.7908, + "step": 10184 + }, + { + "epoch": 0.87, + "grad_norm": 1.4981609582901, + "learning_rate": 6.446648426812586e-06, + "loss": 1.7605, + "step": 10188 + }, + { + "epoch": 0.87, + "grad_norm": 1.7770746946334839, + "learning_rate": 6.429548563611491e-06, + "loss": 2.005, + "step": 10192 + }, + { + "epoch": 0.87, + "grad_norm": 1.6636948585510254, + "learning_rate": 6.412448700410397e-06, + "loss": 1.9328, + "step": 10196 + }, + { + "epoch": 0.87, + "grad_norm": 1.5431137084960938, + "learning_rate": 6.395348837209303e-06, + "loss": 1.7199, + "step": 10200 + }, + { + "epoch": 0.87, + "grad_norm": 1.6235666275024414, + "learning_rate": 6.378248974008208e-06, + "loss": 1.8701, + "step": 10204 + }, + { + "epoch": 0.87, + "grad_norm": 1.7597670555114746, + "learning_rate": 6.3611491108071144e-06, + "loss": 1.7161, + "step": 10208 + }, + { + "epoch": 0.87, + "grad_norm": 1.8666725158691406, + "learning_rate": 6.344049247606019e-06, + "loss": 1.8283, + "step": 10212 + }, + { + "epoch": 0.87, + "grad_norm": 1.8828492164611816, + "learning_rate": 6.326949384404925e-06, + "loss": 1.9689, + "step": 10216 + }, + { + "epoch": 0.87, + "grad_norm": 1.539336919784546, + "learning_rate": 6.309849521203831e-06, + "loss": 1.731, + "step": 10220 + }, + { + "epoch": 0.87, + "grad_norm": 1.6746304035186768, + "learning_rate": 6.292749658002736e-06, + "loss": 1.7627, + "step": 10224 + }, + { + "epoch": 0.87, + "grad_norm": 1.6880711317062378, + "learning_rate": 6.275649794801641e-06, + "loss": 1.8081, + "step": 10228 + }, + { + "epoch": 0.87, + "grad_norm": 1.6686969995498657, + "learning_rate": 6.2585499316005475e-06, + "loss": 1.7955, + "step": 10232 + }, + { + "epoch": 0.88, + "grad_norm": 1.6514986753463745, + "learning_rate": 6.241450068399453e-06, + "loss": 1.7788, + "step": 10236 + }, + { + "epoch": 0.88, + "grad_norm": 1.5582215785980225, + "learning_rate": 6.224350205198359e-06, + "loss": 1.7514, + "step": 10240 + }, + { + "epoch": 0.88, + "grad_norm": 1.6330581903457642, + "learning_rate": 6.2072503419972644e-06, + "loss": 1.9055, + "step": 10244 + }, + { + "epoch": 0.88, + "grad_norm": 1.6368670463562012, + "learning_rate": 6.19015047879617e-06, + "loss": 1.732, + "step": 10248 + }, + { + "epoch": 0.88, + "grad_norm": 1.7110068798065186, + "learning_rate": 6.173050615595075e-06, + "loss": 1.9471, + "step": 10252 + }, + { + "epoch": 0.88, + "grad_norm": 1.7540807723999023, + "learning_rate": 6.155950752393981e-06, + "loss": 1.8072, + "step": 10256 + }, + { + "epoch": 0.88, + "grad_norm": 1.6526390314102173, + "learning_rate": 6.138850889192887e-06, + "loss": 1.7913, + "step": 10260 + }, + { + "epoch": 0.88, + "grad_norm": 1.7240028381347656, + "learning_rate": 6.121751025991793e-06, + "loss": 1.8526, + "step": 10264 + }, + { + "epoch": 0.88, + "grad_norm": 1.683828592300415, + "learning_rate": 6.1046511627906975e-06, + "loss": 1.7572, + "step": 10268 + }, + { + "epoch": 0.88, + "grad_norm": 1.5554591417312622, + "learning_rate": 6.087551299589603e-06, + "loss": 1.7571, + "step": 10272 + }, + { + "epoch": 0.88, + "grad_norm": 1.6821385622024536, + "learning_rate": 6.07045143638851e-06, + "loss": 1.7848, + "step": 10276 + }, + { + "epoch": 0.88, + "grad_norm": 1.6544996500015259, + "learning_rate": 6.053351573187415e-06, + "loss": 1.8252, + "step": 10280 + }, + { + "epoch": 0.88, + "grad_norm": 1.6138795614242554, + "learning_rate": 6.03625170998632e-06, + "loss": 1.979, + "step": 10284 + }, + { + "epoch": 0.88, + "grad_norm": 2.004995346069336, + "learning_rate": 6.019151846785226e-06, + "loss": 1.7996, + "step": 10288 + }, + { + "epoch": 0.88, + "grad_norm": 1.8927818536758423, + "learning_rate": 6.002051983584131e-06, + "loss": 1.8892, + "step": 10292 + }, + { + "epoch": 0.88, + "grad_norm": 1.9253323078155518, + "learning_rate": 5.984952120383038e-06, + "loss": 1.9798, + "step": 10296 + }, + { + "epoch": 0.88, + "grad_norm": 1.623855471611023, + "learning_rate": 5.967852257181943e-06, + "loss": 1.6606, + "step": 10300 + }, + { + "epoch": 0.88, + "grad_norm": 1.5401018857955933, + "learning_rate": 5.950752393980848e-06, + "loss": 1.7557, + "step": 10304 + }, + { + "epoch": 0.88, + "grad_norm": 1.643587350845337, + "learning_rate": 5.933652530779754e-06, + "loss": 1.9448, + "step": 10308 + }, + { + "epoch": 0.88, + "grad_norm": 1.7403227090835571, + "learning_rate": 5.91655266757866e-06, + "loss": 1.9609, + "step": 10312 + }, + { + "epoch": 0.88, + "grad_norm": 1.7716693878173828, + "learning_rate": 5.899452804377565e-06, + "loss": 1.8861, + "step": 10316 + }, + { + "epoch": 0.88, + "grad_norm": 1.7579643726348877, + "learning_rate": 5.882352941176471e-06, + "loss": 1.8461, + "step": 10320 + }, + { + "epoch": 0.88, + "grad_norm": 1.512486219406128, + "learning_rate": 5.8652530779753766e-06, + "loss": 1.8184, + "step": 10324 + }, + { + "epoch": 0.88, + "grad_norm": 1.8807471990585327, + "learning_rate": 5.848153214774282e-06, + "loss": 1.7981, + "step": 10328 + }, + { + "epoch": 0.88, + "grad_norm": 1.8313617706298828, + "learning_rate": 5.831053351573187e-06, + "loss": 1.7566, + "step": 10332 + }, + { + "epoch": 0.88, + "grad_norm": 1.7667449712753296, + "learning_rate": 5.8139534883720935e-06, + "loss": 1.8996, + "step": 10336 + }, + { + "epoch": 0.88, + "grad_norm": 1.5892364978790283, + "learning_rate": 5.796853625170999e-06, + "loss": 1.7042, + "step": 10340 + }, + { + "epoch": 0.88, + "grad_norm": 1.9081833362579346, + "learning_rate": 5.779753761969905e-06, + "loss": 2.0525, + "step": 10344 + }, + { + "epoch": 0.88, + "grad_norm": 1.891441822052002, + "learning_rate": 5.76265389876881e-06, + "loss": 1.7879, + "step": 10348 + }, + { + "epoch": 0.89, + "grad_norm": 1.9260497093200684, + "learning_rate": 5.745554035567715e-06, + "loss": 1.8796, + "step": 10352 + }, + { + "epoch": 0.89, + "grad_norm": 1.6760286092758179, + "learning_rate": 5.728454172366622e-06, + "loss": 1.9087, + "step": 10356 + }, + { + "epoch": 0.89, + "grad_norm": 1.6360900402069092, + "learning_rate": 5.711354309165527e-06, + "loss": 1.7715, + "step": 10360 + }, + { + "epoch": 0.89, + "grad_norm": 1.7029601335525513, + "learning_rate": 5.694254445964432e-06, + "loss": 1.7859, + "step": 10364 + }, + { + "epoch": 0.89, + "grad_norm": 1.5900228023529053, + "learning_rate": 5.677154582763338e-06, + "loss": 1.8456, + "step": 10368 + }, + { + "epoch": 0.89, + "grad_norm": 1.6234673261642456, + "learning_rate": 5.6600547195622435e-06, + "loss": 1.8007, + "step": 10372 + }, + { + "epoch": 0.89, + "grad_norm": 1.5690399408340454, + "learning_rate": 5.64295485636115e-06, + "loss": 1.8154, + "step": 10376 + }, + { + "epoch": 0.89, + "grad_norm": 1.622010588645935, + "learning_rate": 5.625854993160055e-06, + "loss": 1.9555, + "step": 10380 + }, + { + "epoch": 0.89, + "grad_norm": 1.7594480514526367, + "learning_rate": 5.6087551299589605e-06, + "loss": 1.9146, + "step": 10384 + }, + { + "epoch": 0.89, + "grad_norm": 1.644951581954956, + "learning_rate": 5.591655266757866e-06, + "loss": 1.9562, + "step": 10388 + }, + { + "epoch": 0.89, + "grad_norm": 1.6730928421020508, + "learning_rate": 5.574555403556772e-06, + "loss": 1.927, + "step": 10392 + }, + { + "epoch": 0.89, + "grad_norm": 1.749422311782837, + "learning_rate": 5.557455540355677e-06, + "loss": 1.8613, + "step": 10396 + }, + { + "epoch": 0.89, + "grad_norm": 1.6241000890731812, + "learning_rate": 5.540355677154583e-06, + "loss": 1.6963, + "step": 10400 + }, + { + "epoch": 0.89, + "grad_norm": 1.6478880643844604, + "learning_rate": 5.523255813953489e-06, + "loss": 1.7823, + "step": 10404 + }, + { + "epoch": 0.89, + "grad_norm": 1.8106017112731934, + "learning_rate": 5.506155950752394e-06, + "loss": 1.8875, + "step": 10408 + }, + { + "epoch": 0.89, + "grad_norm": 1.792172908782959, + "learning_rate": 5.4890560875513e-06, + "loss": 1.9241, + "step": 10412 + }, + { + "epoch": 0.89, + "grad_norm": 1.5553025007247925, + "learning_rate": 5.471956224350206e-06, + "loss": 1.7159, + "step": 10416 + }, + { + "epoch": 0.89, + "grad_norm": 1.5697897672653198, + "learning_rate": 5.454856361149111e-06, + "loss": 1.7839, + "step": 10420 + }, + { + "epoch": 0.89, + "grad_norm": 1.7735848426818848, + "learning_rate": 5.437756497948017e-06, + "loss": 1.7892, + "step": 10424 + }, + { + "epoch": 0.89, + "grad_norm": 1.8028829097747803, + "learning_rate": 5.420656634746922e-06, + "loss": 1.8241, + "step": 10428 + }, + { + "epoch": 0.89, + "grad_norm": 1.7222471237182617, + "learning_rate": 5.403556771545827e-06, + "loss": 1.7322, + "step": 10432 + }, + { + "epoch": 0.89, + "grad_norm": 1.5926076173782349, + "learning_rate": 5.386456908344734e-06, + "loss": 1.8576, + "step": 10436 + }, + { + "epoch": 0.89, + "grad_norm": 1.642992377281189, + "learning_rate": 5.3693570451436396e-06, + "loss": 1.83, + "step": 10440 + }, + { + "epoch": 0.89, + "grad_norm": 1.577621579170227, + "learning_rate": 5.352257181942544e-06, + "loss": 1.7914, + "step": 10444 + }, + { + "epoch": 0.89, + "grad_norm": 1.5518428087234497, + "learning_rate": 5.33515731874145e-06, + "loss": 1.9518, + "step": 10448 + }, + { + "epoch": 0.89, + "grad_norm": 1.6200405359268188, + "learning_rate": 5.318057455540356e-06, + "loss": 1.7868, + "step": 10452 + }, + { + "epoch": 0.89, + "grad_norm": 1.5001662969589233, + "learning_rate": 5.300957592339262e-06, + "loss": 1.8093, + "step": 10456 + }, + { + "epoch": 0.89, + "grad_norm": 1.6836313009262085, + "learning_rate": 5.283857729138167e-06, + "loss": 1.8562, + "step": 10460 + }, + { + "epoch": 0.89, + "grad_norm": 1.7168313264846802, + "learning_rate": 5.266757865937073e-06, + "loss": 1.8735, + "step": 10464 + }, + { + "epoch": 0.9, + "grad_norm": 1.7337522506713867, + "learning_rate": 5.249658002735978e-06, + "loss": 1.9054, + "step": 10468 + }, + { + "epoch": 0.9, + "grad_norm": 1.4755985736846924, + "learning_rate": 5.232558139534884e-06, + "loss": 1.8456, + "step": 10472 + }, + { + "epoch": 0.9, + "grad_norm": 1.7717063426971436, + "learning_rate": 5.2154582763337896e-06, + "loss": 1.8035, + "step": 10476 + }, + { + "epoch": 0.9, + "grad_norm": 1.8099918365478516, + "learning_rate": 5.198358413132695e-06, + "loss": 1.9099, + "step": 10480 + }, + { + "epoch": 0.9, + "grad_norm": 1.6142700910568237, + "learning_rate": 5.181258549931601e-06, + "loss": 1.7275, + "step": 10484 + }, + { + "epoch": 0.9, + "grad_norm": 1.6501797437667847, + "learning_rate": 5.1641586867305065e-06, + "loss": 1.7888, + "step": 10488 + }, + { + "epoch": 0.9, + "grad_norm": 1.6632269620895386, + "learning_rate": 5.147058823529412e-06, + "loss": 1.77, + "step": 10492 + }, + { + "epoch": 0.9, + "grad_norm": 1.698384165763855, + "learning_rate": 5.129958960328318e-06, + "loss": 1.7493, + "step": 10496 + }, + { + "epoch": 0.9, + "grad_norm": 1.6063936948776245, + "learning_rate": 5.1128590971272235e-06, + "loss": 1.7966, + "step": 10500 + }, + { + "epoch": 0.9, + "grad_norm": 1.9107542037963867, + "learning_rate": 5.095759233926129e-06, + "loss": 1.7634, + "step": 10504 + }, + { + "epoch": 0.9, + "grad_norm": 1.839381217956543, + "learning_rate": 5.078659370725034e-06, + "loss": 1.7991, + "step": 10508 + }, + { + "epoch": 0.9, + "grad_norm": 1.7109549045562744, + "learning_rate": 5.0615595075239396e-06, + "loss": 1.7911, + "step": 10512 + }, + { + "epoch": 0.9, + "grad_norm": 1.7641953229904175, + "learning_rate": 5.044459644322846e-06, + "loss": 1.9, + "step": 10516 + }, + { + "epoch": 0.9, + "grad_norm": 1.981584072113037, + "learning_rate": 5.027359781121752e-06, + "loss": 1.8533, + "step": 10520 + }, + { + "epoch": 0.9, + "grad_norm": 1.5387113094329834, + "learning_rate": 5.0102599179206565e-06, + "loss": 1.9877, + "step": 10524 + }, + { + "epoch": 0.9, + "grad_norm": 1.630399465560913, + "learning_rate": 4.993160054719562e-06, + "loss": 1.9056, + "step": 10528 + }, + { + "epoch": 0.9, + "grad_norm": 1.8931196928024292, + "learning_rate": 4.976060191518468e-06, + "loss": 1.7797, + "step": 10532 + }, + { + "epoch": 0.9, + "grad_norm": 1.5811591148376465, + "learning_rate": 4.958960328317374e-06, + "loss": 2.0371, + "step": 10536 + }, + { + "epoch": 0.9, + "grad_norm": 1.7181917428970337, + "learning_rate": 4.941860465116279e-06, + "loss": 1.7816, + "step": 10540 + }, + { + "epoch": 0.9, + "grad_norm": 1.6405460834503174, + "learning_rate": 4.924760601915185e-06, + "loss": 1.7458, + "step": 10544 + }, + { + "epoch": 0.9, + "grad_norm": 1.758260726928711, + "learning_rate": 4.90766073871409e-06, + "loss": 1.948, + "step": 10548 + }, + { + "epoch": 0.9, + "grad_norm": 1.845942497253418, + "learning_rate": 4.890560875512996e-06, + "loss": 1.7851, + "step": 10552 + }, + { + "epoch": 0.9, + "grad_norm": 1.79182767868042, + "learning_rate": 4.873461012311902e-06, + "loss": 1.8356, + "step": 10556 + }, + { + "epoch": 0.9, + "grad_norm": 1.8030176162719727, + "learning_rate": 4.856361149110807e-06, + "loss": 1.9056, + "step": 10560 + }, + { + "epoch": 0.9, + "grad_norm": 1.7074739933013916, + "learning_rate": 4.839261285909713e-06, + "loss": 1.8684, + "step": 10564 + }, + { + "epoch": 0.9, + "grad_norm": 1.5636216402053833, + "learning_rate": 4.822161422708619e-06, + "loss": 1.7258, + "step": 10568 + }, + { + "epoch": 0.9, + "grad_norm": 1.8994554281234741, + "learning_rate": 4.805061559507524e-06, + "loss": 1.8462, + "step": 10572 + }, + { + "epoch": 0.9, + "grad_norm": 1.5819472074508667, + "learning_rate": 4.78796169630643e-06, + "loss": 1.7744, + "step": 10576 + }, + { + "epoch": 0.9, + "grad_norm": 1.7214453220367432, + "learning_rate": 4.770861833105336e-06, + "loss": 1.8286, + "step": 10580 + }, + { + "epoch": 0.9, + "grad_norm": 1.6858314275741577, + "learning_rate": 4.753761969904241e-06, + "loss": 1.9328, + "step": 10584 + }, + { + "epoch": 0.91, + "grad_norm": 1.6816788911819458, + "learning_rate": 4.736662106703146e-06, + "loss": 1.9712, + "step": 10588 + }, + { + "epoch": 0.91, + "grad_norm": 1.5183528661727905, + "learning_rate": 4.7195622435020526e-06, + "loss": 1.8719, + "step": 10592 + }, + { + "epoch": 0.91, + "grad_norm": 1.7054622173309326, + "learning_rate": 4.702462380300958e-06, + "loss": 1.8244, + "step": 10596 + }, + { + "epoch": 0.91, + "grad_norm": 1.5704643726348877, + "learning_rate": 4.685362517099864e-06, + "loss": 1.7187, + "step": 10600 + }, + { + "epoch": 0.91, + "grad_norm": 1.616736888885498, + "learning_rate": 4.668262653898769e-06, + "loss": 1.9231, + "step": 10604 + }, + { + "epoch": 0.91, + "grad_norm": 1.8305636644363403, + "learning_rate": 4.651162790697674e-06, + "loss": 2.0724, + "step": 10608 + }, + { + "epoch": 0.91, + "grad_norm": 1.623712420463562, + "learning_rate": 4.63406292749658e-06, + "loss": 1.8901, + "step": 10612 + }, + { + "epoch": 0.91, + "grad_norm": 1.7231340408325195, + "learning_rate": 4.6169630642954865e-06, + "loss": 1.7852, + "step": 10616 + }, + { + "epoch": 0.91, + "grad_norm": 1.707742691040039, + "learning_rate": 4.599863201094391e-06, + "loss": 1.861, + "step": 10620 + }, + { + "epoch": 0.91, + "grad_norm": 1.6897032260894775, + "learning_rate": 4.582763337893297e-06, + "loss": 1.7987, + "step": 10624 + }, + { + "epoch": 0.91, + "grad_norm": 1.4974230527877808, + "learning_rate": 4.5656634746922026e-06, + "loss": 1.7377, + "step": 10628 + }, + { + "epoch": 0.91, + "grad_norm": 1.7821507453918457, + "learning_rate": 4.548563611491108e-06, + "loss": 1.7803, + "step": 10632 + }, + { + "epoch": 0.91, + "grad_norm": 1.7004159688949585, + "learning_rate": 4.531463748290014e-06, + "loss": 1.7409, + "step": 10636 + }, + { + "epoch": 0.91, + "grad_norm": 1.620969295501709, + "learning_rate": 4.5143638850889195e-06, + "loss": 1.6708, + "step": 10640 + }, + { + "epoch": 0.91, + "grad_norm": 1.6645839214324951, + "learning_rate": 4.497264021887825e-06, + "loss": 1.8713, + "step": 10644 + }, + { + "epoch": 0.91, + "grad_norm": 1.6770342588424683, + "learning_rate": 4.480164158686731e-06, + "loss": 1.9282, + "step": 10648 + }, + { + "epoch": 0.91, + "grad_norm": 1.7173173427581787, + "learning_rate": 4.4630642954856365e-06, + "loss": 1.8702, + "step": 10652 + }, + { + "epoch": 0.91, + "grad_norm": 1.7240304946899414, + "learning_rate": 4.445964432284542e-06, + "loss": 1.8032, + "step": 10656 + }, + { + "epoch": 0.91, + "grad_norm": 1.618517518043518, + "learning_rate": 4.428864569083448e-06, + "loss": 1.846, + "step": 10660 + }, + { + "epoch": 0.91, + "grad_norm": 1.6240915060043335, + "learning_rate": 4.411764705882353e-06, + "loss": 1.8224, + "step": 10664 + }, + { + "epoch": 0.91, + "grad_norm": 1.519727110862732, + "learning_rate": 4.394664842681258e-06, + "loss": 1.7819, + "step": 10668 + }, + { + "epoch": 0.91, + "grad_norm": 2.0412402153015137, + "learning_rate": 4.377564979480165e-06, + "loss": 1.8308, + "step": 10672 + }, + { + "epoch": 0.91, + "grad_norm": 1.5424792766571045, + "learning_rate": 4.36046511627907e-06, + "loss": 1.8432, + "step": 10676 + }, + { + "epoch": 0.91, + "grad_norm": 1.9428133964538574, + "learning_rate": 4.343365253077976e-06, + "loss": 1.6436, + "step": 10680 + }, + { + "epoch": 0.91, + "grad_norm": 1.6989853382110596, + "learning_rate": 4.326265389876881e-06, + "loss": 1.7265, + "step": 10684 + }, + { + "epoch": 0.91, + "grad_norm": 1.5763792991638184, + "learning_rate": 4.3091655266757865e-06, + "loss": 1.8635, + "step": 10688 + }, + { + "epoch": 0.91, + "grad_norm": 1.7506353855133057, + "learning_rate": 4.292065663474692e-06, + "loss": 1.83, + "step": 10692 + }, + { + "epoch": 0.91, + "grad_norm": 1.6734927892684937, + "learning_rate": 4.274965800273599e-06, + "loss": 1.8733, + "step": 10696 + }, + { + "epoch": 0.91, + "grad_norm": 1.7979787588119507, + "learning_rate": 4.257865937072503e-06, + "loss": 1.8805, + "step": 10700 + }, + { + "epoch": 0.92, + "grad_norm": 1.643833875656128, + "learning_rate": 4.240766073871409e-06, + "loss": 1.7028, + "step": 10704 + }, + { + "epoch": 0.92, + "grad_norm": 1.727108120918274, + "learning_rate": 4.223666210670315e-06, + "loss": 1.7918, + "step": 10708 + }, + { + "epoch": 0.92, + "grad_norm": 1.6539056301116943, + "learning_rate": 4.20656634746922e-06, + "loss": 1.8238, + "step": 10712 + }, + { + "epoch": 0.92, + "grad_norm": 1.7414677143096924, + "learning_rate": 4.189466484268126e-06, + "loss": 1.8796, + "step": 10716 + }, + { + "epoch": 0.92, + "grad_norm": 1.6490898132324219, + "learning_rate": 4.172366621067032e-06, + "loss": 1.7907, + "step": 10720 + }, + { + "epoch": 0.92, + "grad_norm": 1.7541985511779785, + "learning_rate": 4.155266757865937e-06, + "loss": 1.8669, + "step": 10724 + }, + { + "epoch": 0.92, + "grad_norm": 1.6836141347885132, + "learning_rate": 4.138166894664843e-06, + "loss": 1.7802, + "step": 10728 + }, + { + "epoch": 0.92, + "grad_norm": 2.008136034011841, + "learning_rate": 4.121067031463749e-06, + "loss": 1.9157, + "step": 10732 + }, + { + "epoch": 0.92, + "grad_norm": 1.7152680158615112, + "learning_rate": 4.103967168262654e-06, + "loss": 1.8344, + "step": 10736 + }, + { + "epoch": 0.92, + "grad_norm": 1.5120372772216797, + "learning_rate": 4.08686730506156e-06, + "loss": 1.7443, + "step": 10740 + }, + { + "epoch": 0.92, + "grad_norm": 1.5764340162277222, + "learning_rate": 4.0697674418604655e-06, + "loss": 1.7508, + "step": 10744 + }, + { + "epoch": 0.92, + "grad_norm": 1.6534470319747925, + "learning_rate": 4.05266757865937e-06, + "loss": 1.8064, + "step": 10748 + }, + { + "epoch": 0.92, + "grad_norm": 1.8456621170043945, + "learning_rate": 4.035567715458277e-06, + "loss": 1.8589, + "step": 10752 + }, + { + "epoch": 0.92, + "grad_norm": 1.6170421838760376, + "learning_rate": 4.0184678522571825e-06, + "loss": 1.6253, + "step": 10756 + }, + { + "epoch": 0.92, + "grad_norm": 1.5028194189071655, + "learning_rate": 4.001367989056088e-06, + "loss": 1.7209, + "step": 10760 + }, + { + "epoch": 0.92, + "grad_norm": 1.9968106746673584, + "learning_rate": 3.984268125854993e-06, + "loss": 1.9172, + "step": 10764 + }, + { + "epoch": 0.92, + "grad_norm": 1.7104765176773071, + "learning_rate": 3.967168262653899e-06, + "loss": 1.7745, + "step": 10768 + }, + { + "epoch": 0.92, + "grad_norm": 1.739295244216919, + "learning_rate": 3.950068399452805e-06, + "loss": 1.8637, + "step": 10772 + }, + { + "epoch": 0.92, + "grad_norm": 1.587810754776001, + "learning_rate": 3.932968536251711e-06, + "loss": 1.6962, + "step": 10776 + }, + { + "epoch": 0.92, + "grad_norm": 1.6938265562057495, + "learning_rate": 3.9158686730506155e-06, + "loss": 1.8125, + "step": 10780 + }, + { + "epoch": 0.92, + "grad_norm": 1.7163448333740234, + "learning_rate": 3.898768809849521e-06, + "loss": 1.8493, + "step": 10784 + }, + { + "epoch": 0.92, + "grad_norm": 1.6594278812408447, + "learning_rate": 3.881668946648427e-06, + "loss": 1.8301, + "step": 10788 + }, + { + "epoch": 0.92, + "grad_norm": 1.661649227142334, + "learning_rate": 3.8645690834473325e-06, + "loss": 1.774, + "step": 10792 + }, + { + "epoch": 0.92, + "grad_norm": 1.8990195989608765, + "learning_rate": 3.847469220246238e-06, + "loss": 1.8284, + "step": 10796 + }, + { + "epoch": 0.92, + "grad_norm": 1.8179618120193481, + "learning_rate": 3.830369357045144e-06, + "loss": 1.7436, + "step": 10800 + }, + { + "epoch": 0.92, + "grad_norm": 2.0388402938842773, + "learning_rate": 3.8132694938440494e-06, + "loss": 1.9214, + "step": 10804 + }, + { + "epoch": 0.92, + "grad_norm": 1.578235387802124, + "learning_rate": 3.7961696306429555e-06, + "loss": 1.8137, + "step": 10808 + }, + { + "epoch": 0.92, + "grad_norm": 1.9262685775756836, + "learning_rate": 3.7790697674418603e-06, + "loss": 1.8067, + "step": 10812 + }, + { + "epoch": 0.92, + "grad_norm": 1.7524725198745728, + "learning_rate": 3.7619699042407664e-06, + "loss": 2.018, + "step": 10816 + }, + { + "epoch": 0.93, + "grad_norm": 1.7184381484985352, + "learning_rate": 3.744870041039672e-06, + "loss": 1.8733, + "step": 10820 + }, + { + "epoch": 0.93, + "grad_norm": 1.626380443572998, + "learning_rate": 3.7277701778385777e-06, + "loss": 1.8379, + "step": 10824 + }, + { + "epoch": 0.93, + "grad_norm": 1.610595941543579, + "learning_rate": 3.710670314637483e-06, + "loss": 1.8246, + "step": 10828 + }, + { + "epoch": 0.93, + "grad_norm": 1.6897306442260742, + "learning_rate": 3.6935704514363886e-06, + "loss": 1.7103, + "step": 10832 + }, + { + "epoch": 0.93, + "grad_norm": 1.8274520635604858, + "learning_rate": 3.6764705882352942e-06, + "loss": 1.9273, + "step": 10836 + }, + { + "epoch": 0.93, + "grad_norm": 1.8843746185302734, + "learning_rate": 3.6593707250342003e-06, + "loss": 1.792, + "step": 10840 + }, + { + "epoch": 0.93, + "grad_norm": 3.664485216140747, + "learning_rate": 3.642270861833105e-06, + "loss": 2.0409, + "step": 10844 + }, + { + "epoch": 0.93, + "grad_norm": 1.5859229564666748, + "learning_rate": 3.625170998632011e-06, + "loss": 1.9006, + "step": 10848 + }, + { + "epoch": 0.93, + "grad_norm": 1.6038724184036255, + "learning_rate": 3.608071135430917e-06, + "loss": 1.8245, + "step": 10852 + }, + { + "epoch": 0.93, + "grad_norm": 1.6321396827697754, + "learning_rate": 3.5909712722298225e-06, + "loss": 1.8706, + "step": 10856 + }, + { + "epoch": 0.93, + "grad_norm": 1.7967422008514404, + "learning_rate": 3.5738714090287277e-06, + "loss": 1.86, + "step": 10860 + }, + { + "epoch": 0.93, + "grad_norm": 1.5631977319717407, + "learning_rate": 3.5567715458276333e-06, + "loss": 1.7408, + "step": 10864 + }, + { + "epoch": 0.93, + "grad_norm": 1.5402038097381592, + "learning_rate": 3.5396716826265394e-06, + "loss": 1.8824, + "step": 10868 + }, + { + "epoch": 0.93, + "grad_norm": 1.5307432413101196, + "learning_rate": 3.522571819425445e-06, + "loss": 1.7778, + "step": 10872 + }, + { + "epoch": 0.93, + "grad_norm": 1.6216157674789429, + "learning_rate": 3.5054719562243503e-06, + "loss": 1.6656, + "step": 10876 + }, + { + "epoch": 0.93, + "grad_norm": 1.6624993085861206, + "learning_rate": 3.488372093023256e-06, + "loss": 1.9027, + "step": 10880 + }, + { + "epoch": 0.93, + "grad_norm": 1.6463823318481445, + "learning_rate": 3.4712722298221616e-06, + "loss": 1.8464, + "step": 10884 + }, + { + "epoch": 0.93, + "grad_norm": 1.56813645362854, + "learning_rate": 3.4541723666210677e-06, + "loss": 1.8271, + "step": 10888 + }, + { + "epoch": 0.93, + "grad_norm": 1.8501081466674805, + "learning_rate": 3.4370725034199725e-06, + "loss": 1.9077, + "step": 10892 + }, + { + "epoch": 0.93, + "grad_norm": 1.9633204936981201, + "learning_rate": 3.4199726402188785e-06, + "loss": 1.8458, + "step": 10896 + }, + { + "epoch": 0.93, + "grad_norm": 1.8002575635910034, + "learning_rate": 3.402872777017784e-06, + "loss": 2.0061, + "step": 10900 + }, + { + "epoch": 0.93, + "grad_norm": 1.6839830875396729, + "learning_rate": 3.38577291381669e-06, + "loss": 1.8378, + "step": 10904 + }, + { + "epoch": 0.93, + "grad_norm": 1.6927244663238525, + "learning_rate": 3.368673050615595e-06, + "loss": 1.7045, + "step": 10908 + }, + { + "epoch": 0.93, + "grad_norm": 1.5076004266738892, + "learning_rate": 3.3515731874145007e-06, + "loss": 1.9581, + "step": 10912 + }, + { + "epoch": 0.93, + "grad_norm": 1.7239080667495728, + "learning_rate": 3.334473324213407e-06, + "loss": 1.9258, + "step": 10916 + }, + { + "epoch": 0.93, + "grad_norm": 1.8428497314453125, + "learning_rate": 3.3173734610123124e-06, + "loss": 1.8136, + "step": 10920 + }, + { + "epoch": 0.93, + "grad_norm": 1.6915169954299927, + "learning_rate": 3.3002735978112172e-06, + "loss": 1.9492, + "step": 10924 + }, + { + "epoch": 0.93, + "grad_norm": 1.641465425491333, + "learning_rate": 3.2831737346101233e-06, + "loss": 1.7496, + "step": 10928 + }, + { + "epoch": 0.93, + "grad_norm": 1.6878330707550049, + "learning_rate": 3.266073871409029e-06, + "loss": 1.85, + "step": 10932 + }, + { + "epoch": 0.94, + "grad_norm": 1.8498550653457642, + "learning_rate": 3.2489740082079346e-06, + "loss": 1.9107, + "step": 10936 + }, + { + "epoch": 0.94, + "grad_norm": 1.61210036277771, + "learning_rate": 3.23187414500684e-06, + "loss": 1.9388, + "step": 10940 + }, + { + "epoch": 0.94, + "grad_norm": 1.551893949508667, + "learning_rate": 3.2147742818057455e-06, + "loss": 1.8542, + "step": 10944 + }, + { + "epoch": 0.94, + "grad_norm": 1.587180733680725, + "learning_rate": 3.1976744186046516e-06, + "loss": 1.8516, + "step": 10948 + }, + { + "epoch": 0.94, + "grad_norm": 1.42110276222229, + "learning_rate": 3.1805745554035572e-06, + "loss": 1.7036, + "step": 10952 + }, + { + "epoch": 0.94, + "grad_norm": 1.7156518697738647, + "learning_rate": 3.1634746922024624e-06, + "loss": 1.8442, + "step": 10956 + }, + { + "epoch": 0.94, + "grad_norm": 1.6081629991531372, + "learning_rate": 3.146374829001368e-06, + "loss": 1.9073, + "step": 10960 + }, + { + "epoch": 0.94, + "grad_norm": 2.0794620513916016, + "learning_rate": 3.1292749658002737e-06, + "loss": 2.0049, + "step": 10964 + }, + { + "epoch": 0.94, + "grad_norm": 1.6746197938919067, + "learning_rate": 3.1121751025991794e-06, + "loss": 1.7059, + "step": 10968 + }, + { + "epoch": 0.94, + "grad_norm": 1.5624709129333496, + "learning_rate": 3.095075239398085e-06, + "loss": 1.8929, + "step": 10972 + }, + { + "epoch": 0.94, + "grad_norm": 1.6810204982757568, + "learning_rate": 3.0779753761969907e-06, + "loss": 1.8587, + "step": 10976 + }, + { + "epoch": 0.94, + "grad_norm": 1.586511492729187, + "learning_rate": 3.0608755129958963e-06, + "loss": 1.7846, + "step": 10980 + }, + { + "epoch": 0.94, + "grad_norm": 1.6255276203155518, + "learning_rate": 3.0437756497948016e-06, + "loss": 1.8619, + "step": 10984 + }, + { + "epoch": 0.94, + "grad_norm": 1.8912711143493652, + "learning_rate": 3.0266757865937076e-06, + "loss": 1.9116, + "step": 10988 + }, + { + "epoch": 0.94, + "grad_norm": 1.4620989561080933, + "learning_rate": 3.009575923392613e-06, + "loss": 1.9151, + "step": 10992 + }, + { + "epoch": 0.94, + "grad_norm": 1.5181527137756348, + "learning_rate": 2.992476060191519e-06, + "loss": 1.6923, + "step": 10996 + }, + { + "epoch": 0.94, + "grad_norm": 1.7841485738754272, + "learning_rate": 2.975376196990424e-06, + "loss": 1.7321, + "step": 11000 + }, + { + "epoch": 0.94, + "grad_norm": 1.8255186080932617, + "learning_rate": 2.95827633378933e-06, + "loss": 1.8564, + "step": 11004 + }, + { + "epoch": 0.94, + "grad_norm": 1.6331710815429688, + "learning_rate": 2.9411764705882355e-06, + "loss": 1.9261, + "step": 11008 + }, + { + "epoch": 0.94, + "grad_norm": 1.5975050926208496, + "learning_rate": 2.924076607387141e-06, + "loss": 1.7562, + "step": 11012 + }, + { + "epoch": 0.94, + "grad_norm": 1.5330755710601807, + "learning_rate": 2.9069767441860468e-06, + "loss": 1.7575, + "step": 11016 + }, + { + "epoch": 0.94, + "grad_norm": 1.7199543714523315, + "learning_rate": 2.8898768809849524e-06, + "loss": 1.8875, + "step": 11020 + }, + { + "epoch": 0.94, + "grad_norm": 1.7265928983688354, + "learning_rate": 2.8727770177838576e-06, + "loss": 1.9619, + "step": 11024 + }, + { + "epoch": 0.94, + "grad_norm": 1.955166220664978, + "learning_rate": 2.8556771545827637e-06, + "loss": 1.9266, + "step": 11028 + }, + { + "epoch": 0.94, + "grad_norm": 1.708550214767456, + "learning_rate": 2.838577291381669e-06, + "loss": 1.8194, + "step": 11032 + }, + { + "epoch": 0.94, + "grad_norm": 1.7049528360366821, + "learning_rate": 2.821477428180575e-06, + "loss": 1.728, + "step": 11036 + }, + { + "epoch": 0.94, + "grad_norm": 1.8736356496810913, + "learning_rate": 2.8043775649794802e-06, + "loss": 1.7949, + "step": 11040 + }, + { + "epoch": 0.94, + "grad_norm": 1.8903663158416748, + "learning_rate": 2.787277701778386e-06, + "loss": 1.7765, + "step": 11044 + }, + { + "epoch": 0.94, + "grad_norm": 1.690417766571045, + "learning_rate": 2.7701778385772915e-06, + "loss": 1.8932, + "step": 11048 + }, + { + "epoch": 0.94, + "grad_norm": 1.7920684814453125, + "learning_rate": 2.753077975376197e-06, + "loss": 1.7986, + "step": 11052 + }, + { + "epoch": 0.95, + "grad_norm": 1.771796703338623, + "learning_rate": 2.735978112175103e-06, + "loss": 1.7103, + "step": 11056 + }, + { + "epoch": 0.95, + "grad_norm": 1.51233971118927, + "learning_rate": 2.7188782489740085e-06, + "loss": 1.6045, + "step": 11060 + }, + { + "epoch": 0.95, + "grad_norm": 1.6907927989959717, + "learning_rate": 2.7017783857729137e-06, + "loss": 1.9416, + "step": 11064 + }, + { + "epoch": 0.95, + "grad_norm": 1.575386881828308, + "learning_rate": 2.6846785225718198e-06, + "loss": 1.7111, + "step": 11068 + }, + { + "epoch": 0.95, + "grad_norm": 2.0558063983917236, + "learning_rate": 2.667578659370725e-06, + "loss": 1.7934, + "step": 11072 + }, + { + "epoch": 0.95, + "grad_norm": 1.705366611480713, + "learning_rate": 2.650478796169631e-06, + "loss": 1.8026, + "step": 11076 + }, + { + "epoch": 0.95, + "grad_norm": 1.625901222229004, + "learning_rate": 2.6333789329685363e-06, + "loss": 1.9377, + "step": 11080 + }, + { + "epoch": 0.95, + "grad_norm": 1.7622089385986328, + "learning_rate": 2.616279069767442e-06, + "loss": 1.7597, + "step": 11084 + }, + { + "epoch": 0.95, + "grad_norm": 1.841213583946228, + "learning_rate": 2.5991792065663476e-06, + "loss": 1.901, + "step": 11088 + }, + { + "epoch": 0.95, + "grad_norm": 1.6358860731124878, + "learning_rate": 2.5820793433652533e-06, + "loss": 1.7777, + "step": 11092 + }, + { + "epoch": 0.95, + "grad_norm": 1.6823545694351196, + "learning_rate": 2.564979480164159e-06, + "loss": 1.9779, + "step": 11096 + }, + { + "epoch": 0.95, + "grad_norm": 1.8069067001342773, + "learning_rate": 2.5478796169630646e-06, + "loss": 1.9257, + "step": 11100 + }, + { + "epoch": 0.95, + "grad_norm": 1.4878418445587158, + "learning_rate": 2.5307797537619698e-06, + "loss": 1.8447, + "step": 11104 + }, + { + "epoch": 0.95, + "grad_norm": 1.6673344373703003, + "learning_rate": 2.513679890560876e-06, + "loss": 1.8752, + "step": 11108 + }, + { + "epoch": 0.95, + "grad_norm": 1.6673535108566284, + "learning_rate": 2.496580027359781e-06, + "loss": 1.8024, + "step": 11112 + }, + { + "epoch": 0.95, + "grad_norm": 1.7625112533569336, + "learning_rate": 2.479480164158687e-06, + "loss": 1.7451, + "step": 11116 + }, + { + "epoch": 0.95, + "grad_norm": 1.7562973499298096, + "learning_rate": 2.4623803009575924e-06, + "loss": 1.8505, + "step": 11120 + }, + { + "epoch": 0.95, + "grad_norm": 1.55601966381073, + "learning_rate": 2.445280437756498e-06, + "loss": 1.7542, + "step": 11124 + }, + { + "epoch": 0.95, + "grad_norm": 1.5536571741104126, + "learning_rate": 2.4281805745554037e-06, + "loss": 1.7571, + "step": 11128 + }, + { + "epoch": 0.95, + "grad_norm": 2.797058343887329, + "learning_rate": 2.4110807113543093e-06, + "loss": 1.8041, + "step": 11132 + }, + { + "epoch": 0.95, + "grad_norm": 1.5327272415161133, + "learning_rate": 2.393980848153215e-06, + "loss": 1.9662, + "step": 11136 + }, + { + "epoch": 0.95, + "grad_norm": 1.6519979238510132, + "learning_rate": 2.3768809849521206e-06, + "loss": 1.8325, + "step": 11140 + }, + { + "epoch": 0.95, + "grad_norm": 1.6532912254333496, + "learning_rate": 2.3597811217510263e-06, + "loss": 1.7309, + "step": 11144 + }, + { + "epoch": 0.95, + "grad_norm": 1.6513277292251587, + "learning_rate": 2.342681258549932e-06, + "loss": 1.8967, + "step": 11148 + }, + { + "epoch": 0.95, + "grad_norm": 2.01287579536438, + "learning_rate": 2.325581395348837e-06, + "loss": 1.8535, + "step": 11152 + }, + { + "epoch": 0.95, + "grad_norm": 1.7155896425247192, + "learning_rate": 2.3084815321477432e-06, + "loss": 1.9063, + "step": 11156 + }, + { + "epoch": 0.95, + "grad_norm": 1.802552342414856, + "learning_rate": 2.2913816689466485e-06, + "loss": 1.7353, + "step": 11160 + }, + { + "epoch": 0.95, + "grad_norm": 1.850523591041565, + "learning_rate": 2.274281805745554e-06, + "loss": 2.0235, + "step": 11164 + }, + { + "epoch": 0.95, + "grad_norm": 1.6676249504089355, + "learning_rate": 2.2571819425444598e-06, + "loss": 1.8164, + "step": 11168 + }, + { + "epoch": 0.96, + "grad_norm": 2.562305212020874, + "learning_rate": 2.2400820793433654e-06, + "loss": 1.8171, + "step": 11172 + }, + { + "epoch": 0.96, + "grad_norm": 1.9502869844436646, + "learning_rate": 2.222982216142271e-06, + "loss": 1.8298, + "step": 11176 + }, + { + "epoch": 0.96, + "grad_norm": 1.739216923713684, + "learning_rate": 2.2058823529411767e-06, + "loss": 1.7871, + "step": 11180 + }, + { + "epoch": 0.96, + "grad_norm": 1.595462441444397, + "learning_rate": 2.1887824897400824e-06, + "loss": 1.7876, + "step": 11184 + }, + { + "epoch": 0.96, + "grad_norm": 2.28205943107605, + "learning_rate": 2.171682626538988e-06, + "loss": 1.9108, + "step": 11188 + }, + { + "epoch": 0.96, + "grad_norm": 1.8560374975204468, + "learning_rate": 2.1545827633378932e-06, + "loss": 1.7791, + "step": 11192 + }, + { + "epoch": 0.96, + "grad_norm": 1.6630357503890991, + "learning_rate": 2.1374829001367993e-06, + "loss": 1.8153, + "step": 11196 + }, + { + "epoch": 0.96, + "grad_norm": 1.6874091625213623, + "learning_rate": 2.1203830369357045e-06, + "loss": 1.813, + "step": 11200 + }, + { + "epoch": 0.96, + "grad_norm": 1.5925251245498657, + "learning_rate": 2.10328317373461e-06, + "loss": 1.7251, + "step": 11204 + }, + { + "epoch": 0.96, + "grad_norm": 1.7215338945388794, + "learning_rate": 2.086183310533516e-06, + "loss": 1.8046, + "step": 11208 + }, + { + "epoch": 0.96, + "grad_norm": 1.6895273923873901, + "learning_rate": 2.0690834473324215e-06, + "loss": 1.8298, + "step": 11212 + }, + { + "epoch": 0.96, + "grad_norm": 1.641157865524292, + "learning_rate": 2.051983584131327e-06, + "loss": 1.7497, + "step": 11216 + }, + { + "epoch": 0.96, + "grad_norm": 1.756468653678894, + "learning_rate": 2.0348837209302328e-06, + "loss": 1.791, + "step": 11220 + }, + { + "epoch": 0.96, + "grad_norm": 1.7440533638000488, + "learning_rate": 2.0177838577291384e-06, + "loss": 1.7596, + "step": 11224 + }, + { + "epoch": 0.96, + "grad_norm": 1.4534015655517578, + "learning_rate": 2.000683994528044e-06, + "loss": 1.7385, + "step": 11228 + }, + { + "epoch": 0.96, + "grad_norm": 1.6513500213623047, + "learning_rate": 1.9835841313269493e-06, + "loss": 2.1346, + "step": 11232 + }, + { + "epoch": 0.96, + "grad_norm": 1.7908762693405151, + "learning_rate": 1.9664842681258554e-06, + "loss": 1.7386, + "step": 11236 + }, + { + "epoch": 0.96, + "grad_norm": 1.5934255123138428, + "learning_rate": 1.9493844049247606e-06, + "loss": 1.8361, + "step": 11240 + }, + { + "epoch": 0.96, + "grad_norm": 1.4890646934509277, + "learning_rate": 1.9322845417236662e-06, + "loss": 1.7226, + "step": 11244 + }, + { + "epoch": 0.96, + "grad_norm": 1.6311677694320679, + "learning_rate": 1.915184678522572e-06, + "loss": 1.7406, + "step": 11248 + }, + { + "epoch": 0.96, + "grad_norm": 1.662655234336853, + "learning_rate": 1.8980848153214778e-06, + "loss": 1.894, + "step": 11252 + }, + { + "epoch": 0.96, + "grad_norm": 1.6911251544952393, + "learning_rate": 1.8809849521203832e-06, + "loss": 2.0025, + "step": 11256 + }, + { + "epoch": 0.96, + "grad_norm": 1.5574413537979126, + "learning_rate": 1.8638850889192888e-06, + "loss": 1.7462, + "step": 11260 + }, + { + "epoch": 0.96, + "grad_norm": 1.699474573135376, + "learning_rate": 1.8467852257181943e-06, + "loss": 1.7242, + "step": 11264 + }, + { + "epoch": 0.96, + "grad_norm": 1.642033338546753, + "learning_rate": 1.8296853625171001e-06, + "loss": 1.7673, + "step": 11268 + }, + { + "epoch": 0.96, + "grad_norm": 1.6996281147003174, + "learning_rate": 1.8125854993160056e-06, + "loss": 1.8045, + "step": 11272 + }, + { + "epoch": 0.96, + "grad_norm": 1.6581562757492065, + "learning_rate": 1.7954856361149112e-06, + "loss": 1.9656, + "step": 11276 + }, + { + "epoch": 0.96, + "grad_norm": 1.6569535732269287, + "learning_rate": 1.7783857729138167e-06, + "loss": 1.9959, + "step": 11280 + }, + { + "epoch": 0.96, + "grad_norm": 1.7449274063110352, + "learning_rate": 1.7612859097127225e-06, + "loss": 1.8343, + "step": 11284 + }, + { + "epoch": 0.97, + "grad_norm": 1.6935420036315918, + "learning_rate": 1.744186046511628e-06, + "loss": 1.8053, + "step": 11288 + }, + { + "epoch": 0.97, + "grad_norm": 1.5281199216842651, + "learning_rate": 1.7270861833105338e-06, + "loss": 1.9919, + "step": 11292 + }, + { + "epoch": 0.97, + "grad_norm": 1.633588433265686, + "learning_rate": 1.7099863201094393e-06, + "loss": 1.8699, + "step": 11296 + }, + { + "epoch": 0.97, + "grad_norm": 1.7310950756072998, + "learning_rate": 1.692886456908345e-06, + "loss": 1.9074, + "step": 11300 + }, + { + "epoch": 0.97, + "grad_norm": 1.5915794372558594, + "learning_rate": 1.6757865937072504e-06, + "loss": 1.7659, + "step": 11304 + }, + { + "epoch": 0.97, + "grad_norm": 1.7064154148101807, + "learning_rate": 1.6586867305061562e-06, + "loss": 1.8796, + "step": 11308 + }, + { + "epoch": 0.97, + "grad_norm": 1.7068238258361816, + "learning_rate": 1.6415868673050617e-06, + "loss": 1.824, + "step": 11312 + }, + { + "epoch": 0.97, + "grad_norm": 1.686372995376587, + "learning_rate": 1.6244870041039673e-06, + "loss": 2.0173, + "step": 11316 + }, + { + "epoch": 0.97, + "grad_norm": 1.5897817611694336, + "learning_rate": 1.6073871409028727e-06, + "loss": 1.8182, + "step": 11320 + }, + { + "epoch": 0.97, + "grad_norm": 1.6900789737701416, + "learning_rate": 1.5902872777017786e-06, + "loss": 1.7615, + "step": 11324 + }, + { + "epoch": 0.97, + "grad_norm": 1.7033038139343262, + "learning_rate": 1.573187414500684e-06, + "loss": 1.7663, + "step": 11328 + }, + { + "epoch": 0.97, + "grad_norm": 1.661864161491394, + "learning_rate": 1.5560875512995897e-06, + "loss": 1.8007, + "step": 11332 + }, + { + "epoch": 0.97, + "grad_norm": 1.663250207901001, + "learning_rate": 1.5389876880984953e-06, + "loss": 1.7852, + "step": 11336 + }, + { + "epoch": 0.97, + "grad_norm": 1.7646623849868774, + "learning_rate": 1.5218878248974008e-06, + "loss": 1.7738, + "step": 11340 + }, + { + "epoch": 0.97, + "grad_norm": 1.6536412239074707, + "learning_rate": 1.5047879616963064e-06, + "loss": 1.8398, + "step": 11344 + }, + { + "epoch": 0.97, + "grad_norm": 1.726285457611084, + "learning_rate": 1.487688098495212e-06, + "loss": 1.7792, + "step": 11348 + }, + { + "epoch": 0.97, + "grad_norm": 1.7666223049163818, + "learning_rate": 1.4705882352941177e-06, + "loss": 1.9069, + "step": 11352 + }, + { + "epoch": 0.97, + "grad_norm": 1.7547391653060913, + "learning_rate": 1.4534883720930234e-06, + "loss": 1.8827, + "step": 11356 + }, + { + "epoch": 0.97, + "grad_norm": 1.8074520826339722, + "learning_rate": 1.4363885088919288e-06, + "loss": 1.8349, + "step": 11360 + }, + { + "epoch": 0.97, + "grad_norm": 1.5819740295410156, + "learning_rate": 1.4192886456908345e-06, + "loss": 2.0226, + "step": 11364 + }, + { + "epoch": 0.97, + "grad_norm": 1.4653937816619873, + "learning_rate": 1.4021887824897401e-06, + "loss": 1.7796, + "step": 11368 + }, + { + "epoch": 0.97, + "grad_norm": 1.7137483358383179, + "learning_rate": 1.3850889192886458e-06, + "loss": 1.9836, + "step": 11372 + }, + { + "epoch": 0.97, + "grad_norm": 1.5901538133621216, + "learning_rate": 1.3679890560875514e-06, + "loss": 1.7209, + "step": 11376 + }, + { + "epoch": 0.97, + "grad_norm": 1.5994963645935059, + "learning_rate": 1.3508891928864569e-06, + "loss": 1.8554, + "step": 11380 + }, + { + "epoch": 0.97, + "grad_norm": 1.578391194343567, + "learning_rate": 1.3337893296853625e-06, + "loss": 1.8694, + "step": 11384 + }, + { + "epoch": 0.97, + "grad_norm": 1.6012123823165894, + "learning_rate": 1.3166894664842682e-06, + "loss": 1.7565, + "step": 11388 + }, + { + "epoch": 0.97, + "grad_norm": 1.6539853811264038, + "learning_rate": 1.2995896032831738e-06, + "loss": 1.8733, + "step": 11392 + }, + { + "epoch": 0.97, + "grad_norm": 1.7771899700164795, + "learning_rate": 1.2824897400820795e-06, + "loss": 1.8847, + "step": 11396 + }, + { + "epoch": 0.97, + "grad_norm": 1.6242918968200684, + "learning_rate": 1.2653898768809849e-06, + "loss": 1.7995, + "step": 11400 + }, + { + "epoch": 0.98, + "grad_norm": 1.7309808731079102, + "learning_rate": 1.2482900136798905e-06, + "loss": 1.7911, + "step": 11404 + }, + { + "epoch": 0.98, + "grad_norm": 1.64167320728302, + "learning_rate": 1.2311901504787962e-06, + "loss": 1.8132, + "step": 11408 + }, + { + "epoch": 0.98, + "grad_norm": 1.7144200801849365, + "learning_rate": 1.2140902872777018e-06, + "loss": 1.8484, + "step": 11412 + }, + { + "epoch": 0.98, + "grad_norm": 1.498760461807251, + "learning_rate": 1.1969904240766075e-06, + "loss": 1.7778, + "step": 11416 + }, + { + "epoch": 0.98, + "grad_norm": 1.62421452999115, + "learning_rate": 1.1798905608755131e-06, + "loss": 1.8898, + "step": 11420 + }, + { + "epoch": 0.98, + "grad_norm": 1.7838332653045654, + "learning_rate": 1.1627906976744186e-06, + "loss": 1.8246, + "step": 11424 + }, + { + "epoch": 0.98, + "grad_norm": 1.7368948459625244, + "learning_rate": 1.1456908344733242e-06, + "loss": 1.9796, + "step": 11428 + }, + { + "epoch": 0.98, + "grad_norm": 1.5505352020263672, + "learning_rate": 1.1285909712722299e-06, + "loss": 1.8111, + "step": 11432 + }, + { + "epoch": 0.98, + "grad_norm": 1.7267060279846191, + "learning_rate": 1.1114911080711355e-06, + "loss": 1.7691, + "step": 11436 + }, + { + "epoch": 0.98, + "grad_norm": 1.6978029012680054, + "learning_rate": 1.0943912448700412e-06, + "loss": 1.7508, + "step": 11440 + }, + { + "epoch": 0.98, + "grad_norm": 1.665993571281433, + "learning_rate": 1.0772913816689466e-06, + "loss": 1.7835, + "step": 11444 + }, + { + "epoch": 0.98, + "grad_norm": 1.5911554098129272, + "learning_rate": 1.0601915184678523e-06, + "loss": 1.8659, + "step": 11448 + }, + { + "epoch": 0.98, + "grad_norm": 2.122206449508667, + "learning_rate": 1.043091655266758e-06, + "loss": 1.8262, + "step": 11452 + }, + { + "epoch": 0.98, + "grad_norm": 1.5077828168869019, + "learning_rate": 1.0259917920656636e-06, + "loss": 1.8085, + "step": 11456 + }, + { + "epoch": 0.98, + "grad_norm": 1.527305245399475, + "learning_rate": 1.0088919288645692e-06, + "loss": 1.7245, + "step": 11460 + }, + { + "epoch": 0.98, + "grad_norm": 1.8870925903320312, + "learning_rate": 9.917920656634746e-07, + "loss": 1.891, + "step": 11464 + }, + { + "epoch": 0.98, + "grad_norm": 1.66902756690979, + "learning_rate": 9.746922024623803e-07, + "loss": 1.8576, + "step": 11468 + }, + { + "epoch": 0.98, + "grad_norm": 1.6984986066818237, + "learning_rate": 9.57592339261286e-07, + "loss": 1.711, + "step": 11472 + }, + { + "epoch": 0.98, + "grad_norm": 1.8500134944915771, + "learning_rate": 9.404924760601916e-07, + "loss": 1.83, + "step": 11476 + }, + { + "epoch": 0.98, + "grad_norm": 1.5974342823028564, + "learning_rate": 9.233926128590971e-07, + "loss": 1.7812, + "step": 11480 + }, + { + "epoch": 0.98, + "grad_norm": 1.678200364112854, + "learning_rate": 9.062927496580028e-07, + "loss": 1.8443, + "step": 11484 + }, + { + "epoch": 0.98, + "grad_norm": 1.766434907913208, + "learning_rate": 8.891928864569083e-07, + "loss": 1.9573, + "step": 11488 + }, + { + "epoch": 0.98, + "grad_norm": 1.7245994806289673, + "learning_rate": 8.72093023255814e-07, + "loss": 1.8823, + "step": 11492 + }, + { + "epoch": 0.98, + "grad_norm": 1.598664402961731, + "learning_rate": 8.549931600547196e-07, + "loss": 1.78, + "step": 11496 + }, + { + "epoch": 0.98, + "grad_norm": 1.5603172779083252, + "learning_rate": 8.378932968536252e-07, + "loss": 1.7606, + "step": 11500 + }, + { + "epoch": 0.98, + "grad_norm": 1.5893508195877075, + "learning_rate": 8.207934336525308e-07, + "loss": 1.7072, + "step": 11504 + }, + { + "epoch": 0.98, + "grad_norm": 1.6806880235671997, + "learning_rate": 8.036935704514364e-07, + "loss": 1.9592, + "step": 11508 + }, + { + "epoch": 0.98, + "grad_norm": 1.6872777938842773, + "learning_rate": 7.86593707250342e-07, + "loss": 1.8096, + "step": 11512 + }, + { + "epoch": 0.98, + "grad_norm": 1.6055060625076294, + "learning_rate": 7.694938440492477e-07, + "loss": 1.991, + "step": 11516 + }, + { + "epoch": 0.98, + "grad_norm": 1.5616644620895386, + "learning_rate": 7.523939808481532e-07, + "loss": 1.8996, + "step": 11520 + }, + { + "epoch": 0.99, + "grad_norm": 1.7734068632125854, + "learning_rate": 7.352941176470589e-07, + "loss": 1.8998, + "step": 11524 + }, + { + "epoch": 0.99, + "grad_norm": 1.7730764150619507, + "learning_rate": 7.181942544459644e-07, + "loss": 1.82, + "step": 11528 + }, + { + "epoch": 0.99, + "grad_norm": 1.6176447868347168, + "learning_rate": 7.010943912448701e-07, + "loss": 1.6604, + "step": 11532 + }, + { + "epoch": 0.99, + "grad_norm": 1.6673433780670166, + "learning_rate": 6.839945280437757e-07, + "loss": 1.8491, + "step": 11536 + }, + { + "epoch": 0.99, + "grad_norm": 1.7543388605117798, + "learning_rate": 6.668946648426813e-07, + "loss": 1.9856, + "step": 11540 + }, + { + "epoch": 0.99, + "grad_norm": 1.687723159790039, + "learning_rate": 6.497948016415869e-07, + "loss": 1.7666, + "step": 11544 + }, + { + "epoch": 0.99, + "grad_norm": 1.6949374675750732, + "learning_rate": 6.326949384404924e-07, + "loss": 1.7367, + "step": 11548 + }, + { + "epoch": 0.99, + "grad_norm": 1.88869047164917, + "learning_rate": 6.155950752393981e-07, + "loss": 1.8341, + "step": 11552 + }, + { + "epoch": 0.99, + "grad_norm": 1.616167426109314, + "learning_rate": 5.984952120383037e-07, + "loss": 1.8967, + "step": 11556 + }, + { + "epoch": 0.99, + "grad_norm": 1.6819902658462524, + "learning_rate": 5.813953488372093e-07, + "loss": 1.6976, + "step": 11560 + }, + { + "epoch": 0.99, + "grad_norm": 1.628939151763916, + "learning_rate": 5.642954856361149e-07, + "loss": 1.7678, + "step": 11564 + }, + { + "epoch": 0.99, + "grad_norm": 1.9879276752471924, + "learning_rate": 5.471956224350206e-07, + "loss": 1.8467, + "step": 11568 + }, + { + "epoch": 0.99, + "grad_norm": 1.9371817111968994, + "learning_rate": 5.300957592339261e-07, + "loss": 1.8366, + "step": 11572 + }, + { + "epoch": 0.99, + "grad_norm": 1.7080752849578857, + "learning_rate": 5.129958960328318e-07, + "loss": 1.8151, + "step": 11576 + }, + { + "epoch": 0.99, + "grad_norm": 1.6665830612182617, + "learning_rate": 4.958960328317373e-07, + "loss": 1.7895, + "step": 11580 + }, + { + "epoch": 0.99, + "grad_norm": 1.5489290952682495, + "learning_rate": 4.78796169630643e-07, + "loss": 1.7158, + "step": 11584 + }, + { + "epoch": 0.99, + "grad_norm": 1.6452659368515015, + "learning_rate": 4.6169630642954857e-07, + "loss": 1.9806, + "step": 11588 + }, + { + "epoch": 0.99, + "grad_norm": 1.5039621591567993, + "learning_rate": 4.4459644322845417e-07, + "loss": 1.6445, + "step": 11592 + }, + { + "epoch": 0.99, + "grad_norm": 1.6051033735275269, + "learning_rate": 4.274965800273598e-07, + "loss": 1.812, + "step": 11596 + }, + { + "epoch": 0.99, + "grad_norm": 1.5924128293991089, + "learning_rate": 4.103967168262654e-07, + "loss": 1.7894, + "step": 11600 + }, + { + "epoch": 0.99, + "grad_norm": 1.752646565437317, + "learning_rate": 3.93296853625171e-07, + "loss": 1.7116, + "step": 11604 + }, + { + "epoch": 0.99, + "grad_norm": 1.6699579954147339, + "learning_rate": 3.761969904240766e-07, + "loss": 1.7171, + "step": 11608 + }, + { + "epoch": 0.99, + "grad_norm": 1.7405366897583008, + "learning_rate": 3.590971272229822e-07, + "loss": 1.9161, + "step": 11612 + }, + { + "epoch": 0.99, + "grad_norm": 1.5944126844406128, + "learning_rate": 3.4199726402188785e-07, + "loss": 1.758, + "step": 11616 + }, + { + "epoch": 0.99, + "grad_norm": 1.7876613140106201, + "learning_rate": 3.2489740082079345e-07, + "loss": 1.8338, + "step": 11620 + }, + { + "epoch": 0.99, + "grad_norm": 1.640367031097412, + "learning_rate": 3.0779753761969905e-07, + "loss": 1.8632, + "step": 11624 + }, + { + "epoch": 0.99, + "grad_norm": 1.646633267402649, + "learning_rate": 2.9069767441860464e-07, + "loss": 1.8923, + "step": 11628 + }, + { + "epoch": 0.99, + "grad_norm": 1.551578402519226, + "learning_rate": 2.735978112175103e-07, + "loss": 1.8043, + "step": 11632 + }, + { + "epoch": 0.99, + "grad_norm": 2.0039708614349365, + "learning_rate": 2.564979480164159e-07, + "loss": 1.8416, + "step": 11636 + }, + { + "epoch": 1.0, + "grad_norm": 1.6551536321640015, + "learning_rate": 2.393980848153215e-07, + "loss": 1.8004, + "step": 11640 + }, + { + "epoch": 1.0, + "grad_norm": 1.774258017539978, + "learning_rate": 2.2229822161422708e-07, + "loss": 1.6694, + "step": 11644 + }, + { + "epoch": 1.0, + "grad_norm": 1.6978957653045654, + "learning_rate": 2.051983584131327e-07, + "loss": 1.9608, + "step": 11648 + }, + { + "epoch": 1.0, + "grad_norm": 1.5244742631912231, + "learning_rate": 1.880984952120383e-07, + "loss": 1.749, + "step": 11652 + }, + { + "epoch": 1.0, + "grad_norm": 1.7631537914276123, + "learning_rate": 1.7099863201094393e-07, + "loss": 1.9072, + "step": 11656 + }, + { + "epoch": 1.0, + "grad_norm": 1.6298117637634277, + "learning_rate": 1.5389876880984952e-07, + "loss": 1.8008, + "step": 11660 + }, + { + "epoch": 1.0, + "grad_norm": 1.6735055446624756, + "learning_rate": 1.3679890560875515e-07, + "loss": 1.7698, + "step": 11664 + }, + { + "epoch": 1.0, + "grad_norm": 1.6959282159805298, + "learning_rate": 1.1969904240766074e-07, + "loss": 1.7243, + "step": 11668 + }, + { + "epoch": 1.0, + "grad_norm": 1.6977593898773193, + "learning_rate": 1.0259917920656635e-07, + "loss": 1.7902, + "step": 11672 + }, + { + "epoch": 1.0, + "grad_norm": 1.5963069200515747, + "learning_rate": 8.549931600547196e-08, + "loss": 1.6623, + "step": 11676 + }, + { + "epoch": 1.0, + "grad_norm": 1.5912084579467773, + "learning_rate": 6.839945280437757e-08, + "loss": 1.8806, + "step": 11680 + }, + { + "epoch": 1.0, + "grad_norm": 1.8129706382751465, + "learning_rate": 5.129958960328318e-08, + "loss": 1.9337, + "step": 11684 + }, + { + "epoch": 1.0, + "grad_norm": 1.7241120338439941, + "learning_rate": 3.419972640218879e-08, + "loss": 1.9163, + "step": 11688 + }, + { + "epoch": 1.0, + "grad_norm": 1.5432144403457642, + "learning_rate": 1.7099863201094393e-08, + "loss": 1.6597, + "step": 11692 + }, + { + "epoch": 1.0, + "grad_norm": 1.5885206460952759, + "learning_rate": 0.0, + "loss": 1.9262, + "step": 11696 + }, + { + "epoch": 1.0, + "step": 11696, + "total_flos": 1.5848503128568627e+17, + "train_loss": 2.3174790558589957, + "train_runtime": 2998.5108, + "train_samples_per_second": 31.202, + "train_steps_per_second": 3.901 + } + ], + "logging_steps": 4, + "max_steps": 11696, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1170, + "total_flos": 1.5848503128568627e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}