diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,74597 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 53244, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1659.2270556222848, + "learning_rate": 1.8779342723004697e-09, + "loss": 16.4902, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1882.7565013569613, + "learning_rate": 9.389671361502349e-09, + "loss": 17.2082, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 1713.0057743789032, + "learning_rate": 1.8779342723004698e-08, + "loss": 17.0727, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 1843.3418373795246, + "learning_rate": 2.8169014084507045e-08, + "loss": 16.8226, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 1813.3767547899452, + "learning_rate": 3.7558685446009395e-08, + "loss": 16.9255, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 1678.219429048779, + "learning_rate": 4.694835680751174e-08, + "loss": 16.452, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 1636.6897483112282, + "learning_rate": 5.633802816901409e-08, + "loss": 15.6666, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 1318.3277535950876, + "learning_rate": 6.572769953051644e-08, + "loss": 15.4184, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1441.7068876937751, + "learning_rate": 7.511737089201879e-08, + "loss": 14.4912, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 1292.3376716271725, + "learning_rate": 8.450704225352113e-08, + "loss": 13.7669, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 1025.333612889984, + "learning_rate": 9.389671361502348e-08, + "loss": 12.0966, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 1048.3141228926484, + "learning_rate": 1.0328638497652583e-07, + "loss": 11.3846, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 953.3406257911793, + "learning_rate": 1.1267605633802818e-07, + "loss": 10.3368, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 704.4452333942636, + "learning_rate": 1.2206572769953052e-07, + "loss": 9.1991, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 676.9230725364876, + "learning_rate": 1.3145539906103288e-07, + "loss": 8.0264, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 526.1271016517904, + "learning_rate": 1.4084507042253522e-07, + "loss": 7.0255, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 443.4611422903287, + "learning_rate": 1.5023474178403758e-07, + "loss": 6.3881, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 370.18793921907417, + "learning_rate": 1.5962441314553992e-07, + "loss": 5.8924, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 335.7407410048187, + "learning_rate": 1.6901408450704225e-07, + "loss": 5.5044, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 299.98221531514986, + "learning_rate": 1.7840375586854462e-07, + "loss": 5.195, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 236.48950000745847, + "learning_rate": 1.8779342723004696e-07, + "loss": 4.8934, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 230.23667875404382, + "learning_rate": 1.9718309859154932e-07, + "loss": 4.6712, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 281.23701010806394, + "learning_rate": 2.0657276995305166e-07, + "loss": 4.5184, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 193.95287916413935, + "learning_rate": 2.1596244131455402e-07, + "loss": 4.1266, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 185.3029747883216, + "learning_rate": 2.2535211267605636e-07, + "loss": 3.8772, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 173.66612187024688, + "learning_rate": 2.347417840375587e-07, + "loss": 3.7705, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 172.94071057332368, + "learning_rate": 2.4413145539906103e-07, + "loss": 3.4679, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 194.94757041376008, + "learning_rate": 2.535211267605634e-07, + "loss": 3.2841, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 166.56693420215467, + "learning_rate": 2.6291079812206576e-07, + "loss": 3.0015, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 143.23989029889705, + "learning_rate": 2.7230046948356807e-07, + "loss": 2.8706, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 107.66554276610727, + "learning_rate": 2.8169014084507043e-07, + "loss": 2.7523, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 98.96870952867148, + "learning_rate": 2.910798122065728e-07, + "loss": 2.4828, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 93.48328458386071, + "learning_rate": 3.0046948356807516e-07, + "loss": 2.5108, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 84.45473295814509, + "learning_rate": 3.0985915492957747e-07, + "loss": 2.382, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 72.72398209693705, + "learning_rate": 3.1924882629107983e-07, + "loss": 2.4057, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 59.77425684524275, + "learning_rate": 3.286384976525822e-07, + "loss": 2.4874, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 54.25859607523927, + "learning_rate": 3.380281690140845e-07, + "loss": 2.2608, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 62.59008628669073, + "learning_rate": 3.4741784037558687e-07, + "loss": 2.1518, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 61.748549974783444, + "learning_rate": 3.5680751173708924e-07, + "loss": 2.2213, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 53.17144499215813, + "learning_rate": 3.661971830985916e-07, + "loss": 2.2406, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 46.29529444440863, + "learning_rate": 3.755868544600939e-07, + "loss": 2.0733, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 46.584758261966314, + "learning_rate": 3.849765258215963e-07, + "loss": 2.0223, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 44.768233137428126, + "learning_rate": 3.9436619718309864e-07, + "loss": 2.0254, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 40.74425845277057, + "learning_rate": 4.0375586854460095e-07, + "loss": 2.0132, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 50.58164313855809, + "learning_rate": 4.131455399061033e-07, + "loss": 2.0735, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 36.21383505508657, + "learning_rate": 4.225352112676057e-07, + "loss": 2.0147, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 35.515462287966336, + "learning_rate": 4.3192488262910804e-07, + "loss": 2.0923, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 33.58221330168005, + "learning_rate": 4.413145539906104e-07, + "loss": 1.882, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 27.532017516063455, + "learning_rate": 4.507042253521127e-07, + "loss": 1.8617, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 37.96487393536598, + "learning_rate": 4.600938967136151e-07, + "loss": 1.8915, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 27.330095839009893, + "learning_rate": 4.694835680751174e-07, + "loss": 1.8524, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 29.94842775461955, + "learning_rate": 4.788732394366198e-07, + "loss": 1.715, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 26.766016728849173, + "learning_rate": 4.882629107981221e-07, + "loss": 1.7667, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 23.463242568186832, + "learning_rate": 4.976525821596245e-07, + "loss": 1.7974, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 25.118375474568996, + "learning_rate": 5.070422535211268e-07, + "loss": 1.698, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 22.72813701469559, + "learning_rate": 5.164319248826292e-07, + "loss": 1.8417, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 21.406810690811522, + "learning_rate": 5.258215962441315e-07, + "loss": 1.6796, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 23.125783926767337, + "learning_rate": 5.352112676056338e-07, + "loss": 1.6856, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 25.995319572484725, + "learning_rate": 5.446009389671361e-07, + "loss": 1.6726, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 21.243142035631116, + "learning_rate": 5.539906103286386e-07, + "loss": 1.7066, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 18.67820611891104, + "learning_rate": 5.633802816901409e-07, + "loss": 1.6953, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 18.376636526861923, + "learning_rate": 5.727699530516433e-07, + "loss": 1.7589, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 19.02065083625833, + "learning_rate": 5.821596244131456e-07, + "loss": 1.6449, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 21.58201361484835, + "learning_rate": 5.915492957746479e-07, + "loss": 1.6097, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 18.19599497843043, + "learning_rate": 6.009389671361503e-07, + "loss": 1.6031, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 15.046247584431306, + "learning_rate": 6.103286384976526e-07, + "loss": 1.6507, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 25.399110107235458, + "learning_rate": 6.197183098591549e-07, + "loss": 1.5415, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 14.82291205526455, + "learning_rate": 6.291079812206573e-07, + "loss": 1.6317, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 21.021949210423973, + "learning_rate": 6.384976525821597e-07, + "loss": 1.7103, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 18.150920968613, + "learning_rate": 6.47887323943662e-07, + "loss": 1.6881, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 17.333198402713048, + "learning_rate": 6.572769953051644e-07, + "loss": 1.6202, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 19.8010389586852, + "learning_rate": 6.666666666666667e-07, + "loss": 1.6292, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 20.00157539248203, + "learning_rate": 6.76056338028169e-07, + "loss": 1.5772, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 19.62853873047387, + "learning_rate": 6.854460093896714e-07, + "loss": 1.5247, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 15.193991835422885, + "learning_rate": 6.948356807511737e-07, + "loss": 1.6593, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 24.393541649217944, + "learning_rate": 7.042253521126762e-07, + "loss": 1.6113, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 16.31644211061861, + "learning_rate": 7.136150234741785e-07, + "loss": 1.6773, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 15.691680121843476, + "learning_rate": 7.230046948356809e-07, + "loss": 1.5878, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 20.04497999909259, + "learning_rate": 7.323943661971832e-07, + "loss": 1.5604, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 16.41186846019421, + "learning_rate": 7.417840375586854e-07, + "loss": 1.6153, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 13.83632590811489, + "learning_rate": 7.511737089201878e-07, + "loss": 1.435, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 17.278070590704782, + "learning_rate": 7.605633802816901e-07, + "loss": 1.5988, + "step": 405 + }, + { + "epoch": 0.02, + "grad_norm": 16.70233274567926, + "learning_rate": 7.699530516431925e-07, + "loss": 1.482, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 18.57338520068876, + "learning_rate": 7.793427230046949e-07, + "loss": 1.6485, + "step": 415 + }, + { + "epoch": 0.02, + "grad_norm": 13.885891113723483, + "learning_rate": 7.887323943661973e-07, + "loss": 1.5797, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 13.819451503329113, + "learning_rate": 7.981220657276996e-07, + "loss": 1.6249, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 16.976777618671832, + "learning_rate": 8.075117370892019e-07, + "loss": 1.6556, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 14.54294827649258, + "learning_rate": 8.169014084507043e-07, + "loss": 1.4836, + "step": 435 + }, + { + "epoch": 0.02, + "grad_norm": 12.521234608810484, + "learning_rate": 8.262910798122066e-07, + "loss": 1.4876, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 19.635846092765583, + "learning_rate": 8.35680751173709e-07, + "loss": 1.5898, + "step": 445 + }, + { + "epoch": 0.03, + "grad_norm": 20.053185201621453, + "learning_rate": 8.450704225352114e-07, + "loss": 1.5442, + "step": 450 + }, + { + "epoch": 0.03, + "grad_norm": 16.423021512749994, + "learning_rate": 8.544600938967138e-07, + "loss": 1.4988, + "step": 455 + }, + { + "epoch": 0.03, + "grad_norm": 14.495061538407052, + "learning_rate": 8.638497652582161e-07, + "loss": 1.4259, + "step": 460 + }, + { + "epoch": 0.03, + "grad_norm": 16.850361920318438, + "learning_rate": 8.732394366197183e-07, + "loss": 1.5186, + "step": 465 + }, + { + "epoch": 0.03, + "grad_norm": 13.578056106424471, + "learning_rate": 8.826291079812208e-07, + "loss": 1.4974, + "step": 470 + }, + { + "epoch": 0.03, + "grad_norm": 14.858639033286455, + "learning_rate": 8.92018779342723e-07, + "loss": 1.495, + "step": 475 + }, + { + "epoch": 0.03, + "grad_norm": 16.260815266203885, + "learning_rate": 9.014084507042254e-07, + "loss": 1.5365, + "step": 480 + }, + { + "epoch": 0.03, + "grad_norm": 18.54236055418119, + "learning_rate": 9.107981220657277e-07, + "loss": 1.5177, + "step": 485 + }, + { + "epoch": 0.03, + "grad_norm": 13.42644574502043, + "learning_rate": 9.201877934272302e-07, + "loss": 1.4785, + "step": 490 + }, + { + "epoch": 0.03, + "grad_norm": 19.272213121132413, + "learning_rate": 9.295774647887325e-07, + "loss": 1.4362, + "step": 495 + }, + { + "epoch": 0.03, + "grad_norm": 15.787703070931476, + "learning_rate": 9.389671361502348e-07, + "loss": 1.4147, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 22.290405211460172, + "learning_rate": 9.483568075117372e-07, + "loss": 1.5199, + "step": 505 + }, + { + "epoch": 0.03, + "grad_norm": 31.16683031656451, + "learning_rate": 9.577464788732395e-07, + "loss": 1.5129, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 13.05810144796746, + "learning_rate": 9.67136150234742e-07, + "loss": 1.4579, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 13.910797176778393, + "learning_rate": 9.765258215962441e-07, + "loss": 1.4825, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 12.828025692937706, + "learning_rate": 9.859154929577465e-07, + "loss": 1.4976, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 16.06711314660652, + "learning_rate": 9.95305164319249e-07, + "loss": 1.5023, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 14.902815752999173, + "learning_rate": 1.0046948356807512e-06, + "loss": 1.4582, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 12.299784889411022, + "learning_rate": 1.0140845070422536e-06, + "loss": 1.4677, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 21.224842088929567, + "learning_rate": 1.023474178403756e-06, + "loss": 1.4832, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 16.16512174398155, + "learning_rate": 1.0328638497652584e-06, + "loss": 1.4232, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 19.124144676501672, + "learning_rate": 1.0422535211267606e-06, + "loss": 1.4937, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 16.53966345408406, + "learning_rate": 1.051643192488263e-06, + "loss": 1.4997, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 16.057068878424882, + "learning_rate": 1.0610328638497655e-06, + "loss": 1.4607, + "step": 565 + }, + { + "epoch": 0.03, + "grad_norm": 15.222469721093784, + "learning_rate": 1.0704225352112677e-06, + "loss": 1.4737, + "step": 570 + }, + { + "epoch": 0.03, + "grad_norm": 14.482584766686639, + "learning_rate": 1.07981220657277e-06, + "loss": 1.4429, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 16.144605621844935, + "learning_rate": 1.0892018779342723e-06, + "loss": 1.4306, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 12.221183173617685, + "learning_rate": 1.098591549295775e-06, + "loss": 1.398, + "step": 585 + }, + { + "epoch": 0.03, + "grad_norm": 12.06786859392066, + "learning_rate": 1.1079812206572771e-06, + "loss": 1.3978, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 13.058932890422534, + "learning_rate": 1.1173708920187793e-06, + "loss": 1.4144, + "step": 595 + }, + { + "epoch": 0.03, + "grad_norm": 13.953379741736752, + "learning_rate": 1.1267605633802817e-06, + "loss": 1.371, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 13.427524454113836, + "learning_rate": 1.1361502347417841e-06, + "loss": 1.418, + "step": 605 + }, + { + "epoch": 0.03, + "grad_norm": 11.92374482063334, + "learning_rate": 1.1455399061032866e-06, + "loss": 1.3953, + "step": 610 + }, + { + "epoch": 0.03, + "grad_norm": 13.121410703430758, + "learning_rate": 1.1549295774647888e-06, + "loss": 1.423, + "step": 615 + }, + { + "epoch": 0.03, + "grad_norm": 14.188058478295792, + "learning_rate": 1.1643192488262912e-06, + "loss": 1.409, + "step": 620 + }, + { + "epoch": 0.04, + "grad_norm": 16.25281743075917, + "learning_rate": 1.1737089201877936e-06, + "loss": 1.3714, + "step": 625 + }, + { + "epoch": 0.04, + "grad_norm": 15.277758266359262, + "learning_rate": 1.1830985915492958e-06, + "loss": 1.4606, + "step": 630 + }, + { + "epoch": 0.04, + "grad_norm": 13.10051728291961, + "learning_rate": 1.1924882629107982e-06, + "loss": 1.3395, + "step": 635 + }, + { + "epoch": 0.04, + "grad_norm": 21.941601579589136, + "learning_rate": 1.2018779342723006e-06, + "loss": 1.4602, + "step": 640 + }, + { + "epoch": 0.04, + "grad_norm": 12.800228963677368, + "learning_rate": 1.211267605633803e-06, + "loss": 1.396, + "step": 645 + }, + { + "epoch": 0.04, + "grad_norm": 17.188498623370233, + "learning_rate": 1.2206572769953053e-06, + "loss": 1.3942, + "step": 650 + }, + { + "epoch": 0.04, + "grad_norm": 14.377090250263134, + "learning_rate": 1.2300469483568075e-06, + "loss": 1.4094, + "step": 655 + }, + { + "epoch": 0.04, + "grad_norm": 28.301310304498266, + "learning_rate": 1.2394366197183099e-06, + "loss": 1.4329, + "step": 660 + }, + { + "epoch": 0.04, + "grad_norm": 25.47871877191197, + "learning_rate": 1.2488262910798123e-06, + "loss": 1.3971, + "step": 665 + }, + { + "epoch": 0.04, + "grad_norm": 25.595306071329194, + "learning_rate": 1.2582159624413145e-06, + "loss": 1.3817, + "step": 670 + }, + { + "epoch": 0.04, + "grad_norm": 12.595530388058712, + "learning_rate": 1.267605633802817e-06, + "loss": 1.3787, + "step": 675 + }, + { + "epoch": 0.04, + "grad_norm": 30.086937759318808, + "learning_rate": 1.2769953051643193e-06, + "loss": 1.3526, + "step": 680 + }, + { + "epoch": 0.04, + "grad_norm": 20.172621056361283, + "learning_rate": 1.2863849765258218e-06, + "loss": 1.4155, + "step": 685 + }, + { + "epoch": 0.04, + "grad_norm": 24.310858020676054, + "learning_rate": 1.295774647887324e-06, + "loss": 1.3586, + "step": 690 + }, + { + "epoch": 0.04, + "grad_norm": 17.095471048461118, + "learning_rate": 1.3051643192488264e-06, + "loss": 1.3316, + "step": 695 + }, + { + "epoch": 0.04, + "grad_norm": 20.345681006867164, + "learning_rate": 1.3145539906103288e-06, + "loss": 1.3864, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 14.472551573600576, + "learning_rate": 1.323943661971831e-06, + "loss": 1.4093, + "step": 705 + }, + { + "epoch": 0.04, + "grad_norm": 16.80859667842158, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.3963, + "step": 710 + }, + { + "epoch": 0.04, + "grad_norm": 13.629469421345675, + "learning_rate": 1.3427230046948358e-06, + "loss": 1.3077, + "step": 715 + }, + { + "epoch": 0.04, + "grad_norm": 28.724843874556054, + "learning_rate": 1.352112676056338e-06, + "loss": 1.3234, + "step": 720 + }, + { + "epoch": 0.04, + "grad_norm": 25.62788509343862, + "learning_rate": 1.3615023474178405e-06, + "loss": 1.3013, + "step": 725 + }, + { + "epoch": 0.04, + "grad_norm": 32.190687772388905, + "learning_rate": 1.3708920187793429e-06, + "loss": 1.3736, + "step": 730 + }, + { + "epoch": 0.04, + "grad_norm": 28.197369021160146, + "learning_rate": 1.3802816901408453e-06, + "loss": 1.3555, + "step": 735 + }, + { + "epoch": 0.04, + "grad_norm": 14.772548510655179, + "learning_rate": 1.3896713615023475e-06, + "loss": 1.2694, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 13.374151647085297, + "learning_rate": 1.39906103286385e-06, + "loss": 1.3748, + "step": 745 + }, + { + "epoch": 0.04, + "grad_norm": 12.473217747365887, + "learning_rate": 1.4084507042253523e-06, + "loss": 1.3575, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 36.7012252204732, + "learning_rate": 1.4178403755868545e-06, + "loss": 1.3715, + "step": 755 + }, + { + "epoch": 0.04, + "grad_norm": 19.154802725792006, + "learning_rate": 1.427230046948357e-06, + "loss": 1.4093, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 16.47266839815217, + "learning_rate": 1.4366197183098594e-06, + "loss": 1.3366, + "step": 765 + }, + { + "epoch": 0.04, + "grad_norm": 12.22464496371909, + "learning_rate": 1.4460093896713618e-06, + "loss": 1.3634, + "step": 770 + }, + { + "epoch": 0.04, + "grad_norm": 14.641809458970616, + "learning_rate": 1.455399061032864e-06, + "loss": 1.3799, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 13.751617149883694, + "learning_rate": 1.4647887323943664e-06, + "loss": 1.3247, + "step": 780 + }, + { + "epoch": 0.04, + "grad_norm": 14.155368277725412, + "learning_rate": 1.4741784037558688e-06, + "loss": 1.3073, + "step": 785 + }, + { + "epoch": 0.04, + "grad_norm": 14.460702773460593, + "learning_rate": 1.4835680751173708e-06, + "loss": 1.3435, + "step": 790 + }, + { + "epoch": 0.04, + "grad_norm": 19.662672711998084, + "learning_rate": 1.4929577464788732e-06, + "loss": 1.3248, + "step": 795 + }, + { + "epoch": 0.05, + "grad_norm": 14.459626455820352, + "learning_rate": 1.5023474178403756e-06, + "loss": 1.3241, + "step": 800 + }, + { + "epoch": 0.05, + "grad_norm": 9.92693756733971, + "learning_rate": 1.5117370892018783e-06, + "loss": 1.2749, + "step": 805 + }, + { + "epoch": 0.05, + "grad_norm": 12.724913694283131, + "learning_rate": 1.5211267605633803e-06, + "loss": 1.3404, + "step": 810 + }, + { + "epoch": 0.05, + "grad_norm": 13.532467582650584, + "learning_rate": 1.5305164319248827e-06, + "loss": 1.3283, + "step": 815 + }, + { + "epoch": 0.05, + "grad_norm": 9.90636117794833, + "learning_rate": 1.539906103286385e-06, + "loss": 1.2561, + "step": 820 + }, + { + "epoch": 0.05, + "grad_norm": 12.765187863382085, + "learning_rate": 1.5492957746478873e-06, + "loss": 1.3248, + "step": 825 + }, + { + "epoch": 0.05, + "grad_norm": 12.974253716946507, + "learning_rate": 1.5586854460093897e-06, + "loss": 1.3316, + "step": 830 + }, + { + "epoch": 0.05, + "grad_norm": 10.230876147011728, + "learning_rate": 1.5680751173708921e-06, + "loss": 1.3496, + "step": 835 + }, + { + "epoch": 0.05, + "grad_norm": 11.653346482896024, + "learning_rate": 1.5774647887323946e-06, + "loss": 1.3542, + "step": 840 + }, + { + "epoch": 0.05, + "grad_norm": 13.288384189626518, + "learning_rate": 1.5868544600938968e-06, + "loss": 1.3754, + "step": 845 + }, + { + "epoch": 0.05, + "grad_norm": 12.595063908973746, + "learning_rate": 1.5962441314553992e-06, + "loss": 1.327, + "step": 850 + }, + { + "epoch": 0.05, + "grad_norm": 11.707725833904247, + "learning_rate": 1.6056338028169016e-06, + "loss": 1.3276, + "step": 855 + }, + { + "epoch": 0.05, + "grad_norm": 12.575003230283633, + "learning_rate": 1.6150234741784038e-06, + "loss": 1.375, + "step": 860 + }, + { + "epoch": 0.05, + "grad_norm": 11.133078679937455, + "learning_rate": 1.6244131455399062e-06, + "loss": 1.2794, + "step": 865 + }, + { + "epoch": 0.05, + "grad_norm": 13.396732713028415, + "learning_rate": 1.6338028169014086e-06, + "loss": 1.2786, + "step": 870 + }, + { + "epoch": 0.05, + "grad_norm": 19.0038238204167, + "learning_rate": 1.643192488262911e-06, + "loss": 1.3261, + "step": 875 + }, + { + "epoch": 0.05, + "grad_norm": 10.006940964261577, + "learning_rate": 1.6525821596244132e-06, + "loss": 1.2926, + "step": 880 + }, + { + "epoch": 0.05, + "grad_norm": 10.93581106394731, + "learning_rate": 1.6619718309859157e-06, + "loss": 1.282, + "step": 885 + }, + { + "epoch": 0.05, + "grad_norm": 12.65592428076664, + "learning_rate": 1.671361502347418e-06, + "loss": 1.2591, + "step": 890 + }, + { + "epoch": 0.05, + "grad_norm": 15.315244923529148, + "learning_rate": 1.6807511737089203e-06, + "loss": 1.4264, + "step": 895 + }, + { + "epoch": 0.05, + "grad_norm": 15.74543685655779, + "learning_rate": 1.6901408450704227e-06, + "loss": 1.283, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 12.21183412560726, + "learning_rate": 1.6995305164319251e-06, + "loss": 1.3377, + "step": 905 + }, + { + "epoch": 0.05, + "grad_norm": 14.587485547338511, + "learning_rate": 1.7089201877934275e-06, + "loss": 1.3355, + "step": 910 + }, + { + "epoch": 0.05, + "grad_norm": 14.570554898369355, + "learning_rate": 1.7183098591549297e-06, + "loss": 1.3419, + "step": 915 + }, + { + "epoch": 0.05, + "grad_norm": 18.080005496329896, + "learning_rate": 1.7276995305164322e-06, + "loss": 1.3719, + "step": 920 + }, + { + "epoch": 0.05, + "grad_norm": 12.601733886689647, + "learning_rate": 1.7370892018779346e-06, + "loss": 1.3305, + "step": 925 + }, + { + "epoch": 0.05, + "grad_norm": 19.182001359469993, + "learning_rate": 1.7464788732394366e-06, + "loss": 1.3133, + "step": 930 + }, + { + "epoch": 0.05, + "grad_norm": 12.654297993584537, + "learning_rate": 1.755868544600939e-06, + "loss": 1.2799, + "step": 935 + }, + { + "epoch": 0.05, + "grad_norm": 15.257829805726358, + "learning_rate": 1.7652582159624416e-06, + "loss": 1.2324, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 13.585485409291497, + "learning_rate": 1.774647887323944e-06, + "loss": 1.3184, + "step": 945 + }, + { + "epoch": 0.05, + "grad_norm": 11.159592380113939, + "learning_rate": 1.784037558685446e-06, + "loss": 1.2731, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 12.98072436891059, + "learning_rate": 1.7934272300469484e-06, + "loss": 1.3129, + "step": 955 + }, + { + "epoch": 0.05, + "grad_norm": 14.334950203134358, + "learning_rate": 1.8028169014084509e-06, + "loss": 1.3025, + "step": 960 + }, + { + "epoch": 0.05, + "grad_norm": 8.831771087571925, + "learning_rate": 1.812206572769953e-06, + "loss": 1.2471, + "step": 965 + }, + { + "epoch": 0.05, + "grad_norm": 25.594671065609298, + "learning_rate": 1.8215962441314555e-06, + "loss": 1.3591, + "step": 970 + }, + { + "epoch": 0.05, + "grad_norm": 14.09598181520699, + "learning_rate": 1.8309859154929579e-06, + "loss": 1.2491, + "step": 975 + }, + { + "epoch": 0.06, + "grad_norm": 12.079393096266388, + "learning_rate": 1.8403755868544603e-06, + "loss": 1.3326, + "step": 980 + }, + { + "epoch": 0.06, + "grad_norm": 10.672109157664112, + "learning_rate": 1.8497652582159625e-06, + "loss": 1.2313, + "step": 985 + }, + { + "epoch": 0.06, + "grad_norm": 12.243721125514357, + "learning_rate": 1.859154929577465e-06, + "loss": 1.3036, + "step": 990 + }, + { + "epoch": 0.06, + "grad_norm": 13.324736332255661, + "learning_rate": 1.8685446009389673e-06, + "loss": 1.3095, + "step": 995 + }, + { + "epoch": 0.06, + "grad_norm": 16.27952026335669, + "learning_rate": 1.8779342723004696e-06, + "loss": 1.2832, + "step": 1000 + }, + { + "epoch": 0.06, + "grad_norm": 12.979541899673272, + "learning_rate": 1.887323943661972e-06, + "loss": 1.3264, + "step": 1005 + }, + { + "epoch": 0.06, + "grad_norm": 13.22324867764637, + "learning_rate": 1.8967136150234744e-06, + "loss": 1.2964, + "step": 1010 + }, + { + "epoch": 0.06, + "grad_norm": 20.862442145452597, + "learning_rate": 1.9061032863849766e-06, + "loss": 1.2276, + "step": 1015 + }, + { + "epoch": 0.06, + "grad_norm": 13.56689153802307, + "learning_rate": 1.915492957746479e-06, + "loss": 1.2277, + "step": 1020 + }, + { + "epoch": 0.06, + "grad_norm": 15.59196128567854, + "learning_rate": 1.9248826291079814e-06, + "loss": 1.27, + "step": 1025 + }, + { + "epoch": 0.06, + "grad_norm": 11.299128498391946, + "learning_rate": 1.934272300469484e-06, + "loss": 1.3295, + "step": 1030 + }, + { + "epoch": 0.06, + "grad_norm": 19.20043989853521, + "learning_rate": 1.943661971830986e-06, + "loss": 1.3343, + "step": 1035 + }, + { + "epoch": 0.06, + "grad_norm": 14.59109623708328, + "learning_rate": 1.9530516431924883e-06, + "loss": 1.3281, + "step": 1040 + }, + { + "epoch": 0.06, + "grad_norm": 13.343332788368398, + "learning_rate": 1.9624413145539907e-06, + "loss": 1.339, + "step": 1045 + }, + { + "epoch": 0.06, + "grad_norm": 34.38348243471212, + "learning_rate": 1.971830985915493e-06, + "loss": 1.2953, + "step": 1050 + }, + { + "epoch": 0.06, + "grad_norm": 57.553178005750915, + "learning_rate": 1.9812206572769955e-06, + "loss": 1.2797, + "step": 1055 + }, + { + "epoch": 0.06, + "grad_norm": 11.395373329350186, + "learning_rate": 1.990610328638498e-06, + "loss": 1.2543, + "step": 1060 + }, + { + "epoch": 0.06, + "grad_norm": 12.18711726177598, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.2292, + "step": 1065 + }, + { + "epoch": 0.06, + "grad_norm": 18.898228878702902, + "learning_rate": 2.0093896713615023e-06, + "loss": 1.3609, + "step": 1070 + }, + { + "epoch": 0.06, + "grad_norm": 11.501589514924925, + "learning_rate": 2.0187793427230047e-06, + "loss": 1.2875, + "step": 1075 + }, + { + "epoch": 0.06, + "grad_norm": 9.570857168438987, + "learning_rate": 2.028169014084507e-06, + "loss": 1.2705, + "step": 1080 + }, + { + "epoch": 0.06, + "grad_norm": 13.011759356869026, + "learning_rate": 2.0375586854460096e-06, + "loss": 1.2501, + "step": 1085 + }, + { + "epoch": 0.06, + "grad_norm": 11.36984363387079, + "learning_rate": 2.046948356807512e-06, + "loss": 1.2802, + "step": 1090 + }, + { + "epoch": 0.06, + "grad_norm": 11.573554484180887, + "learning_rate": 2.0563380281690144e-06, + "loss": 1.2299, + "step": 1095 + }, + { + "epoch": 0.06, + "grad_norm": 12.346613101864381, + "learning_rate": 2.065727699530517e-06, + "loss": 1.278, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 12.673556476995792, + "learning_rate": 2.075117370892019e-06, + "loss": 1.2857, + "step": 1105 + }, + { + "epoch": 0.06, + "grad_norm": 12.743903049798025, + "learning_rate": 2.0845070422535212e-06, + "loss": 1.2731, + "step": 1110 + }, + { + "epoch": 0.06, + "grad_norm": 10.095242668024651, + "learning_rate": 2.0938967136150237e-06, + "loss": 1.3167, + "step": 1115 + }, + { + "epoch": 0.06, + "grad_norm": 10.525888405537948, + "learning_rate": 2.103286384976526e-06, + "loss": 1.2914, + "step": 1120 + }, + { + "epoch": 0.06, + "grad_norm": 9.939198161893481, + "learning_rate": 2.1126760563380285e-06, + "loss": 1.2053, + "step": 1125 + }, + { + "epoch": 0.06, + "grad_norm": 11.462157973981705, + "learning_rate": 2.122065727699531e-06, + "loss": 1.2672, + "step": 1130 + }, + { + "epoch": 0.06, + "grad_norm": 17.316075393990364, + "learning_rate": 2.1314553990610333e-06, + "loss": 1.2912, + "step": 1135 + }, + { + "epoch": 0.06, + "grad_norm": 10.059605944650713, + "learning_rate": 2.1408450704225353e-06, + "loss": 1.2916, + "step": 1140 + }, + { + "epoch": 0.06, + "grad_norm": 13.315508373754005, + "learning_rate": 2.1502347417840377e-06, + "loss": 1.2437, + "step": 1145 + }, + { + "epoch": 0.06, + "grad_norm": 10.260117294160276, + "learning_rate": 2.15962441314554e-06, + "loss": 1.2363, + "step": 1150 + }, + { + "epoch": 0.07, + "grad_norm": 15.201836066155025, + "learning_rate": 2.169014084507042e-06, + "loss": 1.3131, + "step": 1155 + }, + { + "epoch": 0.07, + "grad_norm": 14.948713842229541, + "learning_rate": 2.1784037558685446e-06, + "loss": 1.277, + "step": 1160 + }, + { + "epoch": 0.07, + "grad_norm": 32.52981277229426, + "learning_rate": 2.187793427230047e-06, + "loss": 1.3297, + "step": 1165 + }, + { + "epoch": 0.07, + "grad_norm": 28.419378073727167, + "learning_rate": 2.19718309859155e-06, + "loss": 1.3176, + "step": 1170 + }, + { + "epoch": 0.07, + "grad_norm": 36.13582450376201, + "learning_rate": 2.206572769953052e-06, + "loss": 1.1653, + "step": 1175 + }, + { + "epoch": 0.07, + "grad_norm": 62.500714210346636, + "learning_rate": 2.2159624413145542e-06, + "loss": 1.2826, + "step": 1180 + }, + { + "epoch": 0.07, + "grad_norm": 61.90731014306774, + "learning_rate": 2.2253521126760566e-06, + "loss": 1.3107, + "step": 1185 + }, + { + "epoch": 0.07, + "grad_norm": 39.50365457516419, + "learning_rate": 2.2347417840375586e-06, + "loss": 1.2295, + "step": 1190 + }, + { + "epoch": 0.07, + "grad_norm": 35.84862984982831, + "learning_rate": 2.244131455399061e-06, + "loss": 1.2772, + "step": 1195 + }, + { + "epoch": 0.07, + "grad_norm": 17.65318332059801, + "learning_rate": 2.2535211267605635e-06, + "loss": 1.3234, + "step": 1200 + }, + { + "epoch": 0.07, + "grad_norm": 13.635645594041737, + "learning_rate": 2.262910798122066e-06, + "loss": 1.2615, + "step": 1205 + }, + { + "epoch": 0.07, + "grad_norm": 15.498192745981429, + "learning_rate": 2.2723004694835683e-06, + "loss": 1.3146, + "step": 1210 + }, + { + "epoch": 0.07, + "grad_norm": 10.401059501962532, + "learning_rate": 2.2816901408450707e-06, + "loss": 1.2705, + "step": 1215 + }, + { + "epoch": 0.07, + "grad_norm": 11.747909461352078, + "learning_rate": 2.291079812206573e-06, + "loss": 1.3012, + "step": 1220 + }, + { + "epoch": 0.07, + "grad_norm": 11.72255084965526, + "learning_rate": 2.300469483568075e-06, + "loss": 1.2489, + "step": 1225 + }, + { + "epoch": 0.07, + "grad_norm": 17.31235925255563, + "learning_rate": 2.3098591549295775e-06, + "loss": 1.2716, + "step": 1230 + }, + { + "epoch": 0.07, + "grad_norm": 15.9954248660628, + "learning_rate": 2.31924882629108e-06, + "loss": 1.2991, + "step": 1235 + }, + { + "epoch": 0.07, + "grad_norm": 13.42949763678413, + "learning_rate": 2.3286384976525824e-06, + "loss": 1.2765, + "step": 1240 + }, + { + "epoch": 0.07, + "grad_norm": 23.061281608940302, + "learning_rate": 2.338028169014085e-06, + "loss": 1.2494, + "step": 1245 + }, + { + "epoch": 0.07, + "grad_norm": 11.734814565335228, + "learning_rate": 2.347417840375587e-06, + "loss": 1.2396, + "step": 1250 + }, + { + "epoch": 0.07, + "grad_norm": 11.540481170544485, + "learning_rate": 2.3568075117370896e-06, + "loss": 1.2321, + "step": 1255 + }, + { + "epoch": 0.07, + "grad_norm": 14.028057084580603, + "learning_rate": 2.3661971830985916e-06, + "loss": 1.2415, + "step": 1260 + }, + { + "epoch": 0.07, + "grad_norm": 12.043226858203862, + "learning_rate": 2.375586854460094e-06, + "loss": 1.226, + "step": 1265 + }, + { + "epoch": 0.07, + "grad_norm": 12.477510290358993, + "learning_rate": 2.3849765258215964e-06, + "loss": 1.2521, + "step": 1270 + }, + { + "epoch": 0.07, + "grad_norm": 12.256647846745311, + "learning_rate": 2.3943661971830984e-06, + "loss": 1.2849, + "step": 1275 + }, + { + "epoch": 0.07, + "grad_norm": 12.742208231148163, + "learning_rate": 2.4037558685446013e-06, + "loss": 1.1871, + "step": 1280 + }, + { + "epoch": 0.07, + "grad_norm": 18.927219685267502, + "learning_rate": 2.4131455399061037e-06, + "loss": 1.2489, + "step": 1285 + }, + { + "epoch": 0.07, + "grad_norm": 9.756776186947189, + "learning_rate": 2.422535211267606e-06, + "loss": 1.2231, + "step": 1290 + }, + { + "epoch": 0.07, + "grad_norm": 10.616992125356624, + "learning_rate": 2.431924882629108e-06, + "loss": 1.2557, + "step": 1295 + }, + { + "epoch": 0.07, + "grad_norm": 9.701257195294193, + "learning_rate": 2.4413145539906105e-06, + "loss": 1.2434, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 14.699896318555654, + "learning_rate": 2.450704225352113e-06, + "loss": 1.2262, + "step": 1305 + }, + { + "epoch": 0.07, + "grad_norm": 18.530133028071965, + "learning_rate": 2.460093896713615e-06, + "loss": 1.3151, + "step": 1310 + }, + { + "epoch": 0.07, + "grad_norm": 11.84474791051373, + "learning_rate": 2.4694835680751174e-06, + "loss": 1.269, + "step": 1315 + }, + { + "epoch": 0.07, + "grad_norm": 36.51089454980534, + "learning_rate": 2.4788732394366198e-06, + "loss": 1.2982, + "step": 1320 + }, + { + "epoch": 0.07, + "grad_norm": 10.941829354051336, + "learning_rate": 2.488262910798122e-06, + "loss": 1.2912, + "step": 1325 + }, + { + "epoch": 0.07, + "grad_norm": 24.72393459100198, + "learning_rate": 2.4976525821596246e-06, + "loss": 1.3132, + "step": 1330 + }, + { + "epoch": 0.08, + "grad_norm": 14.974596031712748, + "learning_rate": 2.507042253521127e-06, + "loss": 1.2502, + "step": 1335 + }, + { + "epoch": 0.08, + "grad_norm": 13.752314054915557, + "learning_rate": 2.516431924882629e-06, + "loss": 1.2283, + "step": 1340 + }, + { + "epoch": 0.08, + "grad_norm": 14.543313397681446, + "learning_rate": 2.525821596244132e-06, + "loss": 1.2551, + "step": 1345 + }, + { + "epoch": 0.08, + "grad_norm": 21.87340019920864, + "learning_rate": 2.535211267605634e-06, + "loss": 1.1885, + "step": 1350 + }, + { + "epoch": 0.08, + "grad_norm": 21.08675819665147, + "learning_rate": 2.5446009389671363e-06, + "loss": 1.2108, + "step": 1355 + }, + { + "epoch": 0.08, + "grad_norm": 28.13883185096642, + "learning_rate": 2.5539906103286387e-06, + "loss": 1.2596, + "step": 1360 + }, + { + "epoch": 0.08, + "grad_norm": 40.189244786733674, + "learning_rate": 2.563380281690141e-06, + "loss": 1.3154, + "step": 1365 + }, + { + "epoch": 0.08, + "grad_norm": 42.33542568705795, + "learning_rate": 2.5727699530516435e-06, + "loss": 1.2612, + "step": 1370 + }, + { + "epoch": 0.08, + "grad_norm": 30.275801063302588, + "learning_rate": 2.582159624413146e-06, + "loss": 1.2936, + "step": 1375 + }, + { + "epoch": 0.08, + "grad_norm": 20.162098963082585, + "learning_rate": 2.591549295774648e-06, + "loss": 1.2896, + "step": 1380 + }, + { + "epoch": 0.08, + "grad_norm": 16.498957249092086, + "learning_rate": 2.6009389671361508e-06, + "loss": 1.2507, + "step": 1385 + }, + { + "epoch": 0.08, + "grad_norm": 10.639486264516998, + "learning_rate": 2.6103286384976528e-06, + "loss": 1.2604, + "step": 1390 + }, + { + "epoch": 0.08, + "grad_norm": 29.8483209419568, + "learning_rate": 2.619718309859155e-06, + "loss": 1.2366, + "step": 1395 + }, + { + "epoch": 0.08, + "grad_norm": 23.394341391555916, + "learning_rate": 2.6291079812206576e-06, + "loss": 1.2213, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 12.344509034756229, + "learning_rate": 2.63849765258216e-06, + "loss": 1.2865, + "step": 1405 + }, + { + "epoch": 0.08, + "grad_norm": 11.299775911148041, + "learning_rate": 2.647887323943662e-06, + "loss": 1.1872, + "step": 1410 + }, + { + "epoch": 0.08, + "grad_norm": 19.95285873093074, + "learning_rate": 2.657276995305165e-06, + "loss": 1.3064, + "step": 1415 + }, + { + "epoch": 0.08, + "grad_norm": 28.29931505654469, + "learning_rate": 2.666666666666667e-06, + "loss": 1.2422, + "step": 1420 + }, + { + "epoch": 0.08, + "grad_norm": 22.03172510028025, + "learning_rate": 2.676056338028169e-06, + "loss": 1.2986, + "step": 1425 + }, + { + "epoch": 0.08, + "grad_norm": 9.757275809832382, + "learning_rate": 2.6854460093896717e-06, + "loss": 1.1891, + "step": 1430 + }, + { + "epoch": 0.08, + "grad_norm": 11.522088430581046, + "learning_rate": 2.6948356807511737e-06, + "loss": 1.29, + "step": 1435 + }, + { + "epoch": 0.08, + "grad_norm": 56.04452497266458, + "learning_rate": 2.704225352112676e-06, + "loss": 1.2318, + "step": 1440 + }, + { + "epoch": 0.08, + "grad_norm": 12.328382728439792, + "learning_rate": 2.713615023474179e-06, + "loss": 1.2388, + "step": 1445 + }, + { + "epoch": 0.08, + "grad_norm": 27.67864633622398, + "learning_rate": 2.723004694835681e-06, + "loss": 1.2536, + "step": 1450 + }, + { + "epoch": 0.08, + "grad_norm": 20.618348374946798, + "learning_rate": 2.7323943661971837e-06, + "loss": 1.2223, + "step": 1455 + }, + { + "epoch": 0.08, + "grad_norm": 34.38731255019953, + "learning_rate": 2.7417840375586857e-06, + "loss": 1.2435, + "step": 1460 + }, + { + "epoch": 0.08, + "grad_norm": 12.200577896742542, + "learning_rate": 2.7511737089201877e-06, + "loss": 1.2095, + "step": 1465 + }, + { + "epoch": 0.08, + "grad_norm": 11.6577772988377, + "learning_rate": 2.7605633802816906e-06, + "loss": 1.282, + "step": 1470 + }, + { + "epoch": 0.08, + "grad_norm": 14.44746895708401, + "learning_rate": 2.7699530516431926e-06, + "loss": 1.2528, + "step": 1475 + }, + { + "epoch": 0.08, + "grad_norm": 19.58629354717819, + "learning_rate": 2.779342723004695e-06, + "loss": 1.2114, + "step": 1480 + }, + { + "epoch": 0.08, + "grad_norm": 14.911014872106671, + "learning_rate": 2.7887323943661974e-06, + "loss": 1.1756, + "step": 1485 + }, + { + "epoch": 0.08, + "grad_norm": 13.241297502880386, + "learning_rate": 2.7981220657277e-06, + "loss": 1.2957, + "step": 1490 + }, + { + "epoch": 0.08, + "grad_norm": 17.997669865289506, + "learning_rate": 2.807511737089202e-06, + "loss": 1.2558, + "step": 1495 + }, + { + "epoch": 0.08, + "grad_norm": 23.127084753806834, + "learning_rate": 2.8169014084507046e-06, + "loss": 1.2417, + "step": 1500 + }, + { + "epoch": 0.08, + "grad_norm": 14.97961856498744, + "learning_rate": 2.8262910798122066e-06, + "loss": 1.1799, + "step": 1505 + }, + { + "epoch": 0.09, + "grad_norm": 18.92322598523992, + "learning_rate": 2.835680751173709e-06, + "loss": 1.3069, + "step": 1510 + }, + { + "epoch": 0.09, + "grad_norm": 12.178134914863689, + "learning_rate": 2.8450704225352115e-06, + "loss": 1.2267, + "step": 1515 + }, + { + "epoch": 0.09, + "grad_norm": 11.73418912513778, + "learning_rate": 2.854460093896714e-06, + "loss": 1.1838, + "step": 1520 + }, + { + "epoch": 0.09, + "grad_norm": 17.57569030789101, + "learning_rate": 2.8638497652582163e-06, + "loss": 1.2699, + "step": 1525 + }, + { + "epoch": 0.09, + "grad_norm": 20.103205074818657, + "learning_rate": 2.8732394366197187e-06, + "loss": 1.2513, + "step": 1530 + }, + { + "epoch": 0.09, + "grad_norm": 13.74796858229263, + "learning_rate": 2.8826291079812207e-06, + "loss": 1.2233, + "step": 1535 + }, + { + "epoch": 0.09, + "grad_norm": 12.125121220128353, + "learning_rate": 2.8920187793427236e-06, + "loss": 1.2512, + "step": 1540 + }, + { + "epoch": 0.09, + "grad_norm": 17.923327802812725, + "learning_rate": 2.9014084507042255e-06, + "loss": 1.2217, + "step": 1545 + }, + { + "epoch": 0.09, + "grad_norm": 29.864230330956797, + "learning_rate": 2.910798122065728e-06, + "loss": 1.2279, + "step": 1550 + }, + { + "epoch": 0.09, + "grad_norm": 18.32678784754926, + "learning_rate": 2.9201877934272304e-06, + "loss": 1.2437, + "step": 1555 + }, + { + "epoch": 0.09, + "grad_norm": 17.743960758383423, + "learning_rate": 2.929577464788733e-06, + "loss": 1.2021, + "step": 1560 + }, + { + "epoch": 0.09, + "grad_norm": 11.866384456015291, + "learning_rate": 2.938967136150235e-06, + "loss": 1.1487, + "step": 1565 + }, + { + "epoch": 0.09, + "grad_norm": 12.098250571877362, + "learning_rate": 2.9483568075117376e-06, + "loss": 1.1885, + "step": 1570 + }, + { + "epoch": 0.09, + "grad_norm": 10.259031939504771, + "learning_rate": 2.9577464788732396e-06, + "loss": 1.2476, + "step": 1575 + }, + { + "epoch": 0.09, + "grad_norm": 13.472449698302071, + "learning_rate": 2.9671361502347416e-06, + "loss": 1.2519, + "step": 1580 + }, + { + "epoch": 0.09, + "grad_norm": 17.30621641804569, + "learning_rate": 2.9765258215962445e-06, + "loss": 1.2108, + "step": 1585 + }, + { + "epoch": 0.09, + "grad_norm": 18.804495414027024, + "learning_rate": 2.9859154929577465e-06, + "loss": 1.2356, + "step": 1590 + }, + { + "epoch": 0.09, + "grad_norm": 79.8771308486058, + "learning_rate": 2.9953051643192493e-06, + "loss": 1.2927, + "step": 1595 + }, + { + "epoch": 0.09, + "grad_norm": 36.98912216957134, + "learning_rate": 3.0046948356807513e-06, + "loss": 1.2825, + "step": 1600 + }, + { + "epoch": 0.09, + "grad_norm": 24.22949511816613, + "learning_rate": 3.0140845070422537e-06, + "loss": 1.2393, + "step": 1605 + }, + { + "epoch": 0.09, + "grad_norm": 43.48801816298305, + "learning_rate": 3.0234741784037565e-06, + "loss": 1.2168, + "step": 1610 + }, + { + "epoch": 0.09, + "grad_norm": 69.02114420972799, + "learning_rate": 3.0328638497652585e-06, + "loss": 1.2701, + "step": 1615 + }, + { + "epoch": 0.09, + "grad_norm": 51.13066642076986, + "learning_rate": 3.0422535211267605e-06, + "loss": 1.1947, + "step": 1620 + }, + { + "epoch": 0.09, + "grad_norm": 41.12245034419979, + "learning_rate": 3.0516431924882634e-06, + "loss": 1.2376, + "step": 1625 + }, + { + "epoch": 0.09, + "grad_norm": 11.919946134065007, + "learning_rate": 3.0610328638497654e-06, + "loss": 1.2236, + "step": 1630 + }, + { + "epoch": 0.09, + "grad_norm": 9.287405241698428, + "learning_rate": 3.0704225352112678e-06, + "loss": 1.1929, + "step": 1635 + }, + { + "epoch": 0.09, + "grad_norm": 11.292933429820003, + "learning_rate": 3.07981220657277e-06, + "loss": 1.2267, + "step": 1640 + }, + { + "epoch": 0.09, + "grad_norm": 21.71127998573997, + "learning_rate": 3.0892018779342726e-06, + "loss": 1.2475, + "step": 1645 + }, + { + "epoch": 0.09, + "grad_norm": 31.074374172436762, + "learning_rate": 3.0985915492957746e-06, + "loss": 1.2199, + "step": 1650 + }, + { + "epoch": 0.09, + "grad_norm": 18.182189207241336, + "learning_rate": 3.1079812206572774e-06, + "loss": 1.1956, + "step": 1655 + }, + { + "epoch": 0.09, + "grad_norm": 20.695901024833002, + "learning_rate": 3.1173708920187794e-06, + "loss": 1.265, + "step": 1660 + }, + { + "epoch": 0.09, + "grad_norm": 23.652515760504574, + "learning_rate": 3.1267605633802823e-06, + "loss": 1.2191, + "step": 1665 + }, + { + "epoch": 0.09, + "grad_norm": 9.510446962166238, + "learning_rate": 3.1361502347417843e-06, + "loss": 1.2269, + "step": 1670 + }, + { + "epoch": 0.09, + "grad_norm": 9.55534207393913, + "learning_rate": 3.1455399061032867e-06, + "loss": 1.1944, + "step": 1675 + }, + { + "epoch": 0.09, + "grad_norm": 11.834353219589225, + "learning_rate": 3.154929577464789e-06, + "loss": 1.2206, + "step": 1680 + }, + { + "epoch": 0.09, + "grad_norm": 9.347622734688182, + "learning_rate": 3.1643192488262915e-06, + "loss": 1.2037, + "step": 1685 + }, + { + "epoch": 0.1, + "grad_norm": 11.082052478565581, + "learning_rate": 3.1737089201877935e-06, + "loss": 1.214, + "step": 1690 + }, + { + "epoch": 0.1, + "grad_norm": 12.235478472594236, + "learning_rate": 3.1830985915492964e-06, + "loss": 1.2618, + "step": 1695 + }, + { + "epoch": 0.1, + "grad_norm": 8.51370136559126, + "learning_rate": 3.1924882629107983e-06, + "loss": 1.1912, + "step": 1700 + }, + { + "epoch": 0.1, + "grad_norm": 10.518871002073665, + "learning_rate": 3.2018779342723003e-06, + "loss": 1.2252, + "step": 1705 + }, + { + "epoch": 0.1, + "grad_norm": 12.183774407336283, + "learning_rate": 3.211267605633803e-06, + "loss": 1.1821, + "step": 1710 + }, + { + "epoch": 0.1, + "grad_norm": 20.953755616187795, + "learning_rate": 3.2206572769953056e-06, + "loss": 1.178, + "step": 1715 + }, + { + "epoch": 0.1, + "grad_norm": 10.16947579342062, + "learning_rate": 3.2300469483568076e-06, + "loss": 1.2226, + "step": 1720 + }, + { + "epoch": 0.1, + "grad_norm": 10.102934787903266, + "learning_rate": 3.2394366197183104e-06, + "loss": 1.1678, + "step": 1725 + }, + { + "epoch": 0.1, + "grad_norm": 12.20945662610384, + "learning_rate": 3.2488262910798124e-06, + "loss": 1.2281, + "step": 1730 + }, + { + "epoch": 0.1, + "grad_norm": 15.459902129059378, + "learning_rate": 3.2582159624413144e-06, + "loss": 1.2358, + "step": 1735 + }, + { + "epoch": 0.1, + "grad_norm": 15.087525019755466, + "learning_rate": 3.2676056338028173e-06, + "loss": 1.2747, + "step": 1740 + }, + { + "epoch": 0.1, + "grad_norm": 10.388260393020143, + "learning_rate": 3.2769953051643192e-06, + "loss": 1.1767, + "step": 1745 + }, + { + "epoch": 0.1, + "grad_norm": 10.354232491977962, + "learning_rate": 3.286384976525822e-06, + "loss": 1.2726, + "step": 1750 + }, + { + "epoch": 0.1, + "grad_norm": 13.134829932865085, + "learning_rate": 3.295774647887324e-06, + "loss": 1.2588, + "step": 1755 + }, + { + "epoch": 0.1, + "grad_norm": 9.99587325617737, + "learning_rate": 3.3051643192488265e-06, + "loss": 1.2264, + "step": 1760 + }, + { + "epoch": 0.1, + "grad_norm": 13.734943211600395, + "learning_rate": 3.314553990610329e-06, + "loss": 1.256, + "step": 1765 + }, + { + "epoch": 0.1, + "grad_norm": 13.34217967087882, + "learning_rate": 3.3239436619718313e-06, + "loss": 1.2513, + "step": 1770 + }, + { + "epoch": 0.1, + "grad_norm": 9.042606148479086, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.1657, + "step": 1775 + }, + { + "epoch": 0.1, + "grad_norm": 12.693454161873957, + "learning_rate": 3.342723004694836e-06, + "loss": 1.2002, + "step": 1780 + }, + { + "epoch": 0.1, + "grad_norm": 19.002447138761273, + "learning_rate": 3.352112676056338e-06, + "loss": 1.2263, + "step": 1785 + }, + { + "epoch": 0.1, + "grad_norm": 11.60649729452257, + "learning_rate": 3.3615023474178406e-06, + "loss": 1.1948, + "step": 1790 + }, + { + "epoch": 0.1, + "grad_norm": 14.752199113163702, + "learning_rate": 3.370892018779343e-06, + "loss": 1.225, + "step": 1795 + }, + { + "epoch": 0.1, + "grad_norm": 13.912569900789103, + "learning_rate": 3.3802816901408454e-06, + "loss": 1.1964, + "step": 1800 + }, + { + "epoch": 0.1, + "grad_norm": 28.776006217385422, + "learning_rate": 3.3896713615023474e-06, + "loss": 1.2135, + "step": 1805 + }, + { + "epoch": 0.1, + "grad_norm": 10.445070304376742, + "learning_rate": 3.3990610328638502e-06, + "loss": 1.1712, + "step": 1810 + }, + { + "epoch": 0.1, + "grad_norm": 21.66514653599388, + "learning_rate": 3.4084507042253522e-06, + "loss": 1.2671, + "step": 1815 + }, + { + "epoch": 0.1, + "grad_norm": 14.12605318731465, + "learning_rate": 3.417840375586855e-06, + "loss": 1.2047, + "step": 1820 + }, + { + "epoch": 0.1, + "grad_norm": 22.66942288865685, + "learning_rate": 3.427230046948357e-06, + "loss": 1.1977, + "step": 1825 + }, + { + "epoch": 0.1, + "grad_norm": 28.517348201476594, + "learning_rate": 3.4366197183098595e-06, + "loss": 1.2314, + "step": 1830 + }, + { + "epoch": 0.1, + "grad_norm": 13.236614878342387, + "learning_rate": 3.446009389671362e-06, + "loss": 1.2479, + "step": 1835 + }, + { + "epoch": 0.1, + "grad_norm": 16.58188772698804, + "learning_rate": 3.4553990610328643e-06, + "loss": 1.1897, + "step": 1840 + }, + { + "epoch": 0.1, + "grad_norm": 9.450810792750605, + "learning_rate": 3.4647887323943663e-06, + "loss": 1.2657, + "step": 1845 + }, + { + "epoch": 0.1, + "grad_norm": 18.328144068914792, + "learning_rate": 3.474178403755869e-06, + "loss": 1.1788, + "step": 1850 + }, + { + "epoch": 0.1, + "grad_norm": 10.646147429445598, + "learning_rate": 3.483568075117371e-06, + "loss": 1.1874, + "step": 1855 + }, + { + "epoch": 0.1, + "grad_norm": 17.046538853977292, + "learning_rate": 3.492957746478873e-06, + "loss": 1.1592, + "step": 1860 + }, + { + "epoch": 0.11, + "grad_norm": 11.704241336750403, + "learning_rate": 3.502347417840376e-06, + "loss": 1.1885, + "step": 1865 + }, + { + "epoch": 0.11, + "grad_norm": 9.4346558829419, + "learning_rate": 3.511737089201878e-06, + "loss": 1.2097, + "step": 1870 + }, + { + "epoch": 0.11, + "grad_norm": 9.397004458626839, + "learning_rate": 3.5211267605633804e-06, + "loss": 1.168, + "step": 1875 + }, + { + "epoch": 0.11, + "grad_norm": 10.039403557104384, + "learning_rate": 3.5305164319248832e-06, + "loss": 1.2138, + "step": 1880 + }, + { + "epoch": 0.11, + "grad_norm": 13.033316856532485, + "learning_rate": 3.5399061032863852e-06, + "loss": 1.2104, + "step": 1885 + }, + { + "epoch": 0.11, + "grad_norm": 11.581742073717887, + "learning_rate": 3.549295774647888e-06, + "loss": 1.1657, + "step": 1890 + }, + { + "epoch": 0.11, + "grad_norm": 9.193936813759473, + "learning_rate": 3.55868544600939e-06, + "loss": 1.1598, + "step": 1895 + }, + { + "epoch": 0.11, + "grad_norm": 20.783341696812734, + "learning_rate": 3.568075117370892e-06, + "loss": 1.1809, + "step": 1900 + }, + { + "epoch": 0.11, + "grad_norm": 18.53705773501876, + "learning_rate": 3.577464788732395e-06, + "loss": 1.2116, + "step": 1905 + }, + { + "epoch": 0.11, + "grad_norm": 32.46653112162618, + "learning_rate": 3.586854460093897e-06, + "loss": 1.1735, + "step": 1910 + }, + { + "epoch": 0.11, + "grad_norm": 20.29347797370921, + "learning_rate": 3.5962441314553993e-06, + "loss": 1.1923, + "step": 1915 + }, + { + "epoch": 0.11, + "grad_norm": 49.30618204917337, + "learning_rate": 3.6056338028169017e-06, + "loss": 1.216, + "step": 1920 + }, + { + "epoch": 0.11, + "grad_norm": 19.760430970051083, + "learning_rate": 3.615023474178404e-06, + "loss": 1.2522, + "step": 1925 + }, + { + "epoch": 0.11, + "grad_norm": 18.109595738192525, + "learning_rate": 3.624413145539906e-06, + "loss": 1.1739, + "step": 1930 + }, + { + "epoch": 0.11, + "grad_norm": 14.032334502759433, + "learning_rate": 3.633802816901409e-06, + "loss": 1.2265, + "step": 1935 + }, + { + "epoch": 0.11, + "grad_norm": 8.839563942638703, + "learning_rate": 3.643192488262911e-06, + "loss": 1.2752, + "step": 1940 + }, + { + "epoch": 0.11, + "grad_norm": 11.061383020324275, + "learning_rate": 3.6525821596244134e-06, + "loss": 1.2062, + "step": 1945 + }, + { + "epoch": 0.11, + "grad_norm": 20.490899045262893, + "learning_rate": 3.6619718309859158e-06, + "loss": 1.2212, + "step": 1950 + }, + { + "epoch": 0.11, + "grad_norm": 11.176378163025321, + "learning_rate": 3.671361502347418e-06, + "loss": 1.2657, + "step": 1955 + }, + { + "epoch": 0.11, + "grad_norm": 15.368210617302188, + "learning_rate": 3.6807511737089206e-06, + "loss": 1.1647, + "step": 1960 + }, + { + "epoch": 0.11, + "grad_norm": 30.91571927783136, + "learning_rate": 3.690140845070423e-06, + "loss": 1.2092, + "step": 1965 + }, + { + "epoch": 0.11, + "grad_norm": 29.255455789708254, + "learning_rate": 3.699530516431925e-06, + "loss": 1.2392, + "step": 1970 + }, + { + "epoch": 0.11, + "grad_norm": 36.16139916768643, + "learning_rate": 3.708920187793428e-06, + "loss": 1.1945, + "step": 1975 + }, + { + "epoch": 0.11, + "grad_norm": 29.10675274421538, + "learning_rate": 3.71830985915493e-06, + "loss": 1.2732, + "step": 1980 + }, + { + "epoch": 0.11, + "grad_norm": 21.275163471293087, + "learning_rate": 3.7276995305164323e-06, + "loss": 1.2389, + "step": 1985 + }, + { + "epoch": 0.11, + "grad_norm": 20.208871298142192, + "learning_rate": 3.7370892018779347e-06, + "loss": 1.2168, + "step": 1990 + }, + { + "epoch": 0.11, + "grad_norm": 12.469318357293972, + "learning_rate": 3.746478873239437e-06, + "loss": 1.1905, + "step": 1995 + }, + { + "epoch": 0.11, + "grad_norm": 11.688363452103964, + "learning_rate": 3.755868544600939e-06, + "loss": 1.2247, + "step": 2000 + }, + { + "epoch": 0.11, + "grad_norm": 22.47927123788488, + "learning_rate": 3.765258215962442e-06, + "loss": 1.2028, + "step": 2005 + }, + { + "epoch": 0.11, + "grad_norm": 10.822752014295487, + "learning_rate": 3.774647887323944e-06, + "loss": 1.2249, + "step": 2010 + }, + { + "epoch": 0.11, + "grad_norm": 8.775832776295033, + "learning_rate": 3.784037558685446e-06, + "loss": 1.1848, + "step": 2015 + }, + { + "epoch": 0.11, + "grad_norm": 15.403695439707086, + "learning_rate": 3.7934272300469488e-06, + "loss": 1.2687, + "step": 2020 + }, + { + "epoch": 0.11, + "grad_norm": 25.544886300422906, + "learning_rate": 3.8028169014084508e-06, + "loss": 1.191, + "step": 2025 + }, + { + "epoch": 0.11, + "grad_norm": 26.852316151220336, + "learning_rate": 3.812206572769953e-06, + "loss": 1.215, + "step": 2030 + }, + { + "epoch": 0.11, + "grad_norm": 11.691188545436885, + "learning_rate": 3.821596244131456e-06, + "loss": 1.2523, + "step": 2035 + }, + { + "epoch": 0.11, + "grad_norm": 17.774750676679698, + "learning_rate": 3.830985915492958e-06, + "loss": 1.1613, + "step": 2040 + }, + { + "epoch": 0.12, + "grad_norm": 11.23137836953399, + "learning_rate": 3.8403755868544604e-06, + "loss": 1.2546, + "step": 2045 + }, + { + "epoch": 0.12, + "grad_norm": 18.4462130773395, + "learning_rate": 3.849765258215963e-06, + "loss": 1.196, + "step": 2050 + }, + { + "epoch": 0.12, + "grad_norm": 9.07146144561959, + "learning_rate": 3.859154929577465e-06, + "loss": 1.1759, + "step": 2055 + }, + { + "epoch": 0.12, + "grad_norm": 11.880854867288102, + "learning_rate": 3.868544600938968e-06, + "loss": 1.1976, + "step": 2060 + }, + { + "epoch": 0.12, + "grad_norm": 12.23859917864838, + "learning_rate": 3.87793427230047e-06, + "loss": 1.1506, + "step": 2065 + }, + { + "epoch": 0.12, + "grad_norm": 9.261931534727905, + "learning_rate": 3.887323943661972e-06, + "loss": 1.1614, + "step": 2070 + }, + { + "epoch": 0.12, + "grad_norm": 10.990337812660432, + "learning_rate": 3.896713615023475e-06, + "loss": 1.189, + "step": 2075 + }, + { + "epoch": 0.12, + "grad_norm": 11.297729793250264, + "learning_rate": 3.9061032863849765e-06, + "loss": 1.1841, + "step": 2080 + }, + { + "epoch": 0.12, + "grad_norm": 10.809849895984062, + "learning_rate": 3.915492957746479e-06, + "loss": 1.1932, + "step": 2085 + }, + { + "epoch": 0.12, + "grad_norm": 10.644849837301333, + "learning_rate": 3.924882629107981e-06, + "loss": 1.251, + "step": 2090 + }, + { + "epoch": 0.12, + "grad_norm": 11.654947680600644, + "learning_rate": 3.934272300469484e-06, + "loss": 1.1413, + "step": 2095 + }, + { + "epoch": 0.12, + "grad_norm": 12.666717543340287, + "learning_rate": 3.943661971830986e-06, + "loss": 1.2533, + "step": 2100 + }, + { + "epoch": 0.12, + "grad_norm": 13.23515916827622, + "learning_rate": 3.953051643192489e-06, + "loss": 1.1855, + "step": 2105 + }, + { + "epoch": 0.12, + "grad_norm": 11.611998617092816, + "learning_rate": 3.962441314553991e-06, + "loss": 1.1686, + "step": 2110 + }, + { + "epoch": 0.12, + "grad_norm": 75.18546876934055, + "learning_rate": 3.971830985915493e-06, + "loss": 1.211, + "step": 2115 + }, + { + "epoch": 0.12, + "grad_norm": 34.48381540632781, + "learning_rate": 3.981220657276996e-06, + "loss": 1.265, + "step": 2120 + }, + { + "epoch": 0.12, + "grad_norm": 24.173242774205907, + "learning_rate": 3.990610328638498e-06, + "loss": 1.2204, + "step": 2125 + }, + { + "epoch": 0.12, + "grad_norm": 17.0891214249118, + "learning_rate": 4.000000000000001e-06, + "loss": 1.1923, + "step": 2130 + }, + { + "epoch": 0.12, + "grad_norm": 41.64291555315773, + "learning_rate": 4.009389671361503e-06, + "loss": 1.2128, + "step": 2135 + }, + { + "epoch": 0.12, + "grad_norm": 10.281938539315025, + "learning_rate": 4.018779342723005e-06, + "loss": 1.2072, + "step": 2140 + }, + { + "epoch": 0.12, + "grad_norm": 10.413266774979027, + "learning_rate": 4.028169014084508e-06, + "loss": 1.2549, + "step": 2145 + }, + { + "epoch": 0.12, + "grad_norm": 30.710934721912487, + "learning_rate": 4.0375586854460095e-06, + "loss": 1.1913, + "step": 2150 + }, + { + "epoch": 0.12, + "grad_norm": 12.110650332435146, + "learning_rate": 4.046948356807512e-06, + "loss": 1.199, + "step": 2155 + }, + { + "epoch": 0.12, + "grad_norm": 14.431332214312636, + "learning_rate": 4.056338028169014e-06, + "loss": 1.2076, + "step": 2160 + }, + { + "epoch": 0.12, + "grad_norm": 11.946223670549578, + "learning_rate": 4.065727699530517e-06, + "loss": 1.2505, + "step": 2165 + }, + { + "epoch": 0.12, + "grad_norm": 15.049396887988486, + "learning_rate": 4.075117370892019e-06, + "loss": 1.2853, + "step": 2170 + }, + { + "epoch": 0.12, + "grad_norm": 11.991821965340616, + "learning_rate": 4.0845070422535216e-06, + "loss": 1.2469, + "step": 2175 + }, + { + "epoch": 0.12, + "grad_norm": 21.132829306364883, + "learning_rate": 4.093896713615024e-06, + "loss": 1.1638, + "step": 2180 + }, + { + "epoch": 0.12, + "grad_norm": 9.734974513523571, + "learning_rate": 4.103286384976526e-06, + "loss": 1.2183, + "step": 2185 + }, + { + "epoch": 0.12, + "grad_norm": 22.656682822121244, + "learning_rate": 4.112676056338029e-06, + "loss": 1.185, + "step": 2190 + }, + { + "epoch": 0.12, + "grad_norm": 55.96848982649205, + "learning_rate": 4.12206572769953e-06, + "loss": 1.2009, + "step": 2195 + }, + { + "epoch": 0.12, + "grad_norm": 18.454671520496518, + "learning_rate": 4.131455399061034e-06, + "loss": 1.1985, + "step": 2200 + }, + { + "epoch": 0.12, + "grad_norm": 15.289250236753556, + "learning_rate": 4.140845070422535e-06, + "loss": 1.2101, + "step": 2205 + }, + { + "epoch": 0.12, + "grad_norm": 14.676033470734506, + "learning_rate": 4.150234741784038e-06, + "loss": 1.1882, + "step": 2210 + }, + { + "epoch": 0.12, + "grad_norm": 11.254135510272167, + "learning_rate": 4.15962441314554e-06, + "loss": 1.1654, + "step": 2215 + }, + { + "epoch": 0.13, + "grad_norm": 12.98453780071096, + "learning_rate": 4.1690140845070425e-06, + "loss": 1.2611, + "step": 2220 + }, + { + "epoch": 0.13, + "grad_norm": 10.549573390214084, + "learning_rate": 4.178403755868545e-06, + "loss": 1.1565, + "step": 2225 + }, + { + "epoch": 0.13, + "grad_norm": 18.58385597390127, + "learning_rate": 4.187793427230047e-06, + "loss": 1.1752, + "step": 2230 + }, + { + "epoch": 0.13, + "grad_norm": 12.392989782907804, + "learning_rate": 4.19718309859155e-06, + "loss": 1.2213, + "step": 2235 + }, + { + "epoch": 0.13, + "grad_norm": 11.978724708307432, + "learning_rate": 4.206572769953052e-06, + "loss": 1.1658, + "step": 2240 + }, + { + "epoch": 0.13, + "grad_norm": 9.119250016932348, + "learning_rate": 4.2159624413145546e-06, + "loss": 1.1651, + "step": 2245 + }, + { + "epoch": 0.13, + "grad_norm": 11.990008208774727, + "learning_rate": 4.225352112676057e-06, + "loss": 1.2203, + "step": 2250 + }, + { + "epoch": 0.13, + "grad_norm": 12.797731311304023, + "learning_rate": 4.2347417840375585e-06, + "loss": 1.2147, + "step": 2255 + }, + { + "epoch": 0.13, + "grad_norm": 14.574322167429386, + "learning_rate": 4.244131455399062e-06, + "loss": 1.2288, + "step": 2260 + }, + { + "epoch": 0.13, + "grad_norm": 67.25446012983973, + "learning_rate": 4.253521126760563e-06, + "loss": 1.2248, + "step": 2265 + }, + { + "epoch": 0.13, + "grad_norm": 10.087015270076762, + "learning_rate": 4.262910798122067e-06, + "loss": 1.251, + "step": 2270 + }, + { + "epoch": 0.13, + "grad_norm": 32.14474547185929, + "learning_rate": 4.272300469483568e-06, + "loss": 1.1826, + "step": 2275 + }, + { + "epoch": 0.13, + "grad_norm": 45.47076478471937, + "learning_rate": 4.281690140845071e-06, + "loss": 1.2227, + "step": 2280 + }, + { + "epoch": 0.13, + "grad_norm": 36.72117557644853, + "learning_rate": 4.291079812206573e-06, + "loss": 1.2262, + "step": 2285 + }, + { + "epoch": 0.13, + "grad_norm": 20.1574966708121, + "learning_rate": 4.3004694835680755e-06, + "loss": 1.1448, + "step": 2290 + }, + { + "epoch": 0.13, + "grad_norm": 12.615420459592887, + "learning_rate": 4.309859154929578e-06, + "loss": 1.1637, + "step": 2295 + }, + { + "epoch": 0.13, + "grad_norm": 32.35427176558952, + "learning_rate": 4.31924882629108e-06, + "loss": 1.2204, + "step": 2300 + }, + { + "epoch": 0.13, + "grad_norm": 12.4522668863733, + "learning_rate": 4.328638497652583e-06, + "loss": 1.2138, + "step": 2305 + }, + { + "epoch": 0.13, + "grad_norm": 11.937973098253103, + "learning_rate": 4.338028169014084e-06, + "loss": 1.2189, + "step": 2310 + }, + { + "epoch": 0.13, + "grad_norm": 13.770652084639305, + "learning_rate": 4.3474178403755875e-06, + "loss": 1.1874, + "step": 2315 + }, + { + "epoch": 0.13, + "grad_norm": 31.721192010565304, + "learning_rate": 4.356807511737089e-06, + "loss": 1.1749, + "step": 2320 + }, + { + "epoch": 0.13, + "grad_norm": 15.921912666981186, + "learning_rate": 4.3661971830985915e-06, + "loss": 1.2121, + "step": 2325 + }, + { + "epoch": 0.13, + "grad_norm": 13.3296836830859, + "learning_rate": 4.375586854460094e-06, + "loss": 1.2265, + "step": 2330 + }, + { + "epoch": 0.13, + "grad_norm": 19.536602882796448, + "learning_rate": 4.384976525821596e-06, + "loss": 1.1435, + "step": 2335 + }, + { + "epoch": 0.13, + "grad_norm": 12.331085645634388, + "learning_rate": 4.3943661971831e-06, + "loss": 1.1882, + "step": 2340 + }, + { + "epoch": 0.13, + "grad_norm": 22.917501240167, + "learning_rate": 4.403755868544601e-06, + "loss": 1.1389, + "step": 2345 + }, + { + "epoch": 0.13, + "grad_norm": 25.79328209161115, + "learning_rate": 4.413145539906104e-06, + "loss": 1.2164, + "step": 2350 + }, + { + "epoch": 0.13, + "grad_norm": 15.358192281594208, + "learning_rate": 4.422535211267606e-06, + "loss": 1.176, + "step": 2355 + }, + { + "epoch": 0.13, + "grad_norm": 12.43037237322513, + "learning_rate": 4.4319248826291084e-06, + "loss": 1.194, + "step": 2360 + }, + { + "epoch": 0.13, + "grad_norm": 26.736859965173707, + "learning_rate": 4.441314553990611e-06, + "loss": 1.2026, + "step": 2365 + }, + { + "epoch": 0.13, + "grad_norm": 16.52324590542694, + "learning_rate": 4.450704225352113e-06, + "loss": 1.1399, + "step": 2370 + }, + { + "epoch": 0.13, + "grad_norm": 14.92299270355069, + "learning_rate": 4.460093896713616e-06, + "loss": 1.219, + "step": 2375 + }, + { + "epoch": 0.13, + "grad_norm": 12.095558611381136, + "learning_rate": 4.469483568075117e-06, + "loss": 1.1769, + "step": 2380 + }, + { + "epoch": 0.13, + "grad_norm": 10.923688201653198, + "learning_rate": 4.4788732394366205e-06, + "loss": 1.1507, + "step": 2385 + }, + { + "epoch": 0.13, + "grad_norm": 31.590920243735336, + "learning_rate": 4.488262910798122e-06, + "loss": 1.1989, + "step": 2390 + }, + { + "epoch": 0.13, + "grad_norm": 12.408685623776789, + "learning_rate": 4.4976525821596245e-06, + "loss": 1.1727, + "step": 2395 + }, + { + "epoch": 0.14, + "grad_norm": 22.466982177506797, + "learning_rate": 4.507042253521127e-06, + "loss": 1.1682, + "step": 2400 + }, + { + "epoch": 0.14, + "grad_norm": 15.601061830318697, + "learning_rate": 4.516431924882629e-06, + "loss": 1.1787, + "step": 2405 + }, + { + "epoch": 0.14, + "grad_norm": 18.385013048034264, + "learning_rate": 4.525821596244132e-06, + "loss": 1.2546, + "step": 2410 + }, + { + "epoch": 0.14, + "grad_norm": 47.0796561622599, + "learning_rate": 4.535211267605634e-06, + "loss": 1.2035, + "step": 2415 + }, + { + "epoch": 0.14, + "grad_norm": 69.181673678111, + "learning_rate": 4.544600938967137e-06, + "loss": 1.2772, + "step": 2420 + }, + { + "epoch": 0.14, + "grad_norm": 14.158561027487735, + "learning_rate": 4.553990610328639e-06, + "loss": 1.1987, + "step": 2425 + }, + { + "epoch": 0.14, + "grad_norm": 32.647291631523196, + "learning_rate": 4.5633802816901414e-06, + "loss": 1.2427, + "step": 2430 + }, + { + "epoch": 0.14, + "grad_norm": 54.632444722814455, + "learning_rate": 4.572769953051643e-06, + "loss": 1.2666, + "step": 2435 + }, + { + "epoch": 0.14, + "grad_norm": 37.02072504555358, + "learning_rate": 4.582159624413146e-06, + "loss": 1.2341, + "step": 2440 + }, + { + "epoch": 0.14, + "grad_norm": 18.220404511205434, + "learning_rate": 4.591549295774648e-06, + "loss": 1.2271, + "step": 2445 + }, + { + "epoch": 0.14, + "grad_norm": 11.732674018835615, + "learning_rate": 4.60093896713615e-06, + "loss": 1.2335, + "step": 2450 + }, + { + "epoch": 0.14, + "grad_norm": 19.631415654924684, + "learning_rate": 4.6103286384976535e-06, + "loss": 1.2299, + "step": 2455 + }, + { + "epoch": 0.14, + "grad_norm": 34.927007022453836, + "learning_rate": 4.619718309859155e-06, + "loss": 1.1715, + "step": 2460 + }, + { + "epoch": 0.14, + "grad_norm": 22.940986170699468, + "learning_rate": 4.6291079812206575e-06, + "loss": 1.1948, + "step": 2465 + }, + { + "epoch": 0.14, + "grad_norm": 19.41628824982717, + "learning_rate": 4.63849765258216e-06, + "loss": 1.1882, + "step": 2470 + }, + { + "epoch": 0.14, + "grad_norm": 13.632035301928894, + "learning_rate": 4.647887323943662e-06, + "loss": 1.2135, + "step": 2475 + }, + { + "epoch": 0.14, + "grad_norm": 12.609853090326421, + "learning_rate": 4.657276995305165e-06, + "loss": 1.2727, + "step": 2480 + }, + { + "epoch": 0.14, + "grad_norm": 13.105528217898113, + "learning_rate": 4.666666666666667e-06, + "loss": 1.1884, + "step": 2485 + }, + { + "epoch": 0.14, + "grad_norm": 26.294778056136835, + "learning_rate": 4.67605633802817e-06, + "loss": 1.1862, + "step": 2490 + }, + { + "epoch": 0.14, + "grad_norm": 10.718429733984017, + "learning_rate": 4.685446009389672e-06, + "loss": 1.169, + "step": 2495 + }, + { + "epoch": 0.14, + "grad_norm": 14.600669653065827, + "learning_rate": 4.694835680751174e-06, + "loss": 1.1823, + "step": 2500 + }, + { + "epoch": 0.14, + "grad_norm": 40.15389187905186, + "learning_rate": 4.704225352112676e-06, + "loss": 1.1924, + "step": 2505 + }, + { + "epoch": 0.14, + "grad_norm": 23.998923051536682, + "learning_rate": 4.713615023474179e-06, + "loss": 1.2393, + "step": 2510 + }, + { + "epoch": 0.14, + "grad_norm": 9.442591209952589, + "learning_rate": 4.723004694835681e-06, + "loss": 1.1873, + "step": 2515 + }, + { + "epoch": 0.14, + "grad_norm": 19.96855325104703, + "learning_rate": 4.732394366197183e-06, + "loss": 1.2309, + "step": 2520 + }, + { + "epoch": 0.14, + "grad_norm": 29.129323397220617, + "learning_rate": 4.741784037558686e-06, + "loss": 1.1697, + "step": 2525 + }, + { + "epoch": 0.14, + "grad_norm": 9.672294663258253, + "learning_rate": 4.751173708920188e-06, + "loss": 1.2323, + "step": 2530 + }, + { + "epoch": 0.14, + "grad_norm": 14.913564909324933, + "learning_rate": 4.7605633802816905e-06, + "loss": 1.237, + "step": 2535 + }, + { + "epoch": 0.14, + "grad_norm": 10.38302347120668, + "learning_rate": 4.769953051643193e-06, + "loss": 1.1793, + "step": 2540 + }, + { + "epoch": 0.14, + "grad_norm": 13.257323163907138, + "learning_rate": 4.779342723004695e-06, + "loss": 1.178, + "step": 2545 + }, + { + "epoch": 0.14, + "grad_norm": 19.812584778699634, + "learning_rate": 4.788732394366197e-06, + "loss": 1.147, + "step": 2550 + }, + { + "epoch": 0.14, + "grad_norm": 18.65982397347087, + "learning_rate": 4.7981220657277e-06, + "loss": 1.1151, + "step": 2555 + }, + { + "epoch": 0.14, + "grad_norm": 8.73998551226896, + "learning_rate": 4.8075117370892026e-06, + "loss": 1.2453, + "step": 2560 + }, + { + "epoch": 0.14, + "grad_norm": 28.096313844818898, + "learning_rate": 4.816901408450705e-06, + "loss": 1.1797, + "step": 2565 + }, + { + "epoch": 0.14, + "grad_norm": 13.786439715810394, + "learning_rate": 4.826291079812207e-06, + "loss": 1.2389, + "step": 2570 + }, + { + "epoch": 0.15, + "grad_norm": 10.557288723402566, + "learning_rate": 4.835680751173709e-06, + "loss": 1.1726, + "step": 2575 + }, + { + "epoch": 0.15, + "grad_norm": 11.217934859465894, + "learning_rate": 4.845070422535212e-06, + "loss": 1.2189, + "step": 2580 + }, + { + "epoch": 0.15, + "grad_norm": 15.935243637488893, + "learning_rate": 4.854460093896714e-06, + "loss": 1.1678, + "step": 2585 + }, + { + "epoch": 0.15, + "grad_norm": 62.22877930390806, + "learning_rate": 4.863849765258216e-06, + "loss": 1.19, + "step": 2590 + }, + { + "epoch": 0.15, + "grad_norm": 23.008126923573162, + "learning_rate": 4.873239436619719e-06, + "loss": 1.2024, + "step": 2595 + }, + { + "epoch": 0.15, + "grad_norm": 49.50461424928968, + "learning_rate": 4.882629107981221e-06, + "loss": 1.2202, + "step": 2600 + }, + { + "epoch": 0.15, + "grad_norm": 31.757856449655563, + "learning_rate": 4.8920187793427235e-06, + "loss": 1.2014, + "step": 2605 + }, + { + "epoch": 0.15, + "grad_norm": 17.261547220674743, + "learning_rate": 4.901408450704226e-06, + "loss": 1.2066, + "step": 2610 + }, + { + "epoch": 0.15, + "grad_norm": 18.268289450107485, + "learning_rate": 4.910798122065728e-06, + "loss": 1.186, + "step": 2615 + }, + { + "epoch": 0.15, + "grad_norm": 13.307824070308255, + "learning_rate": 4.92018779342723e-06, + "loss": 1.189, + "step": 2620 + }, + { + "epoch": 0.15, + "grad_norm": 22.637796115657476, + "learning_rate": 4.929577464788733e-06, + "loss": 1.1956, + "step": 2625 + }, + { + "epoch": 0.15, + "grad_norm": 8.755645306820854, + "learning_rate": 4.938967136150235e-06, + "loss": 1.1665, + "step": 2630 + }, + { + "epoch": 0.15, + "grad_norm": 11.468304785143085, + "learning_rate": 4.948356807511738e-06, + "loss": 1.2118, + "step": 2635 + }, + { + "epoch": 0.15, + "grad_norm": 12.799015010559174, + "learning_rate": 4.9577464788732395e-06, + "loss": 1.2311, + "step": 2640 + }, + { + "epoch": 0.15, + "grad_norm": 19.87197709226322, + "learning_rate": 4.967136150234742e-06, + "loss": 1.2427, + "step": 2645 + }, + { + "epoch": 0.15, + "grad_norm": 9.099632777132724, + "learning_rate": 4.976525821596244e-06, + "loss": 1.1823, + "step": 2650 + }, + { + "epoch": 0.15, + "grad_norm": 34.29335759964042, + "learning_rate": 4.985915492957747e-06, + "loss": 1.2207, + "step": 2655 + }, + { + "epoch": 0.15, + "grad_norm": 21.877054854846023, + "learning_rate": 4.995305164319249e-06, + "loss": 1.1786, + "step": 2660 + }, + { + "epoch": 0.15, + "grad_norm": 16.579004116388244, + "learning_rate": 5.004694835680752e-06, + "loss": 1.215, + "step": 2665 + }, + { + "epoch": 0.15, + "grad_norm": 11.894708596586472, + "learning_rate": 5.014084507042254e-06, + "loss": 1.1824, + "step": 2670 + }, + { + "epoch": 0.15, + "grad_norm": 8.768192973718882, + "learning_rate": 5.0234741784037565e-06, + "loss": 1.1863, + "step": 2675 + }, + { + "epoch": 0.15, + "grad_norm": 10.184374234203284, + "learning_rate": 5.032863849765258e-06, + "loss": 1.1987, + "step": 2680 + }, + { + "epoch": 0.15, + "grad_norm": 27.69630520473007, + "learning_rate": 5.042253521126761e-06, + "loss": 1.2174, + "step": 2685 + }, + { + "epoch": 0.15, + "grad_norm": 11.14666148453873, + "learning_rate": 5.051643192488264e-06, + "loss": 1.1656, + "step": 2690 + }, + { + "epoch": 0.15, + "grad_norm": 51.86245200534382, + "learning_rate": 5.061032863849765e-06, + "loss": 1.2327, + "step": 2695 + }, + { + "epoch": 0.15, + "grad_norm": 12.08166458034953, + "learning_rate": 5.070422535211268e-06, + "loss": 1.1426, + "step": 2700 + }, + { + "epoch": 0.15, + "grad_norm": 30.672451641803097, + "learning_rate": 5.079812206572771e-06, + "loss": 1.228, + "step": 2705 + }, + { + "epoch": 0.15, + "grad_norm": 13.681858098471906, + "learning_rate": 5.0892018779342725e-06, + "loss": 1.2094, + "step": 2710 + }, + { + "epoch": 0.15, + "grad_norm": 11.582748731923179, + "learning_rate": 5.098591549295775e-06, + "loss": 1.1459, + "step": 2715 + }, + { + "epoch": 0.15, + "grad_norm": 16.526169998045273, + "learning_rate": 5.107981220657277e-06, + "loss": 1.1561, + "step": 2720 + }, + { + "epoch": 0.15, + "grad_norm": 22.182816069511833, + "learning_rate": 5.117370892018779e-06, + "loss": 1.1646, + "step": 2725 + }, + { + "epoch": 0.15, + "grad_norm": 16.13347585327498, + "learning_rate": 5.126760563380282e-06, + "loss": 1.1673, + "step": 2730 + }, + { + "epoch": 0.15, + "grad_norm": 6.92383247295312, + "learning_rate": 5.136150234741785e-06, + "loss": 1.0964, + "step": 2735 + }, + { + "epoch": 0.15, + "grad_norm": 25.238958262104113, + "learning_rate": 5.145539906103287e-06, + "loss": 1.126, + "step": 2740 + }, + { + "epoch": 0.15, + "grad_norm": 10.727915817681522, + "learning_rate": 5.154929577464789e-06, + "loss": 1.184, + "step": 2745 + }, + { + "epoch": 0.15, + "grad_norm": 10.07815234664372, + "learning_rate": 5.164319248826292e-06, + "loss": 1.1962, + "step": 2750 + }, + { + "epoch": 0.16, + "grad_norm": 11.580965380449397, + "learning_rate": 5.173708920187794e-06, + "loss": 1.1812, + "step": 2755 + }, + { + "epoch": 0.16, + "grad_norm": 10.957695684884207, + "learning_rate": 5.183098591549296e-06, + "loss": 1.2402, + "step": 2760 + }, + { + "epoch": 0.16, + "grad_norm": 14.005195755748117, + "learning_rate": 5.192488262910798e-06, + "loss": 1.1686, + "step": 2765 + }, + { + "epoch": 0.16, + "grad_norm": 16.136526342545483, + "learning_rate": 5.2018779342723015e-06, + "loss": 1.1726, + "step": 2770 + }, + { + "epoch": 0.16, + "grad_norm": 11.085901815839362, + "learning_rate": 5.211267605633803e-06, + "loss": 1.1847, + "step": 2775 + }, + { + "epoch": 0.16, + "grad_norm": 21.964623577454045, + "learning_rate": 5.2206572769953055e-06, + "loss": 1.2073, + "step": 2780 + }, + { + "epoch": 0.16, + "grad_norm": 34.18386913088435, + "learning_rate": 5.230046948356809e-06, + "loss": 1.2156, + "step": 2785 + }, + { + "epoch": 0.16, + "grad_norm": 13.006982124236956, + "learning_rate": 5.23943661971831e-06, + "loss": 1.2176, + "step": 2790 + }, + { + "epoch": 0.16, + "grad_norm": 29.671873936742475, + "learning_rate": 5.248826291079813e-06, + "loss": 1.1959, + "step": 2795 + }, + { + "epoch": 0.16, + "grad_norm": 9.956018503582134, + "learning_rate": 5.258215962441315e-06, + "loss": 1.2051, + "step": 2800 + }, + { + "epoch": 0.16, + "grad_norm": 11.745510165265399, + "learning_rate": 5.267605633802817e-06, + "loss": 1.1691, + "step": 2805 + }, + { + "epoch": 0.16, + "grad_norm": 17.64201493072601, + "learning_rate": 5.27699530516432e-06, + "loss": 1.2276, + "step": 2810 + }, + { + "epoch": 0.16, + "grad_norm": 11.769176937775635, + "learning_rate": 5.286384976525822e-06, + "loss": 1.235, + "step": 2815 + }, + { + "epoch": 0.16, + "grad_norm": 10.63246771406025, + "learning_rate": 5.295774647887324e-06, + "loss": 1.2138, + "step": 2820 + }, + { + "epoch": 0.16, + "grad_norm": 16.948650740908132, + "learning_rate": 5.305164319248826e-06, + "loss": 1.1787, + "step": 2825 + }, + { + "epoch": 0.16, + "grad_norm": 13.360167189109927, + "learning_rate": 5.31455399061033e-06, + "loss": 1.2318, + "step": 2830 + }, + { + "epoch": 0.16, + "grad_norm": 30.88292172945298, + "learning_rate": 5.323943661971831e-06, + "loss": 1.2123, + "step": 2835 + }, + { + "epoch": 0.16, + "grad_norm": 37.37611584358783, + "learning_rate": 5.333333333333334e-06, + "loss": 1.1862, + "step": 2840 + }, + { + "epoch": 0.16, + "grad_norm": 13.77094118061188, + "learning_rate": 5.342723004694836e-06, + "loss": 1.1575, + "step": 2845 + }, + { + "epoch": 0.16, + "grad_norm": 79.92042130900299, + "learning_rate": 5.352112676056338e-06, + "loss": 1.2249, + "step": 2850 + }, + { + "epoch": 0.16, + "grad_norm": 16.350216749270025, + "learning_rate": 5.361502347417841e-06, + "loss": 1.1811, + "step": 2855 + }, + { + "epoch": 0.16, + "grad_norm": 61.20128382809511, + "learning_rate": 5.370892018779343e-06, + "loss": 1.2, + "step": 2860 + }, + { + "epoch": 0.16, + "grad_norm": 15.102967578060602, + "learning_rate": 5.380281690140845e-06, + "loss": 1.1781, + "step": 2865 + }, + { + "epoch": 0.16, + "grad_norm": 9.080028854178368, + "learning_rate": 5.389671361502347e-06, + "loss": 1.2154, + "step": 2870 + }, + { + "epoch": 0.16, + "grad_norm": 18.67126903643239, + "learning_rate": 5.3990610328638506e-06, + "loss": 1.2048, + "step": 2875 + }, + { + "epoch": 0.16, + "grad_norm": 59.06522160392222, + "learning_rate": 5.408450704225352e-06, + "loss": 1.2839, + "step": 2880 + }, + { + "epoch": 0.16, + "grad_norm": 9.612084203697622, + "learning_rate": 5.4178403755868546e-06, + "loss": 1.1689, + "step": 2885 + }, + { + "epoch": 0.16, + "grad_norm": 34.25826185544695, + "learning_rate": 5.427230046948358e-06, + "loss": 1.1436, + "step": 2890 + }, + { + "epoch": 0.16, + "grad_norm": 10.800326936769686, + "learning_rate": 5.43661971830986e-06, + "loss": 1.2112, + "step": 2895 + }, + { + "epoch": 0.16, + "grad_norm": 23.10092238106868, + "learning_rate": 5.446009389671362e-06, + "loss": 1.1718, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 30.954692739996624, + "learning_rate": 5.455399061032864e-06, + "loss": 1.1675, + "step": 2905 + }, + { + "epoch": 0.16, + "grad_norm": 8.944793887450846, + "learning_rate": 5.4647887323943675e-06, + "loss": 1.1885, + "step": 2910 + }, + { + "epoch": 0.16, + "grad_norm": 44.80214963552076, + "learning_rate": 5.474178403755869e-06, + "loss": 1.2333, + "step": 2915 + }, + { + "epoch": 0.16, + "grad_norm": 16.42679087897064, + "learning_rate": 5.4835680751173715e-06, + "loss": 1.1682, + "step": 2920 + }, + { + "epoch": 0.16, + "grad_norm": 37.40246134344992, + "learning_rate": 5.492957746478874e-06, + "loss": 1.1665, + "step": 2925 + }, + { + "epoch": 0.17, + "grad_norm": 15.787998396878661, + "learning_rate": 5.5023474178403755e-06, + "loss": 1.2112, + "step": 2930 + }, + { + "epoch": 0.17, + "grad_norm": 15.379855721125532, + "learning_rate": 5.511737089201879e-06, + "loss": 1.1584, + "step": 2935 + }, + { + "epoch": 0.17, + "grad_norm": 24.23506920135781, + "learning_rate": 5.521126760563381e-06, + "loss": 1.1276, + "step": 2940 + }, + { + "epoch": 0.17, + "grad_norm": 10.665725698283438, + "learning_rate": 5.530516431924883e-06, + "loss": 1.1764, + "step": 2945 + }, + { + "epoch": 0.17, + "grad_norm": 9.039102772157442, + "learning_rate": 5.539906103286385e-06, + "loss": 1.1508, + "step": 2950 + }, + { + "epoch": 0.17, + "grad_norm": 12.457190549365503, + "learning_rate": 5.549295774647888e-06, + "loss": 1.1823, + "step": 2955 + }, + { + "epoch": 0.17, + "grad_norm": 11.476738254085967, + "learning_rate": 5.55868544600939e-06, + "loss": 1.2079, + "step": 2960 + }, + { + "epoch": 0.17, + "grad_norm": 7.458033287032747, + "learning_rate": 5.568075117370892e-06, + "loss": 1.1699, + "step": 2965 + }, + { + "epoch": 0.17, + "grad_norm": 15.712976133528741, + "learning_rate": 5.577464788732395e-06, + "loss": 1.1868, + "step": 2970 + }, + { + "epoch": 0.17, + "grad_norm": 14.417131558431354, + "learning_rate": 5.586854460093896e-06, + "loss": 1.1654, + "step": 2975 + }, + { + "epoch": 0.17, + "grad_norm": 28.59571408785586, + "learning_rate": 5.5962441314554e-06, + "loss": 1.2117, + "step": 2980 + }, + { + "epoch": 0.17, + "grad_norm": 23.036729284240412, + "learning_rate": 5.605633802816902e-06, + "loss": 1.17, + "step": 2985 + }, + { + "epoch": 0.17, + "grad_norm": 9.856664652849746, + "learning_rate": 5.615023474178404e-06, + "loss": 1.2174, + "step": 2990 + }, + { + "epoch": 0.17, + "grad_norm": 17.003741305513707, + "learning_rate": 5.624413145539907e-06, + "loss": 1.1538, + "step": 2995 + }, + { + "epoch": 0.17, + "grad_norm": 14.1943600029335, + "learning_rate": 5.633802816901409e-06, + "loss": 1.2039, + "step": 3000 + }, + { + "epoch": 0.17, + "grad_norm": 31.453900964586012, + "learning_rate": 5.643192488262911e-06, + "loss": 1.2086, + "step": 3005 + }, + { + "epoch": 0.17, + "grad_norm": 15.353598012487927, + "learning_rate": 5.652582159624413e-06, + "loss": 1.1823, + "step": 3010 + }, + { + "epoch": 0.17, + "grad_norm": 11.542089610369638, + "learning_rate": 5.6619718309859165e-06, + "loss": 1.1715, + "step": 3015 + }, + { + "epoch": 0.17, + "grad_norm": 22.616104263459135, + "learning_rate": 5.671361502347418e-06, + "loss": 1.1886, + "step": 3020 + }, + { + "epoch": 0.17, + "grad_norm": 23.306014250423694, + "learning_rate": 5.6807511737089205e-06, + "loss": 1.1822, + "step": 3025 + }, + { + "epoch": 0.17, + "grad_norm": 12.440575640440233, + "learning_rate": 5.690140845070423e-06, + "loss": 1.1809, + "step": 3030 + }, + { + "epoch": 0.17, + "grad_norm": 13.727589981071375, + "learning_rate": 5.699530516431926e-06, + "loss": 1.2242, + "step": 3035 + }, + { + "epoch": 0.17, + "grad_norm": 8.2126417115623, + "learning_rate": 5.708920187793428e-06, + "loss": 1.242, + "step": 3040 + }, + { + "epoch": 0.17, + "grad_norm": 8.82700181003469, + "learning_rate": 5.71830985915493e-06, + "loss": 1.2324, + "step": 3045 + }, + { + "epoch": 0.17, + "grad_norm": 28.032261575246643, + "learning_rate": 5.727699530516433e-06, + "loss": 1.1834, + "step": 3050 + }, + { + "epoch": 0.17, + "grad_norm": 18.608062314257065, + "learning_rate": 5.737089201877934e-06, + "loss": 1.1929, + "step": 3055 + }, + { + "epoch": 0.17, + "grad_norm": 18.57639194191237, + "learning_rate": 5.7464788732394374e-06, + "loss": 1.1984, + "step": 3060 + }, + { + "epoch": 0.17, + "grad_norm": 22.635698991745947, + "learning_rate": 5.75586854460094e-06, + "loss": 1.1496, + "step": 3065 + }, + { + "epoch": 0.17, + "grad_norm": 15.601298823552913, + "learning_rate": 5.7652582159624414e-06, + "loss": 1.2219, + "step": 3070 + }, + { + "epoch": 0.17, + "grad_norm": 34.14981338727205, + "learning_rate": 5.774647887323944e-06, + "loss": 1.2354, + "step": 3075 + }, + { + "epoch": 0.17, + "grad_norm": 25.367596079631312, + "learning_rate": 5.784037558685447e-06, + "loss": 1.2353, + "step": 3080 + }, + { + "epoch": 0.17, + "grad_norm": 17.420422834221338, + "learning_rate": 5.793427230046949e-06, + "loss": 1.2228, + "step": 3085 + }, + { + "epoch": 0.17, + "grad_norm": 12.91873655885533, + "learning_rate": 5.802816901408451e-06, + "loss": 1.1709, + "step": 3090 + }, + { + "epoch": 0.17, + "grad_norm": 10.844350103128894, + "learning_rate": 5.8122065727699535e-06, + "loss": 1.1639, + "step": 3095 + }, + { + "epoch": 0.17, + "grad_norm": 11.342796919264774, + "learning_rate": 5.821596244131456e-06, + "loss": 1.1326, + "step": 3100 + }, + { + "epoch": 0.17, + "grad_norm": 18.855294721496623, + "learning_rate": 5.830985915492958e-06, + "loss": 1.2328, + "step": 3105 + }, + { + "epoch": 0.18, + "grad_norm": 16.427431240511645, + "learning_rate": 5.840375586854461e-06, + "loss": 1.2116, + "step": 3110 + }, + { + "epoch": 0.18, + "grad_norm": 45.723532604141056, + "learning_rate": 5.849765258215962e-06, + "loss": 1.1256, + "step": 3115 + }, + { + "epoch": 0.18, + "grad_norm": 33.542083457268305, + "learning_rate": 5.859154929577466e-06, + "loss": 1.1882, + "step": 3120 + }, + { + "epoch": 0.18, + "grad_norm": 14.129904502261516, + "learning_rate": 5.868544600938968e-06, + "loss": 1.162, + "step": 3125 + }, + { + "epoch": 0.18, + "grad_norm": 18.476932976901118, + "learning_rate": 5.87793427230047e-06, + "loss": 1.1973, + "step": 3130 + }, + { + "epoch": 0.18, + "grad_norm": 15.902035615766474, + "learning_rate": 5.887323943661972e-06, + "loss": 1.1472, + "step": 3135 + }, + { + "epoch": 0.18, + "grad_norm": 11.704225491764763, + "learning_rate": 5.896713615023475e-06, + "loss": 1.2177, + "step": 3140 + }, + { + "epoch": 0.18, + "grad_norm": 15.953895764161967, + "learning_rate": 5.906103286384977e-06, + "loss": 1.2006, + "step": 3145 + }, + { + "epoch": 0.18, + "grad_norm": 9.109934859779493, + "learning_rate": 5.915492957746479e-06, + "loss": 1.1376, + "step": 3150 + }, + { + "epoch": 0.18, + "grad_norm": 9.455425666662528, + "learning_rate": 5.924882629107982e-06, + "loss": 1.14, + "step": 3155 + }, + { + "epoch": 0.18, + "grad_norm": 9.751153243299727, + "learning_rate": 5.934272300469483e-06, + "loss": 1.1672, + "step": 3160 + }, + { + "epoch": 0.18, + "grad_norm": 12.578442734134143, + "learning_rate": 5.9436619718309865e-06, + "loss": 1.1984, + "step": 3165 + }, + { + "epoch": 0.18, + "grad_norm": 16.27194019530656, + "learning_rate": 5.953051643192489e-06, + "loss": 1.1612, + "step": 3170 + }, + { + "epoch": 0.18, + "grad_norm": 10.663340387913973, + "learning_rate": 5.9624413145539905e-06, + "loss": 1.1789, + "step": 3175 + }, + { + "epoch": 0.18, + "grad_norm": 9.588367068745, + "learning_rate": 5.971830985915493e-06, + "loss": 1.1218, + "step": 3180 + }, + { + "epoch": 0.18, + "grad_norm": 9.293006787691757, + "learning_rate": 5.981220657276996e-06, + "loss": 1.2613, + "step": 3185 + }, + { + "epoch": 0.18, + "grad_norm": 15.822416797957528, + "learning_rate": 5.990610328638499e-06, + "loss": 1.1899, + "step": 3190 + }, + { + "epoch": 0.18, + "grad_norm": 14.880043593701103, + "learning_rate": 6e-06, + "loss": 1.2044, + "step": 3195 + }, + { + "epoch": 0.18, + "grad_norm": 14.928209500542499, + "learning_rate": 6.0093896713615026e-06, + "loss": 1.232, + "step": 3200 + }, + { + "epoch": 0.18, + "grad_norm": 25.473220694504985, + "learning_rate": 6.018779342723006e-06, + "loss": 1.1904, + "step": 3205 + }, + { + "epoch": 0.18, + "grad_norm": 23.182348972445453, + "learning_rate": 6.028169014084507e-06, + "loss": 1.1145, + "step": 3210 + }, + { + "epoch": 0.18, + "grad_norm": 19.93176421759847, + "learning_rate": 6.03755868544601e-06, + "loss": 1.2053, + "step": 3215 + }, + { + "epoch": 0.18, + "grad_norm": 33.311478654152445, + "learning_rate": 6.046948356807513e-06, + "loss": 1.23, + "step": 3220 + }, + { + "epoch": 0.18, + "grad_norm": 34.910563964566165, + "learning_rate": 6.056338028169015e-06, + "loss": 1.2042, + "step": 3225 + }, + { + "epoch": 0.18, + "grad_norm": 10.305916726699248, + "learning_rate": 6.065727699530517e-06, + "loss": 1.2107, + "step": 3230 + }, + { + "epoch": 0.18, + "grad_norm": 17.925555694783906, + "learning_rate": 6.0751173708920195e-06, + "loss": 1.1915, + "step": 3235 + }, + { + "epoch": 0.18, + "grad_norm": 17.72001978234508, + "learning_rate": 6.084507042253521e-06, + "loss": 1.1603, + "step": 3240 + }, + { + "epoch": 0.18, + "grad_norm": 9.731287901642945, + "learning_rate": 6.093896713615024e-06, + "loss": 1.2402, + "step": 3245 + }, + { + "epoch": 0.18, + "grad_norm": 12.023217803642202, + "learning_rate": 6.103286384976527e-06, + "loss": 1.1384, + "step": 3250 + }, + { + "epoch": 0.18, + "grad_norm": 43.72845432383127, + "learning_rate": 6.112676056338028e-06, + "loss": 1.1864, + "step": 3255 + }, + { + "epoch": 0.18, + "grad_norm": 23.88664324844327, + "learning_rate": 6.122065727699531e-06, + "loss": 1.1742, + "step": 3260 + }, + { + "epoch": 0.18, + "grad_norm": 22.062745353495536, + "learning_rate": 6.131455399061034e-06, + "loss": 1.2107, + "step": 3265 + }, + { + "epoch": 0.18, + "grad_norm": 59.88662294125921, + "learning_rate": 6.1408450704225356e-06, + "loss": 1.2874, + "step": 3270 + }, + { + "epoch": 0.18, + "grad_norm": 29.659605027540636, + "learning_rate": 6.150234741784038e-06, + "loss": 1.2323, + "step": 3275 + }, + { + "epoch": 0.18, + "grad_norm": 12.246236154429443, + "learning_rate": 6.15962441314554e-06, + "loss": 1.1871, + "step": 3280 + }, + { + "epoch": 0.19, + "grad_norm": 11.487902777975865, + "learning_rate": 6.169014084507042e-06, + "loss": 1.1903, + "step": 3285 + }, + { + "epoch": 0.19, + "grad_norm": 9.190816034075878, + "learning_rate": 6.178403755868545e-06, + "loss": 1.1471, + "step": 3290 + }, + { + "epoch": 0.19, + "grad_norm": 20.895203886752387, + "learning_rate": 6.187793427230048e-06, + "loss": 1.1858, + "step": 3295 + }, + { + "epoch": 0.19, + "grad_norm": 13.52950883704903, + "learning_rate": 6.197183098591549e-06, + "loss": 1.2423, + "step": 3300 + }, + { + "epoch": 0.19, + "grad_norm": 10.835986404636852, + "learning_rate": 6.206572769953052e-06, + "loss": 1.1984, + "step": 3305 + }, + { + "epoch": 0.19, + "grad_norm": 16.283584871546616, + "learning_rate": 6.215962441314555e-06, + "loss": 1.1808, + "step": 3310 + }, + { + "epoch": 0.19, + "grad_norm": 13.592837492965831, + "learning_rate": 6.2253521126760565e-06, + "loss": 1.1685, + "step": 3315 + }, + { + "epoch": 0.19, + "grad_norm": 11.482807347446647, + "learning_rate": 6.234741784037559e-06, + "loss": 1.1492, + "step": 3320 + }, + { + "epoch": 0.19, + "grad_norm": 13.836520719453812, + "learning_rate": 6.244131455399062e-06, + "loss": 1.174, + "step": 3325 + }, + { + "epoch": 0.19, + "grad_norm": 10.145387454025895, + "learning_rate": 6.2535211267605646e-06, + "loss": 1.1839, + "step": 3330 + }, + { + "epoch": 0.19, + "grad_norm": 19.74980245451608, + "learning_rate": 6.262910798122066e-06, + "loss": 1.213, + "step": 3335 + }, + { + "epoch": 0.19, + "grad_norm": 10.0051182281824, + "learning_rate": 6.2723004694835685e-06, + "loss": 1.1656, + "step": 3340 + }, + { + "epoch": 0.19, + "grad_norm": 26.544198001647317, + "learning_rate": 6.281690140845072e-06, + "loss": 1.2225, + "step": 3345 + }, + { + "epoch": 0.19, + "grad_norm": 21.05461943725882, + "learning_rate": 6.291079812206573e-06, + "loss": 1.1576, + "step": 3350 + }, + { + "epoch": 0.19, + "grad_norm": 46.837709334348844, + "learning_rate": 6.300469483568076e-06, + "loss": 1.2523, + "step": 3355 + }, + { + "epoch": 0.19, + "grad_norm": 11.130518117204725, + "learning_rate": 6.309859154929578e-06, + "loss": 1.1609, + "step": 3360 + }, + { + "epoch": 0.19, + "grad_norm": 20.90081126517263, + "learning_rate": 6.31924882629108e-06, + "loss": 1.1375, + "step": 3365 + }, + { + "epoch": 0.19, + "grad_norm": 17.902814665858642, + "learning_rate": 6.328638497652583e-06, + "loss": 1.21, + "step": 3370 + }, + { + "epoch": 0.19, + "grad_norm": 17.0678477939345, + "learning_rate": 6.3380281690140855e-06, + "loss": 1.158, + "step": 3375 + }, + { + "epoch": 0.19, + "grad_norm": 14.45480025089843, + "learning_rate": 6.347417840375587e-06, + "loss": 1.2198, + "step": 3380 + }, + { + "epoch": 0.19, + "grad_norm": 10.318387694667846, + "learning_rate": 6.3568075117370894e-06, + "loss": 1.1965, + "step": 3385 + }, + { + "epoch": 0.19, + "grad_norm": 13.84241615890925, + "learning_rate": 6.366197183098593e-06, + "loss": 1.1983, + "step": 3390 + }, + { + "epoch": 0.19, + "grad_norm": 17.25212373036805, + "learning_rate": 6.375586854460094e-06, + "loss": 1.1699, + "step": 3395 + }, + { + "epoch": 0.19, + "grad_norm": 20.21689707128868, + "learning_rate": 6.384976525821597e-06, + "loss": 1.2123, + "step": 3400 + }, + { + "epoch": 0.19, + "grad_norm": 11.160465138136907, + "learning_rate": 6.394366197183099e-06, + "loss": 1.1336, + "step": 3405 + }, + { + "epoch": 0.19, + "grad_norm": 11.25062850522987, + "learning_rate": 6.403755868544601e-06, + "loss": 1.1507, + "step": 3410 + }, + { + "epoch": 0.19, + "grad_norm": 15.759040977103274, + "learning_rate": 6.413145539906104e-06, + "loss": 1.2328, + "step": 3415 + }, + { + "epoch": 0.19, + "grad_norm": 13.202392125250093, + "learning_rate": 6.422535211267606e-06, + "loss": 1.1934, + "step": 3420 + }, + { + "epoch": 0.19, + "grad_norm": 22.395804601771438, + "learning_rate": 6.431924882629108e-06, + "loss": 1.1812, + "step": 3425 + }, + { + "epoch": 0.19, + "grad_norm": 9.776606046029634, + "learning_rate": 6.441314553990611e-06, + "loss": 1.1571, + "step": 3430 + }, + { + "epoch": 0.19, + "grad_norm": 25.10005593935468, + "learning_rate": 6.450704225352114e-06, + "loss": 1.2581, + "step": 3435 + }, + { + "epoch": 0.19, + "grad_norm": 9.36646332934216, + "learning_rate": 6.460093896713615e-06, + "loss": 1.2021, + "step": 3440 + }, + { + "epoch": 0.19, + "grad_norm": 13.836089777946722, + "learning_rate": 6.469483568075118e-06, + "loss": 1.173, + "step": 3445 + }, + { + "epoch": 0.19, + "grad_norm": 14.904518395390559, + "learning_rate": 6.478873239436621e-06, + "loss": 1.1393, + "step": 3450 + }, + { + "epoch": 0.19, + "grad_norm": 15.856781617568297, + "learning_rate": 6.4882629107981224e-06, + "loss": 1.1876, + "step": 3455 + }, + { + "epoch": 0.19, + "grad_norm": 20.892170004163315, + "learning_rate": 6.497652582159625e-06, + "loss": 1.1603, + "step": 3460 + }, + { + "epoch": 0.2, + "grad_norm": 13.992435890958625, + "learning_rate": 6.507042253521127e-06, + "loss": 1.1057, + "step": 3465 + }, + { + "epoch": 0.2, + "grad_norm": 8.727581752017006, + "learning_rate": 6.516431924882629e-06, + "loss": 1.1899, + "step": 3470 + }, + { + "epoch": 0.2, + "grad_norm": 15.205494216931335, + "learning_rate": 6.525821596244132e-06, + "loss": 1.1475, + "step": 3475 + }, + { + "epoch": 0.2, + "grad_norm": 21.09712047658045, + "learning_rate": 6.5352112676056345e-06, + "loss": 1.2599, + "step": 3480 + }, + { + "epoch": 0.2, + "grad_norm": 10.585943164007832, + "learning_rate": 6.544600938967137e-06, + "loss": 1.1485, + "step": 3485 + }, + { + "epoch": 0.2, + "grad_norm": 14.70113802638682, + "learning_rate": 6.5539906103286385e-06, + "loss": 1.1576, + "step": 3490 + }, + { + "epoch": 0.2, + "grad_norm": 10.476299406363472, + "learning_rate": 6.563380281690142e-06, + "loss": 1.2452, + "step": 3495 + }, + { + "epoch": 0.2, + "grad_norm": 35.03789912898917, + "learning_rate": 6.572769953051644e-06, + "loss": 1.161, + "step": 3500 + }, + { + "epoch": 0.2, + "grad_norm": 27.36668377715275, + "learning_rate": 6.582159624413146e-06, + "loss": 1.1931, + "step": 3505 + }, + { + "epoch": 0.2, + "grad_norm": 9.136014877955109, + "learning_rate": 6.591549295774648e-06, + "loss": 1.161, + "step": 3510 + }, + { + "epoch": 0.2, + "grad_norm": 12.84250751227113, + "learning_rate": 6.6009389671361514e-06, + "loss": 1.1951, + "step": 3515 + }, + { + "epoch": 0.2, + "grad_norm": 10.378067004203, + "learning_rate": 6.610328638497653e-06, + "loss": 1.1747, + "step": 3520 + }, + { + "epoch": 0.2, + "grad_norm": 12.81809857296866, + "learning_rate": 6.619718309859155e-06, + "loss": 1.1838, + "step": 3525 + }, + { + "epoch": 0.2, + "grad_norm": 13.566027578446029, + "learning_rate": 6.629107981220658e-06, + "loss": 1.2146, + "step": 3530 + }, + { + "epoch": 0.2, + "grad_norm": 22.616110729334892, + "learning_rate": 6.63849765258216e-06, + "loss": 1.217, + "step": 3535 + }, + { + "epoch": 0.2, + "grad_norm": 15.777553651375694, + "learning_rate": 6.647887323943663e-06, + "loss": 1.1952, + "step": 3540 + }, + { + "epoch": 0.2, + "grad_norm": 17.766432365714778, + "learning_rate": 6.657276995305165e-06, + "loss": 1.1913, + "step": 3545 + }, + { + "epoch": 0.2, + "grad_norm": 23.87892511875046, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1854, + "step": 3550 + }, + { + "epoch": 0.2, + "grad_norm": 14.134182584773212, + "learning_rate": 6.67605633802817e-06, + "loss": 1.2105, + "step": 3555 + }, + { + "epoch": 0.2, + "grad_norm": 10.472091344972434, + "learning_rate": 6.685446009389672e-06, + "loss": 1.17, + "step": 3560 + }, + { + "epoch": 0.2, + "grad_norm": 10.822251781811772, + "learning_rate": 6.694835680751174e-06, + "loss": 1.1933, + "step": 3565 + }, + { + "epoch": 0.2, + "grad_norm": 15.129273978116814, + "learning_rate": 6.704225352112676e-06, + "loss": 1.2379, + "step": 3570 + }, + { + "epoch": 0.2, + "grad_norm": 23.817218249527652, + "learning_rate": 6.71361502347418e-06, + "loss": 1.1708, + "step": 3575 + }, + { + "epoch": 0.2, + "grad_norm": 28.537208165380388, + "learning_rate": 6.723004694835681e-06, + "loss": 1.161, + "step": 3580 + }, + { + "epoch": 0.2, + "grad_norm": 16.494696977137288, + "learning_rate": 6.7323943661971836e-06, + "loss": 1.1503, + "step": 3585 + }, + { + "epoch": 0.2, + "grad_norm": 68.31195592819469, + "learning_rate": 6.741784037558686e-06, + "loss": 1.243, + "step": 3590 + }, + { + "epoch": 0.2, + "grad_norm": 40.88593787189487, + "learning_rate": 6.7511737089201875e-06, + "loss": 1.2053, + "step": 3595 + }, + { + "epoch": 0.2, + "grad_norm": 13.129499892476897, + "learning_rate": 6.760563380281691e-06, + "loss": 1.2732, + "step": 3600 + }, + { + "epoch": 0.2, + "grad_norm": 42.19342123857229, + "learning_rate": 6.769953051643193e-06, + "loss": 1.1678, + "step": 3605 + }, + { + "epoch": 0.2, + "grad_norm": 9.406888645661365, + "learning_rate": 6.779342723004695e-06, + "loss": 1.2387, + "step": 3610 + }, + { + "epoch": 0.2, + "grad_norm": 24.730586011039748, + "learning_rate": 6.788732394366197e-06, + "loss": 1.2213, + "step": 3615 + }, + { + "epoch": 0.2, + "grad_norm": 9.977379864150745, + "learning_rate": 6.7981220657277005e-06, + "loss": 1.1901, + "step": 3620 + }, + { + "epoch": 0.2, + "grad_norm": 61.8058816801069, + "learning_rate": 6.807511737089203e-06, + "loss": 1.1543, + "step": 3625 + }, + { + "epoch": 0.2, + "grad_norm": 29.135776663159877, + "learning_rate": 6.8169014084507045e-06, + "loss": 1.2084, + "step": 3630 + }, + { + "epoch": 0.2, + "grad_norm": 15.69234399232136, + "learning_rate": 6.826291079812207e-06, + "loss": 1.1876, + "step": 3635 + }, + { + "epoch": 0.21, + "grad_norm": 12.404150099415487, + "learning_rate": 6.83568075117371e-06, + "loss": 1.1912, + "step": 3640 + }, + { + "epoch": 0.21, + "grad_norm": 18.070747134810834, + "learning_rate": 6.845070422535212e-06, + "loss": 1.2465, + "step": 3645 + }, + { + "epoch": 0.21, + "grad_norm": 11.393409730076835, + "learning_rate": 6.854460093896714e-06, + "loss": 1.1301, + "step": 3650 + }, + { + "epoch": 0.21, + "grad_norm": 10.319997724780428, + "learning_rate": 6.863849765258217e-06, + "loss": 1.1826, + "step": 3655 + }, + { + "epoch": 0.21, + "grad_norm": 18.14056625190511, + "learning_rate": 6.873239436619719e-06, + "loss": 1.188, + "step": 3660 + }, + { + "epoch": 0.21, + "grad_norm": 17.06116512876983, + "learning_rate": 6.882629107981221e-06, + "loss": 1.1528, + "step": 3665 + }, + { + "epoch": 0.21, + "grad_norm": 13.17917024296913, + "learning_rate": 6.892018779342724e-06, + "loss": 1.2301, + "step": 3670 + }, + { + "epoch": 0.21, + "grad_norm": 39.495521701693804, + "learning_rate": 6.901408450704225e-06, + "loss": 1.2008, + "step": 3675 + }, + { + "epoch": 0.21, + "grad_norm": 33.18895419124601, + "learning_rate": 6.910798122065729e-06, + "loss": 1.2165, + "step": 3680 + }, + { + "epoch": 0.21, + "grad_norm": 28.62316229900056, + "learning_rate": 6.920187793427231e-06, + "loss": 1.2534, + "step": 3685 + }, + { + "epoch": 0.21, + "grad_norm": 54.32211781057552, + "learning_rate": 6.929577464788733e-06, + "loss": 1.2292, + "step": 3690 + }, + { + "epoch": 0.21, + "grad_norm": 56.82054736290158, + "learning_rate": 6.938967136150235e-06, + "loss": 1.2068, + "step": 3695 + }, + { + "epoch": 0.21, + "grad_norm": 13.029008545931667, + "learning_rate": 6.948356807511738e-06, + "loss": 1.2286, + "step": 3700 + }, + { + "epoch": 0.21, + "grad_norm": 14.402768156419866, + "learning_rate": 6.95774647887324e-06, + "loss": 1.2406, + "step": 3705 + }, + { + "epoch": 0.21, + "grad_norm": 16.536206448928514, + "learning_rate": 6.967136150234742e-06, + "loss": 1.1751, + "step": 3710 + }, + { + "epoch": 0.21, + "grad_norm": 10.748296040716767, + "learning_rate": 6.976525821596245e-06, + "loss": 1.2088, + "step": 3715 + }, + { + "epoch": 0.21, + "grad_norm": 8.62386252533626, + "learning_rate": 6.985915492957746e-06, + "loss": 1.2141, + "step": 3720 + }, + { + "epoch": 0.21, + "grad_norm": 30.942116402328708, + "learning_rate": 6.9953051643192495e-06, + "loss": 1.2274, + "step": 3725 + }, + { + "epoch": 0.21, + "grad_norm": 40.26536107697977, + "learning_rate": 7.004694835680752e-06, + "loss": 1.2088, + "step": 3730 + }, + { + "epoch": 0.21, + "grad_norm": 11.619462153605463, + "learning_rate": 7.0140845070422535e-06, + "loss": 1.1983, + "step": 3735 + }, + { + "epoch": 0.21, + "grad_norm": 20.964427584238834, + "learning_rate": 7.023474178403756e-06, + "loss": 1.238, + "step": 3740 + }, + { + "epoch": 0.21, + "grad_norm": 19.65296109542318, + "learning_rate": 7.032863849765259e-06, + "loss": 1.1932, + "step": 3745 + }, + { + "epoch": 0.21, + "grad_norm": 17.22501001260899, + "learning_rate": 7.042253521126761e-06, + "loss": 1.1857, + "step": 3750 + }, + { + "epoch": 0.21, + "grad_norm": 12.360921962093933, + "learning_rate": 7.051643192488263e-06, + "loss": 1.1414, + "step": 3755 + }, + { + "epoch": 0.21, + "grad_norm": 11.304445795952342, + "learning_rate": 7.0610328638497664e-06, + "loss": 1.2094, + "step": 3760 + }, + { + "epoch": 0.21, + "grad_norm": 13.736799655817833, + "learning_rate": 7.070422535211268e-06, + "loss": 1.1651, + "step": 3765 + }, + { + "epoch": 0.21, + "grad_norm": 15.340443257491685, + "learning_rate": 7.0798122065727704e-06, + "loss": 1.2106, + "step": 3770 + }, + { + "epoch": 0.21, + "grad_norm": 57.75088553318209, + "learning_rate": 7.089201877934273e-06, + "loss": 1.2133, + "step": 3775 + }, + { + "epoch": 0.21, + "grad_norm": 18.102570078350915, + "learning_rate": 7.098591549295776e-06, + "loss": 1.206, + "step": 3780 + }, + { + "epoch": 0.21, + "grad_norm": 17.34966176346707, + "learning_rate": 7.107981220657278e-06, + "loss": 1.2293, + "step": 3785 + }, + { + "epoch": 0.21, + "grad_norm": 8.013929238073546, + "learning_rate": 7.11737089201878e-06, + "loss": 1.244, + "step": 3790 + }, + { + "epoch": 0.21, + "grad_norm": 14.017428219682708, + "learning_rate": 7.1267605633802825e-06, + "loss": 1.2131, + "step": 3795 + }, + { + "epoch": 0.21, + "grad_norm": 38.2059808102528, + "learning_rate": 7.136150234741784e-06, + "loss": 1.1732, + "step": 3800 + }, + { + "epoch": 0.21, + "grad_norm": 18.18166957051223, + "learning_rate": 7.145539906103287e-06, + "loss": 1.1776, + "step": 3805 + }, + { + "epoch": 0.21, + "grad_norm": 7.630825940208817, + "learning_rate": 7.15492957746479e-06, + "loss": 1.2183, + "step": 3810 + }, + { + "epoch": 0.21, + "grad_norm": 9.973979962381074, + "learning_rate": 7.164319248826291e-06, + "loss": 1.2085, + "step": 3815 + }, + { + "epoch": 0.22, + "grad_norm": 18.371477416055765, + "learning_rate": 7.173708920187794e-06, + "loss": 1.2123, + "step": 3820 + }, + { + "epoch": 0.22, + "grad_norm": 22.95078187102011, + "learning_rate": 7.183098591549297e-06, + "loss": 1.2317, + "step": 3825 + }, + { + "epoch": 0.22, + "grad_norm": 19.633831261862134, + "learning_rate": 7.192488262910799e-06, + "loss": 1.1827, + "step": 3830 + }, + { + "epoch": 0.22, + "grad_norm": 34.9900080578578, + "learning_rate": 7.201877934272301e-06, + "loss": 1.1642, + "step": 3835 + }, + { + "epoch": 0.22, + "grad_norm": 31.593510530964537, + "learning_rate": 7.211267605633803e-06, + "loss": 1.1815, + "step": 3840 + }, + { + "epoch": 0.22, + "grad_norm": 13.554527659078145, + "learning_rate": 7.220657276995305e-06, + "loss": 1.2266, + "step": 3845 + }, + { + "epoch": 0.22, + "grad_norm": 27.523021720743884, + "learning_rate": 7.230046948356808e-06, + "loss": 1.195, + "step": 3850 + }, + { + "epoch": 0.22, + "grad_norm": 84.84982009791756, + "learning_rate": 7.239436619718311e-06, + "loss": 1.2443, + "step": 3855 + }, + { + "epoch": 0.22, + "grad_norm": 14.432189680659697, + "learning_rate": 7.248826291079812e-06, + "loss": 1.165, + "step": 3860 + }, + { + "epoch": 0.22, + "grad_norm": 10.854220309725344, + "learning_rate": 7.2582159624413155e-06, + "loss": 1.2511, + "step": 3865 + }, + { + "epoch": 0.22, + "grad_norm": 31.319517601510057, + "learning_rate": 7.267605633802818e-06, + "loss": 1.2311, + "step": 3870 + }, + { + "epoch": 0.22, + "grad_norm": 11.750601797395683, + "learning_rate": 7.2769953051643195e-06, + "loss": 1.2083, + "step": 3875 + }, + { + "epoch": 0.22, + "grad_norm": 9.113609084691047, + "learning_rate": 7.286384976525822e-06, + "loss": 1.1807, + "step": 3880 + }, + { + "epoch": 0.22, + "grad_norm": 76.31055625879928, + "learning_rate": 7.295774647887325e-06, + "loss": 1.22, + "step": 3885 + }, + { + "epoch": 0.22, + "grad_norm": 46.4921319101062, + "learning_rate": 7.305164319248827e-06, + "loss": 1.2519, + "step": 3890 + }, + { + "epoch": 0.22, + "grad_norm": 34.476256756362176, + "learning_rate": 7.314553990610329e-06, + "loss": 1.2219, + "step": 3895 + }, + { + "epoch": 0.22, + "grad_norm": 47.99845942276182, + "learning_rate": 7.3239436619718316e-06, + "loss": 1.1994, + "step": 3900 + }, + { + "epoch": 0.22, + "grad_norm": 29.45466850995247, + "learning_rate": 7.333333333333333e-06, + "loss": 1.179, + "step": 3905 + }, + { + "epoch": 0.22, + "grad_norm": 13.558244967085193, + "learning_rate": 7.342723004694836e-06, + "loss": 1.1744, + "step": 3910 + }, + { + "epoch": 0.22, + "grad_norm": 14.34414112113531, + "learning_rate": 7.352112676056339e-06, + "loss": 1.1872, + "step": 3915 + }, + { + "epoch": 0.22, + "grad_norm": 16.416129737879526, + "learning_rate": 7.361502347417841e-06, + "loss": 1.1791, + "step": 3920 + }, + { + "epoch": 0.22, + "grad_norm": 10.076925129853986, + "learning_rate": 7.370892018779343e-06, + "loss": 1.2536, + "step": 3925 + }, + { + "epoch": 0.22, + "grad_norm": 16.459658344319436, + "learning_rate": 7.380281690140846e-06, + "loss": 1.1564, + "step": 3930 + }, + { + "epoch": 0.22, + "grad_norm": 12.418908912329496, + "learning_rate": 7.3896713615023485e-06, + "loss": 1.2002, + "step": 3935 + }, + { + "epoch": 0.22, + "grad_norm": 9.350250261591546, + "learning_rate": 7.39906103286385e-06, + "loss": 1.2101, + "step": 3940 + }, + { + "epoch": 0.22, + "grad_norm": 12.054635065283787, + "learning_rate": 7.4084507042253525e-06, + "loss": 1.18, + "step": 3945 + }, + { + "epoch": 0.22, + "grad_norm": 15.66569204829225, + "learning_rate": 7.417840375586856e-06, + "loss": 1.1737, + "step": 3950 + }, + { + "epoch": 0.22, + "grad_norm": 12.125203143270163, + "learning_rate": 7.427230046948357e-06, + "loss": 1.2223, + "step": 3955 + }, + { + "epoch": 0.22, + "grad_norm": 23.73720907789374, + "learning_rate": 7.43661971830986e-06, + "loss": 1.161, + "step": 3960 + }, + { + "epoch": 0.22, + "grad_norm": 13.332365920244538, + "learning_rate": 7.446009389671362e-06, + "loss": 1.2081, + "step": 3965 + }, + { + "epoch": 0.22, + "grad_norm": 41.037071426492446, + "learning_rate": 7.4553990610328646e-06, + "loss": 1.1583, + "step": 3970 + }, + { + "epoch": 0.22, + "grad_norm": 31.54174964649289, + "learning_rate": 7.464788732394367e-06, + "loss": 1.1936, + "step": 3975 + }, + { + "epoch": 0.22, + "grad_norm": 13.638670956253598, + "learning_rate": 7.474178403755869e-06, + "loss": 1.1246, + "step": 3980 + }, + { + "epoch": 0.22, + "grad_norm": 47.33858300112234, + "learning_rate": 7.483568075117371e-06, + "loss": 1.2131, + "step": 3985 + }, + { + "epoch": 0.22, + "grad_norm": 8.125222564039044, + "learning_rate": 7.492957746478874e-06, + "loss": 1.2149, + "step": 3990 + }, + { + "epoch": 0.23, + "grad_norm": 12.060015672819873, + "learning_rate": 7.502347417840377e-06, + "loss": 1.1817, + "step": 3995 + }, + { + "epoch": 0.23, + "grad_norm": 7.717886632697339, + "learning_rate": 7.511737089201878e-06, + "loss": 1.1792, + "step": 4000 + }, + { + "epoch": 0.23, + "grad_norm": 22.735666868633807, + "learning_rate": 7.521126760563381e-06, + "loss": 1.2003, + "step": 4005 + }, + { + "epoch": 0.23, + "grad_norm": 12.679319560312683, + "learning_rate": 7.530516431924884e-06, + "loss": 1.1944, + "step": 4010 + }, + { + "epoch": 0.23, + "grad_norm": 14.321591404409501, + "learning_rate": 7.5399061032863855e-06, + "loss": 1.1637, + "step": 4015 + }, + { + "epoch": 0.23, + "grad_norm": 10.569842640812825, + "learning_rate": 7.549295774647888e-06, + "loss": 1.1933, + "step": 4020 + }, + { + "epoch": 0.23, + "grad_norm": 8.966539846253095, + "learning_rate": 7.55868544600939e-06, + "loss": 1.1931, + "step": 4025 + }, + { + "epoch": 0.23, + "grad_norm": 8.298177602548586, + "learning_rate": 7.568075117370892e-06, + "loss": 1.2147, + "step": 4030 + }, + { + "epoch": 0.23, + "grad_norm": 11.695716274478213, + "learning_rate": 7.577464788732395e-06, + "loss": 1.1568, + "step": 4035 + }, + { + "epoch": 0.23, + "grad_norm": 8.76956762071812, + "learning_rate": 7.5868544600938975e-06, + "loss": 1.2203, + "step": 4040 + }, + { + "epoch": 0.23, + "grad_norm": 11.74207568799746, + "learning_rate": 7.596244131455399e-06, + "loss": 1.1174, + "step": 4045 + }, + { + "epoch": 0.23, + "grad_norm": 13.746951692900796, + "learning_rate": 7.6056338028169015e-06, + "loss": 1.2095, + "step": 4050 + }, + { + "epoch": 0.23, + "grad_norm": 25.823990537237783, + "learning_rate": 7.615023474178405e-06, + "loss": 1.1828, + "step": 4055 + }, + { + "epoch": 0.23, + "grad_norm": 9.522634108230067, + "learning_rate": 7.624413145539906e-06, + "loss": 1.219, + "step": 4060 + }, + { + "epoch": 0.23, + "grad_norm": 8.862682594839754, + "learning_rate": 7.633802816901409e-06, + "loss": 1.2119, + "step": 4065 + }, + { + "epoch": 0.23, + "grad_norm": 37.50414447404458, + "learning_rate": 7.643192488262911e-06, + "loss": 1.1571, + "step": 4070 + }, + { + "epoch": 0.23, + "grad_norm": 29.026688227016276, + "learning_rate": 7.652582159624414e-06, + "loss": 1.1998, + "step": 4075 + }, + { + "epoch": 0.23, + "grad_norm": 18.831027066123458, + "learning_rate": 7.661971830985916e-06, + "loss": 1.2157, + "step": 4080 + }, + { + "epoch": 0.23, + "grad_norm": 8.160512019870453, + "learning_rate": 7.671361502347418e-06, + "loss": 1.1936, + "step": 4085 + }, + { + "epoch": 0.23, + "grad_norm": 11.509104337919787, + "learning_rate": 7.680751173708921e-06, + "loss": 1.2712, + "step": 4090 + }, + { + "epoch": 0.23, + "grad_norm": 14.264105952348327, + "learning_rate": 7.690140845070423e-06, + "loss": 1.2473, + "step": 4095 + }, + { + "epoch": 0.23, + "grad_norm": 15.324056959006983, + "learning_rate": 7.699530516431926e-06, + "loss": 1.23, + "step": 4100 + }, + { + "epoch": 0.23, + "grad_norm": 18.00961819776812, + "learning_rate": 7.708920187793428e-06, + "loss": 1.1555, + "step": 4105 + }, + { + "epoch": 0.23, + "grad_norm": 8.830030393314683, + "learning_rate": 7.71830985915493e-06, + "loss": 1.1994, + "step": 4110 + }, + { + "epoch": 0.23, + "grad_norm": 24.400722597596747, + "learning_rate": 7.727699530516433e-06, + "loss": 1.2221, + "step": 4115 + }, + { + "epoch": 0.23, + "grad_norm": 13.517134784716312, + "learning_rate": 7.737089201877935e-06, + "loss": 1.2175, + "step": 4120 + }, + { + "epoch": 0.23, + "grad_norm": 12.65491023895828, + "learning_rate": 7.746478873239436e-06, + "loss": 1.1672, + "step": 4125 + }, + { + "epoch": 0.23, + "grad_norm": 7.7455423499664615, + "learning_rate": 7.75586854460094e-06, + "loss": 1.1662, + "step": 4130 + }, + { + "epoch": 0.23, + "grad_norm": 7.862721002479682, + "learning_rate": 7.765258215962443e-06, + "loss": 1.203, + "step": 4135 + }, + { + "epoch": 0.23, + "grad_norm": 21.376653945251896, + "learning_rate": 7.774647887323943e-06, + "loss": 1.254, + "step": 4140 + }, + { + "epoch": 0.23, + "grad_norm": 16.882981877205165, + "learning_rate": 7.784037558685447e-06, + "loss": 1.1537, + "step": 4145 + }, + { + "epoch": 0.23, + "grad_norm": 33.72581253134319, + "learning_rate": 7.79342723004695e-06, + "loss": 1.2069, + "step": 4150 + }, + { + "epoch": 0.23, + "grad_norm": 16.53421172681963, + "learning_rate": 7.80281690140845e-06, + "loss": 1.1842, + "step": 4155 + }, + { + "epoch": 0.23, + "grad_norm": 28.955238680225012, + "learning_rate": 7.812206572769953e-06, + "loss": 1.1971, + "step": 4160 + }, + { + "epoch": 0.23, + "grad_norm": 53.484123103401174, + "learning_rate": 7.821596244131457e-06, + "loss": 1.19, + "step": 4165 + }, + { + "epoch": 0.23, + "grad_norm": 56.311802418144374, + "learning_rate": 7.830985915492958e-06, + "loss": 1.2309, + "step": 4170 + }, + { + "epoch": 0.24, + "grad_norm": 54.745201532177774, + "learning_rate": 7.84037558685446e-06, + "loss": 1.2278, + "step": 4175 + }, + { + "epoch": 0.24, + "grad_norm": 12.370907547871159, + "learning_rate": 7.849765258215963e-06, + "loss": 1.2722, + "step": 4180 + }, + { + "epoch": 0.24, + "grad_norm": 46.65810052536865, + "learning_rate": 7.859154929577465e-06, + "loss": 1.2049, + "step": 4185 + }, + { + "epoch": 0.24, + "grad_norm": 31.81831041104849, + "learning_rate": 7.868544600938967e-06, + "loss": 1.2209, + "step": 4190 + }, + { + "epoch": 0.24, + "grad_norm": 32.47601424145794, + "learning_rate": 7.87793427230047e-06, + "loss": 1.2545, + "step": 4195 + }, + { + "epoch": 0.24, + "grad_norm": 31.787518737832805, + "learning_rate": 7.887323943661972e-06, + "loss": 1.2616, + "step": 4200 + }, + { + "epoch": 0.24, + "grad_norm": 26.705372881127936, + "learning_rate": 7.896713615023475e-06, + "loss": 1.1764, + "step": 4205 + }, + { + "epoch": 0.24, + "grad_norm": 42.69334237596915, + "learning_rate": 7.906103286384977e-06, + "loss": 1.1996, + "step": 4210 + }, + { + "epoch": 0.24, + "grad_norm": 27.09679778149449, + "learning_rate": 7.91549295774648e-06, + "loss": 1.2149, + "step": 4215 + }, + { + "epoch": 0.24, + "grad_norm": 32.73831121920157, + "learning_rate": 7.924882629107982e-06, + "loss": 1.2362, + "step": 4220 + }, + { + "epoch": 0.24, + "grad_norm": 21.04835758853057, + "learning_rate": 7.934272300469484e-06, + "loss": 1.194, + "step": 4225 + }, + { + "epoch": 0.24, + "grad_norm": 22.251811150037636, + "learning_rate": 7.943661971830987e-06, + "loss": 1.1803, + "step": 4230 + }, + { + "epoch": 0.24, + "grad_norm": 24.281136896647453, + "learning_rate": 7.95305164319249e-06, + "loss": 1.1946, + "step": 4235 + }, + { + "epoch": 0.24, + "grad_norm": 10.126753471815968, + "learning_rate": 7.962441314553992e-06, + "loss": 1.2082, + "step": 4240 + }, + { + "epoch": 0.24, + "grad_norm": 28.520843363187964, + "learning_rate": 7.971830985915494e-06, + "loss": 1.2242, + "step": 4245 + }, + { + "epoch": 0.24, + "grad_norm": 16.517790290601727, + "learning_rate": 7.981220657276996e-06, + "loss": 1.2475, + "step": 4250 + }, + { + "epoch": 0.24, + "grad_norm": 26.652401187780274, + "learning_rate": 7.990610328638499e-06, + "loss": 1.2229, + "step": 4255 + }, + { + "epoch": 0.24, + "grad_norm": 13.409431570584443, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2134, + "step": 4260 + }, + { + "epoch": 0.24, + "grad_norm": 18.12748565038928, + "learning_rate": 8.009389671361502e-06, + "loss": 1.2264, + "step": 4265 + }, + { + "epoch": 0.24, + "grad_norm": 9.312110984657279, + "learning_rate": 8.018779342723006e-06, + "loss": 1.182, + "step": 4270 + }, + { + "epoch": 0.24, + "grad_norm": 52.16608985314976, + "learning_rate": 8.028169014084509e-06, + "loss": 1.1355, + "step": 4275 + }, + { + "epoch": 0.24, + "grad_norm": 17.120160867118052, + "learning_rate": 8.03755868544601e-06, + "loss": 1.2416, + "step": 4280 + }, + { + "epoch": 0.24, + "grad_norm": 13.830831717537642, + "learning_rate": 8.046948356807512e-06, + "loss": 1.225, + "step": 4285 + }, + { + "epoch": 0.24, + "grad_norm": 43.23822029068536, + "learning_rate": 8.056338028169016e-06, + "loss": 1.2252, + "step": 4290 + }, + { + "epoch": 0.24, + "grad_norm": 11.278674301492558, + "learning_rate": 8.065727699530517e-06, + "loss": 1.2109, + "step": 4295 + }, + { + "epoch": 0.24, + "grad_norm": 12.288013498220494, + "learning_rate": 8.075117370892019e-06, + "loss": 1.1789, + "step": 4300 + }, + { + "epoch": 0.24, + "grad_norm": 12.26449336507538, + "learning_rate": 8.084507042253521e-06, + "loss": 1.2125, + "step": 4305 + }, + { + "epoch": 0.24, + "grad_norm": 28.22555996148371, + "learning_rate": 8.093896713615024e-06, + "loss": 1.2431, + "step": 4310 + }, + { + "epoch": 0.24, + "grad_norm": 16.867982626962476, + "learning_rate": 8.103286384976526e-06, + "loss": 1.2525, + "step": 4315 + }, + { + "epoch": 0.24, + "grad_norm": 10.028491223313324, + "learning_rate": 8.112676056338029e-06, + "loss": 1.1883, + "step": 4320 + }, + { + "epoch": 0.24, + "grad_norm": 10.2026817833496, + "learning_rate": 8.122065727699531e-06, + "loss": 1.1986, + "step": 4325 + }, + { + "epoch": 0.24, + "grad_norm": 27.554321300631596, + "learning_rate": 8.131455399061033e-06, + "loss": 1.1934, + "step": 4330 + }, + { + "epoch": 0.24, + "grad_norm": 13.581434368028852, + "learning_rate": 8.140845070422536e-06, + "loss": 1.2165, + "step": 4335 + }, + { + "epoch": 0.24, + "grad_norm": 18.80090735090302, + "learning_rate": 8.150234741784038e-06, + "loss": 1.1882, + "step": 4340 + }, + { + "epoch": 0.24, + "grad_norm": 18.10623653337208, + "learning_rate": 8.15962441314554e-06, + "loss": 1.2051, + "step": 4345 + }, + { + "epoch": 0.25, + "grad_norm": 36.58103521997585, + "learning_rate": 8.169014084507043e-06, + "loss": 1.165, + "step": 4350 + }, + { + "epoch": 0.25, + "grad_norm": 20.14362401499682, + "learning_rate": 8.178403755868546e-06, + "loss": 1.1963, + "step": 4355 + }, + { + "epoch": 0.25, + "grad_norm": 23.915175786758955, + "learning_rate": 8.187793427230048e-06, + "loss": 1.1793, + "step": 4360 + }, + { + "epoch": 0.25, + "grad_norm": 83.0767168465954, + "learning_rate": 8.19718309859155e-06, + "loss": 1.217, + "step": 4365 + }, + { + "epoch": 0.25, + "grad_norm": 60.3543840448068, + "learning_rate": 8.206572769953053e-06, + "loss": 1.2637, + "step": 4370 + }, + { + "epoch": 0.25, + "grad_norm": 48.38956559363697, + "learning_rate": 8.215962441314555e-06, + "loss": 1.1145, + "step": 4375 + }, + { + "epoch": 0.25, + "grad_norm": 12.06381918741665, + "learning_rate": 8.225352112676058e-06, + "loss": 1.2502, + "step": 4380 + }, + { + "epoch": 0.25, + "grad_norm": 17.613602776471925, + "learning_rate": 8.23474178403756e-06, + "loss": 1.253, + "step": 4385 + }, + { + "epoch": 0.25, + "grad_norm": 67.98537964903069, + "learning_rate": 8.24413145539906e-06, + "loss": 1.2557, + "step": 4390 + }, + { + "epoch": 0.25, + "grad_norm": 44.88995190776058, + "learning_rate": 8.253521126760565e-06, + "loss": 1.2989, + "step": 4395 + }, + { + "epoch": 0.25, + "grad_norm": 28.976224526658328, + "learning_rate": 8.262910798122067e-06, + "loss": 1.1992, + "step": 4400 + }, + { + "epoch": 0.25, + "grad_norm": 11.817326532997889, + "learning_rate": 8.272300469483568e-06, + "loss": 1.1621, + "step": 4405 + }, + { + "epoch": 0.25, + "grad_norm": 15.74428012005997, + "learning_rate": 8.28169014084507e-06, + "loss": 1.1887, + "step": 4410 + }, + { + "epoch": 0.25, + "grad_norm": 10.572505703699525, + "learning_rate": 8.291079812206575e-06, + "loss": 1.2101, + "step": 4415 + }, + { + "epoch": 0.25, + "grad_norm": 25.415921386403607, + "learning_rate": 8.300469483568075e-06, + "loss": 1.2156, + "step": 4420 + }, + { + "epoch": 0.25, + "grad_norm": 21.61583806645396, + "learning_rate": 8.309859154929578e-06, + "loss": 1.1939, + "step": 4425 + }, + { + "epoch": 0.25, + "grad_norm": 19.863954359581307, + "learning_rate": 8.31924882629108e-06, + "loss": 1.193, + "step": 4430 + }, + { + "epoch": 0.25, + "grad_norm": 38.64725509136549, + "learning_rate": 8.328638497652583e-06, + "loss": 1.2823, + "step": 4435 + }, + { + "epoch": 0.25, + "grad_norm": 21.766446850354708, + "learning_rate": 8.338028169014085e-06, + "loss": 1.1997, + "step": 4440 + }, + { + "epoch": 0.25, + "grad_norm": 49.93259454328654, + "learning_rate": 8.347417840375587e-06, + "loss": 1.2411, + "step": 4445 + }, + { + "epoch": 0.25, + "grad_norm": 58.99810320942265, + "learning_rate": 8.35680751173709e-06, + "loss": 1.2283, + "step": 4450 + }, + { + "epoch": 0.25, + "grad_norm": 10.671889593603145, + "learning_rate": 8.366197183098592e-06, + "loss": 1.2491, + "step": 4455 + }, + { + "epoch": 0.25, + "grad_norm": 8.782965260071162, + "learning_rate": 8.375586854460095e-06, + "loss": 1.18, + "step": 4460 + }, + { + "epoch": 0.25, + "grad_norm": 21.14152789516212, + "learning_rate": 8.384976525821597e-06, + "loss": 1.2081, + "step": 4465 + }, + { + "epoch": 0.25, + "grad_norm": 15.434041552947308, + "learning_rate": 8.3943661971831e-06, + "loss": 1.1511, + "step": 4470 + }, + { + "epoch": 0.25, + "grad_norm": 11.056764649773962, + "learning_rate": 8.403755868544602e-06, + "loss": 1.2038, + "step": 4475 + }, + { + "epoch": 0.25, + "grad_norm": 8.02810844754425, + "learning_rate": 8.413145539906104e-06, + "loss": 1.2548, + "step": 4480 + }, + { + "epoch": 0.25, + "grad_norm": 24.719945686128177, + "learning_rate": 8.422535211267607e-06, + "loss": 1.2403, + "step": 4485 + }, + { + "epoch": 0.25, + "grad_norm": 46.08082388148565, + "learning_rate": 8.431924882629109e-06, + "loss": 1.2058, + "step": 4490 + }, + { + "epoch": 0.25, + "grad_norm": 26.814904159754715, + "learning_rate": 8.44131455399061e-06, + "loss": 1.1815, + "step": 4495 + }, + { + "epoch": 0.25, + "grad_norm": 35.51873251767294, + "learning_rate": 8.450704225352114e-06, + "loss": 1.1935, + "step": 4500 + }, + { + "epoch": 0.25, + "grad_norm": 18.390534518678628, + "learning_rate": 8.460093896713616e-06, + "loss": 1.261, + "step": 4505 + }, + { + "epoch": 0.25, + "grad_norm": 19.526978010888968, + "learning_rate": 8.469483568075117e-06, + "loss": 1.1904, + "step": 4510 + }, + { + "epoch": 0.25, + "grad_norm": 7.487866155998266, + "learning_rate": 8.47887323943662e-06, + "loss": 1.2252, + "step": 4515 + }, + { + "epoch": 0.25, + "grad_norm": 28.22177824337065, + "learning_rate": 8.488262910798124e-06, + "loss": 1.2275, + "step": 4520 + }, + { + "epoch": 0.25, + "grad_norm": 22.611165472988805, + "learning_rate": 8.497652582159626e-06, + "loss": 1.2108, + "step": 4525 + }, + { + "epoch": 0.26, + "grad_norm": 8.378918799576965, + "learning_rate": 8.507042253521127e-06, + "loss": 1.2636, + "step": 4530 + }, + { + "epoch": 0.26, + "grad_norm": 10.497233258422098, + "learning_rate": 8.51643192488263e-06, + "loss": 1.1891, + "step": 4535 + }, + { + "epoch": 0.26, + "grad_norm": 22.49226598940317, + "learning_rate": 8.525821596244133e-06, + "loss": 1.1832, + "step": 4540 + }, + { + "epoch": 0.26, + "grad_norm": 19.168432021952608, + "learning_rate": 8.535211267605634e-06, + "loss": 1.2072, + "step": 4545 + }, + { + "epoch": 0.26, + "grad_norm": 26.166076586005975, + "learning_rate": 8.544600938967136e-06, + "loss": 1.2226, + "step": 4550 + }, + { + "epoch": 0.26, + "grad_norm": 32.03204570802435, + "learning_rate": 8.553990610328639e-06, + "loss": 1.1851, + "step": 4555 + }, + { + "epoch": 0.26, + "grad_norm": 12.092581131426625, + "learning_rate": 8.563380281690141e-06, + "loss": 1.1864, + "step": 4560 + }, + { + "epoch": 0.26, + "grad_norm": 13.669659448564795, + "learning_rate": 8.572769953051644e-06, + "loss": 1.2309, + "step": 4565 + }, + { + "epoch": 0.26, + "grad_norm": 19.334502010788857, + "learning_rate": 8.582159624413146e-06, + "loss": 1.2154, + "step": 4570 + }, + { + "epoch": 0.26, + "grad_norm": 13.563671018584959, + "learning_rate": 8.591549295774648e-06, + "loss": 1.2385, + "step": 4575 + }, + { + "epoch": 0.26, + "grad_norm": 13.915338118752953, + "learning_rate": 8.600938967136151e-06, + "loss": 1.1964, + "step": 4580 + }, + { + "epoch": 0.26, + "grad_norm": 26.404055809630883, + "learning_rate": 8.610328638497653e-06, + "loss": 1.1839, + "step": 4585 + }, + { + "epoch": 0.26, + "grad_norm": 11.222126968100023, + "learning_rate": 8.619718309859156e-06, + "loss": 1.2581, + "step": 4590 + }, + { + "epoch": 0.26, + "grad_norm": 9.35747409325916, + "learning_rate": 8.629107981220658e-06, + "loss": 1.2179, + "step": 4595 + }, + { + "epoch": 0.26, + "grad_norm": 11.990756284594863, + "learning_rate": 8.63849765258216e-06, + "loss": 1.2231, + "step": 4600 + }, + { + "epoch": 0.26, + "grad_norm": 13.257116674804113, + "learning_rate": 8.647887323943663e-06, + "loss": 1.1873, + "step": 4605 + }, + { + "epoch": 0.26, + "grad_norm": 28.933897560856717, + "learning_rate": 8.657276995305165e-06, + "loss": 1.2032, + "step": 4610 + }, + { + "epoch": 0.26, + "grad_norm": 10.215272714102335, + "learning_rate": 8.666666666666668e-06, + "loss": 1.2329, + "step": 4615 + }, + { + "epoch": 0.26, + "grad_norm": 11.913604246367948, + "learning_rate": 8.676056338028169e-06, + "loss": 1.1643, + "step": 4620 + }, + { + "epoch": 0.26, + "grad_norm": 18.192712631352123, + "learning_rate": 8.685446009389673e-06, + "loss": 1.206, + "step": 4625 + }, + { + "epoch": 0.26, + "grad_norm": 20.39635890815786, + "learning_rate": 8.694835680751175e-06, + "loss": 1.2078, + "step": 4630 + }, + { + "epoch": 0.26, + "grad_norm": 10.542066863681356, + "learning_rate": 8.704225352112676e-06, + "loss": 1.1961, + "step": 4635 + }, + { + "epoch": 0.26, + "grad_norm": 13.022672362985384, + "learning_rate": 8.713615023474178e-06, + "loss": 1.1985, + "step": 4640 + }, + { + "epoch": 0.26, + "grad_norm": 16.92321634589073, + "learning_rate": 8.723004694835682e-06, + "loss": 1.2378, + "step": 4645 + }, + { + "epoch": 0.26, + "grad_norm": 13.357184906029232, + "learning_rate": 8.732394366197183e-06, + "loss": 1.2266, + "step": 4650 + }, + { + "epoch": 0.26, + "grad_norm": 8.056095828904821, + "learning_rate": 8.741784037558685e-06, + "loss": 1.1735, + "step": 4655 + }, + { + "epoch": 0.26, + "grad_norm": 12.816915017876703, + "learning_rate": 8.751173708920188e-06, + "loss": 1.1604, + "step": 4660 + }, + { + "epoch": 0.26, + "grad_norm": 12.612070891630275, + "learning_rate": 8.760563380281692e-06, + "loss": 1.2305, + "step": 4665 + }, + { + "epoch": 0.26, + "grad_norm": 10.759724594659053, + "learning_rate": 8.769953051643193e-06, + "loss": 1.1982, + "step": 4670 + }, + { + "epoch": 0.26, + "grad_norm": 6.514505344613759, + "learning_rate": 8.779342723004695e-06, + "loss": 1.1367, + "step": 4675 + }, + { + "epoch": 0.26, + "grad_norm": 16.03064258087366, + "learning_rate": 8.7887323943662e-06, + "loss": 1.2106, + "step": 4680 + }, + { + "epoch": 0.26, + "grad_norm": 22.814805988832852, + "learning_rate": 8.7981220657277e-06, + "loss": 1.1732, + "step": 4685 + }, + { + "epoch": 0.26, + "grad_norm": 12.906934924379284, + "learning_rate": 8.807511737089202e-06, + "loss": 1.2103, + "step": 4690 + }, + { + "epoch": 0.26, + "grad_norm": 25.346428870841898, + "learning_rate": 8.816901408450705e-06, + "loss": 1.2158, + "step": 4695 + }, + { + "epoch": 0.26, + "grad_norm": 93.89877917994153, + "learning_rate": 8.826291079812207e-06, + "loss": 1.2685, + "step": 4700 + }, + { + "epoch": 0.27, + "grad_norm": 37.7336374532386, + "learning_rate": 8.83568075117371e-06, + "loss": 1.2076, + "step": 4705 + }, + { + "epoch": 0.27, + "grad_norm": 21.686061239154387, + "learning_rate": 8.845070422535212e-06, + "loss": 1.1559, + "step": 4710 + }, + { + "epoch": 0.27, + "grad_norm": 7.26863873637512, + "learning_rate": 8.854460093896714e-06, + "loss": 1.262, + "step": 4715 + }, + { + "epoch": 0.27, + "grad_norm": 17.399365209935816, + "learning_rate": 8.863849765258217e-06, + "loss": 1.1888, + "step": 4720 + }, + { + "epoch": 0.27, + "grad_norm": 28.833717861525898, + "learning_rate": 8.87323943661972e-06, + "loss": 1.193, + "step": 4725 + }, + { + "epoch": 0.27, + "grad_norm": 12.17236565349994, + "learning_rate": 8.882629107981222e-06, + "loss": 1.228, + "step": 4730 + }, + { + "epoch": 0.27, + "grad_norm": 20.49984344740007, + "learning_rate": 8.892018779342724e-06, + "loss": 1.1805, + "step": 4735 + }, + { + "epoch": 0.27, + "grad_norm": 23.133834074648217, + "learning_rate": 8.901408450704227e-06, + "loss": 1.231, + "step": 4740 + }, + { + "epoch": 0.27, + "grad_norm": 9.390859731210462, + "learning_rate": 8.910798122065727e-06, + "loss": 1.2167, + "step": 4745 + }, + { + "epoch": 0.27, + "grad_norm": 7.52393152719337, + "learning_rate": 8.920187793427231e-06, + "loss": 1.2012, + "step": 4750 + }, + { + "epoch": 0.27, + "grad_norm": 14.63053260827425, + "learning_rate": 8.929577464788734e-06, + "loss": 1.1925, + "step": 4755 + }, + { + "epoch": 0.27, + "grad_norm": 32.73148648410042, + "learning_rate": 8.938967136150235e-06, + "loss": 1.2605, + "step": 4760 + }, + { + "epoch": 0.27, + "grad_norm": 6.894352059864662, + "learning_rate": 8.948356807511737e-06, + "loss": 1.2162, + "step": 4765 + }, + { + "epoch": 0.27, + "grad_norm": 17.552047195004757, + "learning_rate": 8.957746478873241e-06, + "loss": 1.2326, + "step": 4770 + }, + { + "epoch": 0.27, + "grad_norm": 12.151321061078818, + "learning_rate": 8.967136150234742e-06, + "loss": 1.23, + "step": 4775 + }, + { + "epoch": 0.27, + "grad_norm": 10.886873360044373, + "learning_rate": 8.976525821596244e-06, + "loss": 1.1822, + "step": 4780 + }, + { + "epoch": 0.27, + "grad_norm": 7.398381585266354, + "learning_rate": 8.985915492957748e-06, + "loss": 1.218, + "step": 4785 + }, + { + "epoch": 0.27, + "grad_norm": 9.457273127763088, + "learning_rate": 8.995305164319249e-06, + "loss": 1.1844, + "step": 4790 + }, + { + "epoch": 0.27, + "grad_norm": 9.001274637047235, + "learning_rate": 9.004694835680751e-06, + "loss": 1.2374, + "step": 4795 + }, + { + "epoch": 0.27, + "grad_norm": 7.323113614516511, + "learning_rate": 9.014084507042254e-06, + "loss": 1.1926, + "step": 4800 + }, + { + "epoch": 0.27, + "grad_norm": 9.424173138049385, + "learning_rate": 9.023474178403756e-06, + "loss": 1.1746, + "step": 4805 + }, + { + "epoch": 0.27, + "grad_norm": 7.750633924642878, + "learning_rate": 9.032863849765259e-06, + "loss": 1.1695, + "step": 4810 + }, + { + "epoch": 0.27, + "grad_norm": 13.126106427680092, + "learning_rate": 9.042253521126761e-06, + "loss": 1.2219, + "step": 4815 + }, + { + "epoch": 0.27, + "grad_norm": 13.440550076998624, + "learning_rate": 9.051643192488264e-06, + "loss": 1.197, + "step": 4820 + }, + { + "epoch": 0.27, + "grad_norm": 16.805224265781256, + "learning_rate": 9.061032863849766e-06, + "loss": 1.151, + "step": 4825 + }, + { + "epoch": 0.27, + "grad_norm": 11.839277125488668, + "learning_rate": 9.070422535211268e-06, + "loss": 1.1953, + "step": 4830 + }, + { + "epoch": 0.27, + "grad_norm": 7.487407125805589, + "learning_rate": 9.07981220657277e-06, + "loss": 1.1639, + "step": 4835 + }, + { + "epoch": 0.27, + "grad_norm": 15.147288333159524, + "learning_rate": 9.089201877934273e-06, + "loss": 1.1562, + "step": 4840 + }, + { + "epoch": 0.27, + "grad_norm": 12.170524064010127, + "learning_rate": 9.098591549295776e-06, + "loss": 1.2053, + "step": 4845 + }, + { + "epoch": 0.27, + "grad_norm": 21.159133945462376, + "learning_rate": 9.107981220657278e-06, + "loss": 1.2215, + "step": 4850 + }, + { + "epoch": 0.27, + "grad_norm": 18.317344425477746, + "learning_rate": 9.11737089201878e-06, + "loss": 1.2007, + "step": 4855 + }, + { + "epoch": 0.27, + "grad_norm": 14.153987978554724, + "learning_rate": 9.126760563380283e-06, + "loss": 1.2317, + "step": 4860 + }, + { + "epoch": 0.27, + "grad_norm": 33.34311167199715, + "learning_rate": 9.136150234741785e-06, + "loss": 1.188, + "step": 4865 + }, + { + "epoch": 0.27, + "grad_norm": 12.06773661122188, + "learning_rate": 9.145539906103286e-06, + "loss": 1.138, + "step": 4870 + }, + { + "epoch": 0.27, + "grad_norm": 53.31008122043855, + "learning_rate": 9.15492957746479e-06, + "loss": 1.223, + "step": 4875 + }, + { + "epoch": 0.27, + "grad_norm": 97.84584817125534, + "learning_rate": 9.164319248826293e-06, + "loss": 1.2707, + "step": 4880 + }, + { + "epoch": 0.28, + "grad_norm": 63.0543719233712, + "learning_rate": 9.173708920187793e-06, + "loss": 1.2382, + "step": 4885 + }, + { + "epoch": 0.28, + "grad_norm": 25.27796698046594, + "learning_rate": 9.183098591549296e-06, + "loss": 1.2052, + "step": 4890 + }, + { + "epoch": 0.28, + "grad_norm": 15.754193419573788, + "learning_rate": 9.1924882629108e-06, + "loss": 1.2351, + "step": 4895 + }, + { + "epoch": 0.28, + "grad_norm": 75.82582834495221, + "learning_rate": 9.2018779342723e-06, + "loss": 1.2637, + "step": 4900 + }, + { + "epoch": 0.28, + "grad_norm": 31.605845668635858, + "learning_rate": 9.211267605633803e-06, + "loss": 1.2442, + "step": 4905 + }, + { + "epoch": 0.28, + "grad_norm": 24.00807115841491, + "learning_rate": 9.220657276995307e-06, + "loss": 1.264, + "step": 4910 + }, + { + "epoch": 0.28, + "grad_norm": 46.746809972706465, + "learning_rate": 9.230046948356808e-06, + "loss": 1.2684, + "step": 4915 + }, + { + "epoch": 0.28, + "grad_norm": 16.930482339991464, + "learning_rate": 9.23943661971831e-06, + "loss": 1.2334, + "step": 4920 + }, + { + "epoch": 0.28, + "grad_norm": 13.376896059644336, + "learning_rate": 9.248826291079813e-06, + "loss": 1.1814, + "step": 4925 + }, + { + "epoch": 0.28, + "grad_norm": 15.253694936032222, + "learning_rate": 9.258215962441315e-06, + "loss": 1.2407, + "step": 4930 + }, + { + "epoch": 0.28, + "grad_norm": 16.872345619849714, + "learning_rate": 9.267605633802817e-06, + "loss": 1.1661, + "step": 4935 + }, + { + "epoch": 0.28, + "grad_norm": 17.877162737493244, + "learning_rate": 9.27699530516432e-06, + "loss": 1.2197, + "step": 4940 + }, + { + "epoch": 0.28, + "grad_norm": 25.009862820708754, + "learning_rate": 9.286384976525822e-06, + "loss": 1.1932, + "step": 4945 + }, + { + "epoch": 0.28, + "grad_norm": 8.191903839893259, + "learning_rate": 9.295774647887325e-06, + "loss": 1.207, + "step": 4950 + }, + { + "epoch": 0.28, + "grad_norm": 15.3620995658234, + "learning_rate": 9.305164319248827e-06, + "loss": 1.1951, + "step": 4955 + }, + { + "epoch": 0.28, + "grad_norm": 42.848416867624465, + "learning_rate": 9.31455399061033e-06, + "loss": 1.1801, + "step": 4960 + }, + { + "epoch": 0.28, + "grad_norm": 7.973034679820064, + "learning_rate": 9.323943661971832e-06, + "loss": 1.2264, + "step": 4965 + }, + { + "epoch": 0.28, + "grad_norm": 14.464394544881255, + "learning_rate": 9.333333333333334e-06, + "loss": 1.2056, + "step": 4970 + }, + { + "epoch": 0.28, + "grad_norm": 14.694987514410068, + "learning_rate": 9.342723004694837e-06, + "loss": 1.2022, + "step": 4975 + }, + { + "epoch": 0.28, + "grad_norm": 18.62508281391501, + "learning_rate": 9.35211267605634e-06, + "loss": 1.1698, + "step": 4980 + }, + { + "epoch": 0.28, + "grad_norm": 9.597224579554815, + "learning_rate": 9.361502347417842e-06, + "loss": 1.1909, + "step": 4985 + }, + { + "epoch": 0.28, + "grad_norm": 50.200478722221405, + "learning_rate": 9.370892018779344e-06, + "loss": 1.212, + "step": 4990 + }, + { + "epoch": 0.28, + "grad_norm": 23.166968312343474, + "learning_rate": 9.380281690140845e-06, + "loss": 1.2054, + "step": 4995 + }, + { + "epoch": 0.28, + "grad_norm": 27.50082609891482, + "learning_rate": 9.389671361502349e-06, + "loss": 1.2143, + "step": 5000 + }, + { + "epoch": 0.28, + "grad_norm": 13.526418712566558, + "learning_rate": 9.399061032863851e-06, + "loss": 1.2128, + "step": 5005 + }, + { + "epoch": 0.28, + "grad_norm": 18.575803570472214, + "learning_rate": 9.408450704225352e-06, + "loss": 1.2097, + "step": 5010 + }, + { + "epoch": 0.28, + "grad_norm": 22.375062956236498, + "learning_rate": 9.417840375586856e-06, + "loss": 1.1992, + "step": 5015 + }, + { + "epoch": 0.28, + "grad_norm": 43.71697187903528, + "learning_rate": 9.427230046948358e-06, + "loss": 1.2348, + "step": 5020 + }, + { + "epoch": 0.28, + "grad_norm": 20.416802698197525, + "learning_rate": 9.43661971830986e-06, + "loss": 1.2097, + "step": 5025 + }, + { + "epoch": 0.28, + "grad_norm": 17.384073176517635, + "learning_rate": 9.446009389671362e-06, + "loss": 1.2415, + "step": 5030 + }, + { + "epoch": 0.28, + "grad_norm": 9.513791754954687, + "learning_rate": 9.455399061032866e-06, + "loss": 1.2087, + "step": 5035 + }, + { + "epoch": 0.28, + "grad_norm": 37.451133021295504, + "learning_rate": 9.464788732394366e-06, + "loss": 1.2042, + "step": 5040 + }, + { + "epoch": 0.28, + "grad_norm": 25.269686324868847, + "learning_rate": 9.474178403755869e-06, + "loss": 1.2279, + "step": 5045 + }, + { + "epoch": 0.28, + "grad_norm": 11.323648410174899, + "learning_rate": 9.483568075117371e-06, + "loss": 1.2422, + "step": 5050 + }, + { + "epoch": 0.28, + "grad_norm": 12.894795533491129, + "learning_rate": 9.492957746478874e-06, + "loss": 1.1976, + "step": 5055 + }, + { + "epoch": 0.29, + "grad_norm": 10.497996133238441, + "learning_rate": 9.502347417840376e-06, + "loss": 1.1788, + "step": 5060 + }, + { + "epoch": 0.29, + "grad_norm": 24.26837565399261, + "learning_rate": 9.511737089201879e-06, + "loss": 1.1912, + "step": 5065 + }, + { + "epoch": 0.29, + "grad_norm": 16.685763569894704, + "learning_rate": 9.521126760563381e-06, + "loss": 1.2126, + "step": 5070 + }, + { + "epoch": 0.29, + "grad_norm": 9.914181612884889, + "learning_rate": 9.530516431924883e-06, + "loss": 1.2282, + "step": 5075 + }, + { + "epoch": 0.29, + "grad_norm": 34.564605352546806, + "learning_rate": 9.539906103286386e-06, + "loss": 1.1896, + "step": 5080 + }, + { + "epoch": 0.29, + "grad_norm": 15.217776506429574, + "learning_rate": 9.549295774647888e-06, + "loss": 1.2424, + "step": 5085 + }, + { + "epoch": 0.29, + "grad_norm": 10.717742410152061, + "learning_rate": 9.55868544600939e-06, + "loss": 1.2019, + "step": 5090 + }, + { + "epoch": 0.29, + "grad_norm": 13.549766784551434, + "learning_rate": 9.568075117370893e-06, + "loss": 1.2022, + "step": 5095 + }, + { + "epoch": 0.29, + "grad_norm": 15.441506824486499, + "learning_rate": 9.577464788732394e-06, + "loss": 1.2492, + "step": 5100 + }, + { + "epoch": 0.29, + "grad_norm": 20.704859865177966, + "learning_rate": 9.586854460093898e-06, + "loss": 1.2659, + "step": 5105 + }, + { + "epoch": 0.29, + "grad_norm": 10.938016303757815, + "learning_rate": 9.5962441314554e-06, + "loss": 1.2315, + "step": 5110 + }, + { + "epoch": 0.29, + "grad_norm": 9.955300844308784, + "learning_rate": 9.605633802816903e-06, + "loss": 1.1809, + "step": 5115 + }, + { + "epoch": 0.29, + "grad_norm": 10.783693029149658, + "learning_rate": 9.615023474178405e-06, + "loss": 1.2424, + "step": 5120 + }, + { + "epoch": 0.29, + "grad_norm": 9.820839527288985, + "learning_rate": 9.624413145539908e-06, + "loss": 1.1924, + "step": 5125 + }, + { + "epoch": 0.29, + "grad_norm": 7.242449432636181, + "learning_rate": 9.63380281690141e-06, + "loss": 1.1465, + "step": 5130 + }, + { + "epoch": 0.29, + "grad_norm": 25.358715380622098, + "learning_rate": 9.64319248826291e-06, + "loss": 1.2252, + "step": 5135 + }, + { + "epoch": 0.29, + "grad_norm": 16.684351273646044, + "learning_rate": 9.652582159624415e-06, + "loss": 1.1993, + "step": 5140 + }, + { + "epoch": 0.29, + "grad_norm": 10.680154433781327, + "learning_rate": 9.661971830985917e-06, + "loss": 1.2464, + "step": 5145 + }, + { + "epoch": 0.29, + "grad_norm": 10.177913733583562, + "learning_rate": 9.671361502347418e-06, + "loss": 1.2024, + "step": 5150 + }, + { + "epoch": 0.29, + "grad_norm": 17.0362502634753, + "learning_rate": 9.68075117370892e-06, + "loss": 1.1835, + "step": 5155 + }, + { + "epoch": 0.29, + "grad_norm": 26.9558842363334, + "learning_rate": 9.690140845070424e-06, + "loss": 1.1635, + "step": 5160 + }, + { + "epoch": 0.29, + "grad_norm": 18.942749649374964, + "learning_rate": 9.699530516431925e-06, + "loss": 1.2109, + "step": 5165 + }, + { + "epoch": 0.29, + "grad_norm": 11.829725606056792, + "learning_rate": 9.708920187793428e-06, + "loss": 1.1928, + "step": 5170 + }, + { + "epoch": 0.29, + "grad_norm": 7.25680118822684, + "learning_rate": 9.71830985915493e-06, + "loss": 1.2693, + "step": 5175 + }, + { + "epoch": 0.29, + "grad_norm": 24.313307859101393, + "learning_rate": 9.727699530516432e-06, + "loss": 1.1852, + "step": 5180 + }, + { + "epoch": 0.29, + "grad_norm": 23.367700122245164, + "learning_rate": 9.737089201877935e-06, + "loss": 1.2617, + "step": 5185 + }, + { + "epoch": 0.29, + "grad_norm": 11.692227606630539, + "learning_rate": 9.746478873239437e-06, + "loss": 1.1946, + "step": 5190 + }, + { + "epoch": 0.29, + "grad_norm": 9.188404391780125, + "learning_rate": 9.75586854460094e-06, + "loss": 1.1914, + "step": 5195 + }, + { + "epoch": 0.29, + "grad_norm": 6.923435624679081, + "learning_rate": 9.765258215962442e-06, + "loss": 1.2436, + "step": 5200 + }, + { + "epoch": 0.29, + "grad_norm": 13.620573114568305, + "learning_rate": 9.774647887323945e-06, + "loss": 1.2035, + "step": 5205 + }, + { + "epoch": 0.29, + "grad_norm": 15.537849595936141, + "learning_rate": 9.784037558685447e-06, + "loss": 1.1962, + "step": 5210 + }, + { + "epoch": 0.29, + "grad_norm": 19.793103112018994, + "learning_rate": 9.79342723004695e-06, + "loss": 1.1887, + "step": 5215 + }, + { + "epoch": 0.29, + "grad_norm": 12.759387206168986, + "learning_rate": 9.802816901408452e-06, + "loss": 1.2161, + "step": 5220 + }, + { + "epoch": 0.29, + "grad_norm": 33.568472541916144, + "learning_rate": 9.812206572769954e-06, + "loss": 1.2223, + "step": 5225 + }, + { + "epoch": 0.29, + "grad_norm": 32.03329398432156, + "learning_rate": 9.821596244131457e-06, + "loss": 1.1922, + "step": 5230 + }, + { + "epoch": 0.29, + "grad_norm": 7.986653092221317, + "learning_rate": 9.830985915492959e-06, + "loss": 1.1862, + "step": 5235 + }, + { + "epoch": 0.3, + "grad_norm": 34.68362616331035, + "learning_rate": 9.84037558685446e-06, + "loss": 1.1731, + "step": 5240 + }, + { + "epoch": 0.3, + "grad_norm": 8.757833240331694, + "learning_rate": 9.849765258215964e-06, + "loss": 1.2275, + "step": 5245 + }, + { + "epoch": 0.3, + "grad_norm": 9.732448948046146, + "learning_rate": 9.859154929577466e-06, + "loss": 1.2045, + "step": 5250 + }, + { + "epoch": 0.3, + "grad_norm": 27.160606468261093, + "learning_rate": 9.868544600938969e-06, + "loss": 1.2143, + "step": 5255 + }, + { + "epoch": 0.3, + "grad_norm": 32.5303111400796, + "learning_rate": 9.87793427230047e-06, + "loss": 1.2168, + "step": 5260 + }, + { + "epoch": 0.3, + "grad_norm": 9.339242928379411, + "learning_rate": 9.887323943661974e-06, + "loss": 1.2535, + "step": 5265 + }, + { + "epoch": 0.3, + "grad_norm": 38.25669067612021, + "learning_rate": 9.896713615023476e-06, + "loss": 1.1996, + "step": 5270 + }, + { + "epoch": 0.3, + "grad_norm": 29.544270638998693, + "learning_rate": 9.906103286384977e-06, + "loss": 1.2133, + "step": 5275 + }, + { + "epoch": 0.3, + "grad_norm": 32.02082441843353, + "learning_rate": 9.915492957746479e-06, + "loss": 1.2179, + "step": 5280 + }, + { + "epoch": 0.3, + "grad_norm": 53.503063665296374, + "learning_rate": 9.924882629107983e-06, + "loss": 1.218, + "step": 5285 + }, + { + "epoch": 0.3, + "grad_norm": 17.44623121392335, + "learning_rate": 9.934272300469484e-06, + "loss": 1.1835, + "step": 5290 + }, + { + "epoch": 0.3, + "grad_norm": 10.714919664090878, + "learning_rate": 9.943661971830986e-06, + "loss": 1.2108, + "step": 5295 + }, + { + "epoch": 0.3, + "grad_norm": 16.31137408950519, + "learning_rate": 9.953051643192489e-06, + "loss": 1.1877, + "step": 5300 + }, + { + "epoch": 0.3, + "grad_norm": 16.843465143372992, + "learning_rate": 9.962441314553991e-06, + "loss": 1.1946, + "step": 5305 + }, + { + "epoch": 0.3, + "grad_norm": 8.691963663826979, + "learning_rate": 9.971830985915494e-06, + "loss": 1.1819, + "step": 5310 + }, + { + "epoch": 0.3, + "grad_norm": 10.847767715476808, + "learning_rate": 9.981220657276996e-06, + "loss": 1.2201, + "step": 5315 + }, + { + "epoch": 0.3, + "grad_norm": 59.0052613075288, + "learning_rate": 9.990610328638498e-06, + "loss": 1.1814, + "step": 5320 + }, + { + "epoch": 0.3, + "grad_norm": 42.90524030242737, + "learning_rate": 1e-05, + "loss": 1.2171, + "step": 5325 + }, + { + "epoch": 0.3, + "grad_norm": 17.642478887489734, + "learning_rate": 9.999999731363967e-06, + "loss": 1.2269, + "step": 5330 + }, + { + "epoch": 0.3, + "grad_norm": 35.97618179675625, + "learning_rate": 9.999998925455896e-06, + "loss": 1.2143, + "step": 5335 + }, + { + "epoch": 0.3, + "grad_norm": 10.759564398963768, + "learning_rate": 9.999997582275873e-06, + "loss": 1.2483, + "step": 5340 + }, + { + "epoch": 0.3, + "grad_norm": 19.71902916847821, + "learning_rate": 9.999995701824045e-06, + "loss": 1.1916, + "step": 5345 + }, + { + "epoch": 0.3, + "grad_norm": 20.70424631708859, + "learning_rate": 9.99999328410061e-06, + "loss": 1.1702, + "step": 5350 + }, + { + "epoch": 0.3, + "grad_norm": 10.6804488590414, + "learning_rate": 9.99999032910583e-06, + "loss": 1.1901, + "step": 5355 + }, + { + "epoch": 0.3, + "grad_norm": 7.783159093383655, + "learning_rate": 9.999986836840025e-06, + "loss": 1.213, + "step": 5360 + }, + { + "epoch": 0.3, + "grad_norm": 11.091343304753137, + "learning_rate": 9.999982807303565e-06, + "loss": 1.1593, + "step": 5365 + }, + { + "epoch": 0.3, + "grad_norm": 8.762379753402147, + "learning_rate": 9.999978240496888e-06, + "loss": 1.1877, + "step": 5370 + }, + { + "epoch": 0.3, + "grad_norm": 10.131886057956466, + "learning_rate": 9.99997313642048e-06, + "loss": 1.17, + "step": 5375 + }, + { + "epoch": 0.3, + "grad_norm": 13.58416480541799, + "learning_rate": 9.999967495074893e-06, + "loss": 1.2139, + "step": 5380 + }, + { + "epoch": 0.3, + "grad_norm": 20.13355735589607, + "learning_rate": 9.999961316460734e-06, + "loss": 1.2132, + "step": 5385 + }, + { + "epoch": 0.3, + "grad_norm": 10.716110353909023, + "learning_rate": 9.999954600578664e-06, + "loss": 1.2026, + "step": 5390 + }, + { + "epoch": 0.3, + "grad_norm": 17.14182793444988, + "learning_rate": 9.999947347429404e-06, + "loss": 1.1717, + "step": 5395 + }, + { + "epoch": 0.3, + "grad_norm": 11.543748536284797, + "learning_rate": 9.999939557013735e-06, + "loss": 1.2499, + "step": 5400 + }, + { + "epoch": 0.3, + "grad_norm": 11.874892555428925, + "learning_rate": 9.999931229332497e-06, + "loss": 1.2488, + "step": 5405 + }, + { + "epoch": 0.3, + "grad_norm": 8.052901422161222, + "learning_rate": 9.99992236438658e-06, + "loss": 1.2287, + "step": 5410 + }, + { + "epoch": 0.31, + "grad_norm": 12.42883514807317, + "learning_rate": 9.99991296217694e-06, + "loss": 1.232, + "step": 5415 + }, + { + "epoch": 0.31, + "grad_norm": 21.873226141550873, + "learning_rate": 9.999903022704585e-06, + "loss": 1.194, + "step": 5420 + }, + { + "epoch": 0.31, + "grad_norm": 27.301087086132636, + "learning_rate": 9.999892545970584e-06, + "loss": 1.2601, + "step": 5425 + }, + { + "epoch": 0.31, + "grad_norm": 32.91165160043027, + "learning_rate": 9.999881531976061e-06, + "loss": 1.2243, + "step": 5430 + }, + { + "epoch": 0.31, + "grad_norm": 10.174456466038595, + "learning_rate": 9.999869980722203e-06, + "loss": 1.2251, + "step": 5435 + }, + { + "epoch": 0.31, + "grad_norm": 13.547114510797721, + "learning_rate": 9.99985789221025e-06, + "loss": 1.2368, + "step": 5440 + }, + { + "epoch": 0.31, + "grad_norm": 13.863205681315343, + "learning_rate": 9.9998452664415e-06, + "loss": 1.2096, + "step": 5445 + }, + { + "epoch": 0.31, + "grad_norm": 17.244074030599695, + "learning_rate": 9.99983210341731e-06, + "loss": 1.2346, + "step": 5450 + }, + { + "epoch": 0.31, + "grad_norm": 44.47088945480686, + "learning_rate": 9.999818403139095e-06, + "loss": 1.2125, + "step": 5455 + }, + { + "epoch": 0.31, + "grad_norm": 40.326348389319726, + "learning_rate": 9.999804165608324e-06, + "loss": 1.1811, + "step": 5460 + }, + { + "epoch": 0.31, + "grad_norm": 48.70966946626771, + "learning_rate": 9.999789390826533e-06, + "loss": 1.2393, + "step": 5465 + }, + { + "epoch": 0.31, + "grad_norm": 15.254712614925019, + "learning_rate": 9.999774078795305e-06, + "loss": 1.2123, + "step": 5470 + }, + { + "epoch": 0.31, + "grad_norm": 14.37011727537402, + "learning_rate": 9.999758229516285e-06, + "loss": 1.2538, + "step": 5475 + }, + { + "epoch": 0.31, + "grad_norm": 19.1875008874081, + "learning_rate": 9.99974184299118e-06, + "loss": 1.1837, + "step": 5480 + }, + { + "epoch": 0.31, + "grad_norm": 30.704990829622357, + "learning_rate": 9.999724919221748e-06, + "loss": 1.2427, + "step": 5485 + }, + { + "epoch": 0.31, + "grad_norm": 9.034428590584625, + "learning_rate": 9.999707458209806e-06, + "loss": 1.1904, + "step": 5490 + }, + { + "epoch": 0.31, + "grad_norm": 13.073346430131144, + "learning_rate": 9.999689459957235e-06, + "loss": 1.2091, + "step": 5495 + }, + { + "epoch": 0.31, + "grad_norm": 25.739096234399756, + "learning_rate": 9.999670924465965e-06, + "loss": 1.2148, + "step": 5500 + }, + { + "epoch": 0.31, + "grad_norm": 25.476229200823596, + "learning_rate": 9.999651851737988e-06, + "loss": 1.2147, + "step": 5505 + }, + { + "epoch": 0.31, + "grad_norm": 20.755577749329202, + "learning_rate": 9.999632241775356e-06, + "loss": 1.2629, + "step": 5510 + }, + { + "epoch": 0.31, + "grad_norm": 11.853478498323742, + "learning_rate": 9.999612094580173e-06, + "loss": 1.2378, + "step": 5515 + }, + { + "epoch": 0.31, + "grad_norm": 9.94071427053803, + "learning_rate": 9.999591410154607e-06, + "loss": 1.1723, + "step": 5520 + }, + { + "epoch": 0.31, + "grad_norm": 9.987992075584888, + "learning_rate": 9.999570188500878e-06, + "loss": 1.2089, + "step": 5525 + }, + { + "epoch": 0.31, + "grad_norm": 10.367646734886607, + "learning_rate": 9.999548429621268e-06, + "loss": 1.173, + "step": 5530 + }, + { + "epoch": 0.31, + "grad_norm": 28.945020269537835, + "learning_rate": 9.999526133518114e-06, + "loss": 1.2657, + "step": 5535 + }, + { + "epoch": 0.31, + "grad_norm": 20.0141231729636, + "learning_rate": 9.999503300193812e-06, + "loss": 1.1477, + "step": 5540 + }, + { + "epoch": 0.31, + "grad_norm": 35.39670647732335, + "learning_rate": 9.999479929650817e-06, + "loss": 1.1829, + "step": 5545 + }, + { + "epoch": 0.31, + "grad_norm": 8.426164209134736, + "learning_rate": 9.999456021891639e-06, + "loss": 1.1728, + "step": 5550 + }, + { + "epoch": 0.31, + "grad_norm": 9.157225159006694, + "learning_rate": 9.999431576918846e-06, + "loss": 1.2227, + "step": 5555 + }, + { + "epoch": 0.31, + "grad_norm": 11.200093175825737, + "learning_rate": 9.999406594735069e-06, + "loss": 1.195, + "step": 5560 + }, + { + "epoch": 0.31, + "grad_norm": 12.359635002109798, + "learning_rate": 9.999381075342985e-06, + "loss": 1.1369, + "step": 5565 + }, + { + "epoch": 0.31, + "grad_norm": 11.130749881958652, + "learning_rate": 9.999355018745342e-06, + "loss": 1.2477, + "step": 5570 + }, + { + "epoch": 0.31, + "grad_norm": 37.31754887801666, + "learning_rate": 9.999328424944941e-06, + "loss": 1.2176, + "step": 5575 + }, + { + "epoch": 0.31, + "grad_norm": 17.99169364129181, + "learning_rate": 9.999301293944634e-06, + "loss": 1.1923, + "step": 5580 + }, + { + "epoch": 0.31, + "grad_norm": 39.25216367288758, + "learning_rate": 9.999273625747342e-06, + "loss": 1.2632, + "step": 5585 + }, + { + "epoch": 0.31, + "grad_norm": 20.378364338026277, + "learning_rate": 9.999245420356033e-06, + "loss": 1.2102, + "step": 5590 + }, + { + "epoch": 0.32, + "grad_norm": 26.550975094038357, + "learning_rate": 9.999216677773741e-06, + "loss": 1.2125, + "step": 5595 + }, + { + "epoch": 0.32, + "grad_norm": 14.447423120345595, + "learning_rate": 9.999187398003553e-06, + "loss": 1.1979, + "step": 5600 + }, + { + "epoch": 0.32, + "grad_norm": 10.126437570958707, + "learning_rate": 9.999157581048618e-06, + "loss": 1.1757, + "step": 5605 + }, + { + "epoch": 0.32, + "grad_norm": 8.668495802312965, + "learning_rate": 9.999127226912137e-06, + "loss": 1.2428, + "step": 5610 + }, + { + "epoch": 0.32, + "grad_norm": 10.988932733946095, + "learning_rate": 9.999096335597372e-06, + "loss": 1.2566, + "step": 5615 + }, + { + "epoch": 0.32, + "grad_norm": 9.905705436481886, + "learning_rate": 9.999064907107643e-06, + "loss": 1.2127, + "step": 5620 + }, + { + "epoch": 0.32, + "grad_norm": 11.778779821183123, + "learning_rate": 9.999032941446327e-06, + "loss": 1.228, + "step": 5625 + }, + { + "epoch": 0.32, + "grad_norm": 32.87174609874904, + "learning_rate": 9.99900043861686e-06, + "loss": 1.2092, + "step": 5630 + }, + { + "epoch": 0.32, + "grad_norm": 115.50877638594692, + "learning_rate": 9.998967398622734e-06, + "loss": 1.4483, + "step": 5635 + }, + { + "epoch": 0.32, + "grad_norm": 881.4583633594746, + "learning_rate": 9.998933821467498e-06, + "loss": 5.8272, + "step": 5640 + }, + { + "epoch": 0.32, + "grad_norm": 3625.864863542416, + "learning_rate": 9.99889970715476e-06, + "loss": 31.1807, + "step": 5645 + }, + { + "epoch": 0.32, + "grad_norm": 305.2992238370064, + "learning_rate": 9.998865055688189e-06, + "loss": 15.8795, + "step": 5650 + }, + { + "epoch": 0.32, + "grad_norm": 220.90265182525806, + "learning_rate": 9.998829867071505e-06, + "loss": 11.2603, + "step": 5655 + }, + { + "epoch": 0.32, + "grad_norm": 202.50617493743596, + "learning_rate": 9.998794141308489e-06, + "loss": 9.4577, + "step": 5660 + }, + { + "epoch": 0.32, + "grad_norm": 127.08734520183457, + "learning_rate": 9.998757878402982e-06, + "loss": 8.3117, + "step": 5665 + }, + { + "epoch": 0.32, + "grad_norm": 46.60949031393593, + "learning_rate": 9.998721078358881e-06, + "loss": 7.9507, + "step": 5670 + }, + { + "epoch": 0.32, + "grad_norm": 96.18974630241713, + "learning_rate": 9.998683741180138e-06, + "loss": 7.5956, + "step": 5675 + }, + { + "epoch": 0.32, + "grad_norm": 96.12491744486918, + "learning_rate": 9.998645866870766e-06, + "loss": 7.315, + "step": 5680 + }, + { + "epoch": 0.32, + "grad_norm": 167.2630103539481, + "learning_rate": 9.998607455434835e-06, + "loss": 7.3938, + "step": 5685 + }, + { + "epoch": 0.32, + "grad_norm": 222.38158996981866, + "learning_rate": 9.998568506876473e-06, + "loss": 7.3077, + "step": 5690 + }, + { + "epoch": 0.32, + "grad_norm": 151.93538832351118, + "learning_rate": 9.998529021199863e-06, + "loss": 6.9725, + "step": 5695 + }, + { + "epoch": 0.32, + "grad_norm": 103.61752098498127, + "learning_rate": 9.998488998409251e-06, + "loss": 6.874, + "step": 5700 + }, + { + "epoch": 0.32, + "grad_norm": 296.2096876645142, + "learning_rate": 9.998448438508937e-06, + "loss": 6.7711, + "step": 5705 + }, + { + "epoch": 0.32, + "grad_norm": 200.06188781368692, + "learning_rate": 9.998407341503276e-06, + "loss": 6.8071, + "step": 5710 + }, + { + "epoch": 0.32, + "grad_norm": 56.562649527634086, + "learning_rate": 9.998365707396688e-06, + "loss": 6.5955, + "step": 5715 + }, + { + "epoch": 0.32, + "grad_norm": 131.0841260135904, + "learning_rate": 9.998323536193643e-06, + "loss": 6.1754, + "step": 5720 + }, + { + "epoch": 0.32, + "grad_norm": 91.185524542102, + "learning_rate": 9.998280827898679e-06, + "loss": 6.5431, + "step": 5725 + }, + { + "epoch": 0.32, + "grad_norm": 177.60728612407365, + "learning_rate": 9.998237582516377e-06, + "loss": 6.5339, + "step": 5730 + }, + { + "epoch": 0.32, + "grad_norm": 92.57721147555587, + "learning_rate": 9.99819380005139e-06, + "loss": 6.3031, + "step": 5735 + }, + { + "epoch": 0.32, + "grad_norm": 85.28429333005862, + "learning_rate": 9.99814948050842e-06, + "loss": 5.9641, + "step": 5740 + }, + { + "epoch": 0.32, + "grad_norm": 46.15466298428571, + "learning_rate": 9.99810462389223e-06, + "loss": 5.73, + "step": 5745 + }, + { + "epoch": 0.32, + "grad_norm": 108.04987142422614, + "learning_rate": 9.998059230207639e-06, + "loss": 5.5082, + "step": 5750 + }, + { + "epoch": 0.32, + "grad_norm": 71.01705772068496, + "learning_rate": 9.998013299459525e-06, + "loss": 5.1304, + "step": 5755 + }, + { + "epoch": 0.32, + "grad_norm": 166.47184090502537, + "learning_rate": 9.997966831652826e-06, + "loss": 4.7207, + "step": 5760 + }, + { + "epoch": 0.32, + "grad_norm": 108.34809596443222, + "learning_rate": 9.997919826792532e-06, + "loss": 4.4064, + "step": 5765 + }, + { + "epoch": 0.33, + "grad_norm": 203.25488572574156, + "learning_rate": 9.997872284883697e-06, + "loss": 4.1995, + "step": 5770 + }, + { + "epoch": 0.33, + "grad_norm": 76.96913596718933, + "learning_rate": 9.997824205931426e-06, + "loss": 3.918, + "step": 5775 + }, + { + "epoch": 0.33, + "grad_norm": 46.789318270837775, + "learning_rate": 9.997775589940886e-06, + "loss": 3.2998, + "step": 5780 + }, + { + "epoch": 0.33, + "grad_norm": 150.9068237438903, + "learning_rate": 9.997726436917303e-06, + "loss": 2.9199, + "step": 5785 + }, + { + "epoch": 0.33, + "grad_norm": 101.81714399310493, + "learning_rate": 9.997676746865958e-06, + "loss": 2.8614, + "step": 5790 + }, + { + "epoch": 0.33, + "grad_norm": 39.513324392664025, + "learning_rate": 9.997626519792192e-06, + "loss": 2.3969, + "step": 5795 + }, + { + "epoch": 0.33, + "grad_norm": 110.3570993391848, + "learning_rate": 9.997575755701397e-06, + "loss": 3.1355, + "step": 5800 + }, + { + "epoch": 0.33, + "grad_norm": 39.52089431069558, + "learning_rate": 9.997524454599033e-06, + "loss": 3.2513, + "step": 5805 + }, + { + "epoch": 0.33, + "grad_norm": 33.58047088835631, + "learning_rate": 9.997472616490608e-06, + "loss": 2.8184, + "step": 5810 + }, + { + "epoch": 0.33, + "grad_norm": 65.7301556926572, + "learning_rate": 9.9974202413817e-06, + "loss": 2.6076, + "step": 5815 + }, + { + "epoch": 0.33, + "grad_norm": 49.74937374199039, + "learning_rate": 9.997367329277927e-06, + "loss": 2.2297, + "step": 5820 + }, + { + "epoch": 0.33, + "grad_norm": 56.065182542415954, + "learning_rate": 9.99731388018498e-06, + "loss": 2.1355, + "step": 5825 + }, + { + "epoch": 0.33, + "grad_norm": 109.14082554172863, + "learning_rate": 9.997259894108604e-06, + "loss": 2.0288, + "step": 5830 + }, + { + "epoch": 0.33, + "grad_norm": 33.05771008468508, + "learning_rate": 9.997205371054595e-06, + "loss": 2.0012, + "step": 5835 + }, + { + "epoch": 0.33, + "grad_norm": 57.27210785048184, + "learning_rate": 9.997150311028817e-06, + "loss": 1.9041, + "step": 5840 + }, + { + "epoch": 0.33, + "grad_norm": 16.765176528339722, + "learning_rate": 9.997094714037184e-06, + "loss": 1.7866, + "step": 5845 + }, + { + "epoch": 0.33, + "grad_norm": 20.644104953020037, + "learning_rate": 9.997038580085667e-06, + "loss": 1.7795, + "step": 5850 + }, + { + "epoch": 0.33, + "grad_norm": 13.885425575621817, + "learning_rate": 9.996981909180302e-06, + "loss": 1.6351, + "step": 5855 + }, + { + "epoch": 0.33, + "grad_norm": 16.47464971728927, + "learning_rate": 9.996924701327181e-06, + "loss": 1.7488, + "step": 5860 + }, + { + "epoch": 0.33, + "grad_norm": 35.885926326951555, + "learning_rate": 9.996866956532445e-06, + "loss": 1.6965, + "step": 5865 + }, + { + "epoch": 0.33, + "grad_norm": 30.187527884353738, + "learning_rate": 9.996808674802302e-06, + "loss": 1.5814, + "step": 5870 + }, + { + "epoch": 0.33, + "grad_norm": 10.593526588901867, + "learning_rate": 9.996749856143015e-06, + "loss": 1.5712, + "step": 5875 + }, + { + "epoch": 0.33, + "grad_norm": 17.543105853979235, + "learning_rate": 9.996690500560901e-06, + "loss": 1.6264, + "step": 5880 + }, + { + "epoch": 0.33, + "grad_norm": 19.130086679293516, + "learning_rate": 9.996630608062343e-06, + "loss": 1.5903, + "step": 5885 + }, + { + "epoch": 0.33, + "grad_norm": 24.336913613381725, + "learning_rate": 9.996570178653775e-06, + "loss": 1.5608, + "step": 5890 + }, + { + "epoch": 0.33, + "grad_norm": 11.863603621841438, + "learning_rate": 9.996509212341688e-06, + "loss": 1.5058, + "step": 5895 + }, + { + "epoch": 0.33, + "grad_norm": 17.92143656057317, + "learning_rate": 9.996447709132636e-06, + "loss": 1.5812, + "step": 5900 + }, + { + "epoch": 0.33, + "grad_norm": 27.622210473749963, + "learning_rate": 9.996385669033226e-06, + "loss": 1.4991, + "step": 5905 + }, + { + "epoch": 0.33, + "grad_norm": 17.14597569004163, + "learning_rate": 9.996323092050124e-06, + "loss": 1.3941, + "step": 5910 + }, + { + "epoch": 0.33, + "grad_norm": 8.666020322647581, + "learning_rate": 9.996259978190057e-06, + "loss": 1.4521, + "step": 5915 + }, + { + "epoch": 0.33, + "grad_norm": 12.34972346254299, + "learning_rate": 9.996196327459805e-06, + "loss": 1.4744, + "step": 5920 + }, + { + "epoch": 0.33, + "grad_norm": 10.553647830129735, + "learning_rate": 9.996132139866209e-06, + "loss": 1.4585, + "step": 5925 + }, + { + "epoch": 0.33, + "grad_norm": 11.795663010068404, + "learning_rate": 9.996067415416162e-06, + "loss": 1.4352, + "step": 5930 + }, + { + "epoch": 0.33, + "grad_norm": 9.003136410264096, + "learning_rate": 9.996002154116623e-06, + "loss": 1.453, + "step": 5935 + }, + { + "epoch": 0.33, + "grad_norm": 22.519047791976174, + "learning_rate": 9.995936355974604e-06, + "loss": 1.4043, + "step": 5940 + }, + { + "epoch": 0.33, + "grad_norm": 22.01002822942101, + "learning_rate": 9.995870020997174e-06, + "loss": 1.4325, + "step": 5945 + }, + { + "epoch": 0.34, + "grad_norm": 16.875887721533456, + "learning_rate": 9.995803149191464e-06, + "loss": 1.3588, + "step": 5950 + }, + { + "epoch": 0.34, + "grad_norm": 8.485119099960336, + "learning_rate": 9.995735740564654e-06, + "loss": 1.3962, + "step": 5955 + }, + { + "epoch": 0.34, + "grad_norm": 19.37483936114496, + "learning_rate": 9.995667795123993e-06, + "loss": 1.4035, + "step": 5960 + }, + { + "epoch": 0.34, + "grad_norm": 11.990121869896935, + "learning_rate": 9.99559931287678e-06, + "loss": 1.3746, + "step": 5965 + }, + { + "epoch": 0.34, + "grad_norm": 11.323034006028854, + "learning_rate": 9.995530293830373e-06, + "loss": 1.4105, + "step": 5970 + }, + { + "epoch": 0.34, + "grad_norm": 25.63935213087236, + "learning_rate": 9.995460737992189e-06, + "loss": 1.3679, + "step": 5975 + }, + { + "epoch": 0.34, + "grad_norm": 14.770733129860528, + "learning_rate": 9.995390645369702e-06, + "loss": 1.3796, + "step": 5980 + }, + { + "epoch": 0.34, + "grad_norm": 19.724713081643543, + "learning_rate": 9.995320015970444e-06, + "loss": 1.3579, + "step": 5985 + }, + { + "epoch": 0.34, + "grad_norm": 13.353192099760806, + "learning_rate": 9.995248849802005e-06, + "loss": 1.3298, + "step": 5990 + }, + { + "epoch": 0.34, + "grad_norm": 41.078784542354725, + "learning_rate": 9.99517714687203e-06, + "loss": 1.3807, + "step": 5995 + }, + { + "epoch": 0.34, + "grad_norm": 15.714724279848866, + "learning_rate": 9.995104907188224e-06, + "loss": 1.3605, + "step": 6000 + }, + { + "epoch": 0.34, + "grad_norm": 18.188363629473503, + "learning_rate": 9.995032130758352e-06, + "loss": 1.3698, + "step": 6005 + }, + { + "epoch": 0.34, + "grad_norm": 17.443782220759342, + "learning_rate": 9.994958817590235e-06, + "loss": 1.356, + "step": 6010 + }, + { + "epoch": 0.34, + "grad_norm": 48.070755393626605, + "learning_rate": 9.994884967691746e-06, + "loss": 1.3474, + "step": 6015 + }, + { + "epoch": 0.34, + "grad_norm": 9.701750321001162, + "learning_rate": 9.994810581070824e-06, + "loss": 1.2907, + "step": 6020 + }, + { + "epoch": 0.34, + "grad_norm": 21.92830765227391, + "learning_rate": 9.994735657735461e-06, + "loss": 1.3804, + "step": 6025 + }, + { + "epoch": 0.34, + "grad_norm": 12.06167115759183, + "learning_rate": 9.994660197693709e-06, + "loss": 1.306, + "step": 6030 + }, + { + "epoch": 0.34, + "grad_norm": 15.024618993003601, + "learning_rate": 9.994584200953675e-06, + "loss": 1.3732, + "step": 6035 + }, + { + "epoch": 0.34, + "grad_norm": 47.32997844861397, + "learning_rate": 9.994507667523525e-06, + "loss": 1.3245, + "step": 6040 + }, + { + "epoch": 0.34, + "grad_norm": 13.72115080743169, + "learning_rate": 9.994430597411485e-06, + "loss": 1.3369, + "step": 6045 + }, + { + "epoch": 0.34, + "grad_norm": 31.174924094202126, + "learning_rate": 9.994352990625834e-06, + "loss": 1.3063, + "step": 6050 + }, + { + "epoch": 0.34, + "grad_norm": 16.511141474095467, + "learning_rate": 9.994274847174914e-06, + "loss": 1.316, + "step": 6055 + }, + { + "epoch": 0.34, + "grad_norm": 39.562695270008206, + "learning_rate": 9.994196167067119e-06, + "loss": 1.3749, + "step": 6060 + }, + { + "epoch": 0.34, + "grad_norm": 30.851507876936136, + "learning_rate": 9.994116950310906e-06, + "loss": 1.3287, + "step": 6065 + }, + { + "epoch": 0.34, + "grad_norm": 17.80246557600141, + "learning_rate": 9.994037196914783e-06, + "loss": 1.303, + "step": 6070 + }, + { + "epoch": 0.34, + "grad_norm": 10.735455033227984, + "learning_rate": 9.993956906887325e-06, + "loss": 1.3022, + "step": 6075 + }, + { + "epoch": 0.34, + "grad_norm": 9.090530761996225, + "learning_rate": 9.993876080237156e-06, + "loss": 1.3235, + "step": 6080 + }, + { + "epoch": 0.34, + "grad_norm": 18.250823813128243, + "learning_rate": 9.993794716972965e-06, + "loss": 1.2754, + "step": 6085 + }, + { + "epoch": 0.34, + "grad_norm": 7.751606006352525, + "learning_rate": 9.993712817103492e-06, + "loss": 1.2513, + "step": 6090 + }, + { + "epoch": 0.34, + "grad_norm": 16.24144172013427, + "learning_rate": 9.993630380637535e-06, + "loss": 1.2602, + "step": 6095 + }, + { + "epoch": 0.34, + "grad_norm": 7.462754315735199, + "learning_rate": 9.993547407583959e-06, + "loss": 1.2914, + "step": 6100 + }, + { + "epoch": 0.34, + "grad_norm": 10.361145071242321, + "learning_rate": 9.993463897951674e-06, + "loss": 1.2964, + "step": 6105 + }, + { + "epoch": 0.34, + "grad_norm": 16.5607624591136, + "learning_rate": 9.993379851749654e-06, + "loss": 1.294, + "step": 6110 + }, + { + "epoch": 0.34, + "grad_norm": 6.514105902885473, + "learning_rate": 9.993295268986933e-06, + "loss": 1.3002, + "step": 6115 + }, + { + "epoch": 0.34, + "grad_norm": 22.428444692759303, + "learning_rate": 9.993210149672599e-06, + "loss": 1.2937, + "step": 6120 + }, + { + "epoch": 0.35, + "grad_norm": 6.3766845403608885, + "learning_rate": 9.993124493815795e-06, + "loss": 1.2976, + "step": 6125 + }, + { + "epoch": 0.35, + "grad_norm": 10.438019509579105, + "learning_rate": 9.99303830142573e-06, + "loss": 1.2802, + "step": 6130 + }, + { + "epoch": 0.35, + "grad_norm": 7.211470289024709, + "learning_rate": 9.992951572511663e-06, + "loss": 1.2915, + "step": 6135 + }, + { + "epoch": 0.35, + "grad_norm": 8.094169692326695, + "learning_rate": 9.992864307082913e-06, + "loss": 1.3278, + "step": 6140 + }, + { + "epoch": 0.35, + "grad_norm": 8.0882662818744, + "learning_rate": 9.99277650514886e-06, + "loss": 1.3465, + "step": 6145 + }, + { + "epoch": 0.35, + "grad_norm": 11.03194416213248, + "learning_rate": 9.992688166718936e-06, + "loss": 1.2896, + "step": 6150 + }, + { + "epoch": 0.35, + "grad_norm": 7.655716652474666, + "learning_rate": 9.992599291802632e-06, + "loss": 1.2799, + "step": 6155 + }, + { + "epoch": 0.35, + "grad_norm": 6.493938727180848, + "learning_rate": 9.992509880409502e-06, + "loss": 1.248, + "step": 6160 + }, + { + "epoch": 0.35, + "grad_norm": 7.579512459797152, + "learning_rate": 9.992419932549153e-06, + "loss": 1.3345, + "step": 6165 + }, + { + "epoch": 0.35, + "grad_norm": 9.364772561346149, + "learning_rate": 9.992329448231247e-06, + "loss": 1.2394, + "step": 6170 + }, + { + "epoch": 0.35, + "grad_norm": 7.170354962495536, + "learning_rate": 9.992238427465509e-06, + "loss": 1.2742, + "step": 6175 + }, + { + "epoch": 0.35, + "grad_norm": 6.749806841645428, + "learning_rate": 9.99214687026172e-06, + "loss": 1.2616, + "step": 6180 + }, + { + "epoch": 0.35, + "grad_norm": 10.81401302120067, + "learning_rate": 9.992054776629718e-06, + "loss": 1.2788, + "step": 6185 + }, + { + "epoch": 0.35, + "grad_norm": 12.618846351763926, + "learning_rate": 9.991962146579398e-06, + "loss": 1.2593, + "step": 6190 + }, + { + "epoch": 0.35, + "grad_norm": 7.48171112079907, + "learning_rate": 9.991868980120715e-06, + "loss": 1.2739, + "step": 6195 + }, + { + "epoch": 0.35, + "grad_norm": 9.382895403840644, + "learning_rate": 9.991775277263679e-06, + "loss": 1.2356, + "step": 6200 + }, + { + "epoch": 0.35, + "grad_norm": 8.339821057005265, + "learning_rate": 9.991681038018358e-06, + "loss": 1.2174, + "step": 6205 + }, + { + "epoch": 0.35, + "grad_norm": 6.43383633184435, + "learning_rate": 9.99158626239488e-06, + "loss": 1.252, + "step": 6210 + }, + { + "epoch": 0.35, + "grad_norm": 12.853426960471305, + "learning_rate": 9.99149095040343e-06, + "loss": 1.2671, + "step": 6215 + }, + { + "epoch": 0.35, + "grad_norm": 7.38063312361502, + "learning_rate": 9.991395102054247e-06, + "loss": 1.3231, + "step": 6220 + }, + { + "epoch": 0.35, + "grad_norm": 18.293696295463246, + "learning_rate": 9.99129871735763e-06, + "loss": 1.2648, + "step": 6225 + }, + { + "epoch": 0.35, + "grad_norm": 9.106918476852423, + "learning_rate": 9.991201796323942e-06, + "loss": 1.2838, + "step": 6230 + }, + { + "epoch": 0.35, + "grad_norm": 11.306838698078574, + "learning_rate": 9.991104338963589e-06, + "loss": 1.213, + "step": 6235 + }, + { + "epoch": 0.35, + "grad_norm": 7.798825223650576, + "learning_rate": 9.99100634528705e-06, + "loss": 1.2426, + "step": 6240 + }, + { + "epoch": 0.35, + "grad_norm": 7.919194363165933, + "learning_rate": 9.990907815304852e-06, + "loss": 1.2312, + "step": 6245 + }, + { + "epoch": 0.35, + "grad_norm": 7.478546964961088, + "learning_rate": 9.99080874902758e-06, + "loss": 1.2159, + "step": 6250 + }, + { + "epoch": 0.35, + "grad_norm": 8.07205165223581, + "learning_rate": 9.990709146465883e-06, + "loss": 1.2935, + "step": 6255 + }, + { + "epoch": 0.35, + "grad_norm": 18.683757685547498, + "learning_rate": 9.990609007630463e-06, + "loss": 1.2632, + "step": 6260 + }, + { + "epoch": 0.35, + "grad_norm": 15.307852796498807, + "learning_rate": 9.990508332532081e-06, + "loss": 1.2277, + "step": 6265 + }, + { + "epoch": 0.35, + "grad_norm": 7.457185329870928, + "learning_rate": 9.990407121181552e-06, + "loss": 1.2405, + "step": 6270 + }, + { + "epoch": 0.35, + "grad_norm": 12.918195701636762, + "learning_rate": 9.990305373589755e-06, + "loss": 1.2337, + "step": 6275 + }, + { + "epoch": 0.35, + "grad_norm": 6.652700093334805, + "learning_rate": 9.990203089767621e-06, + "loss": 1.2668, + "step": 6280 + }, + { + "epoch": 0.35, + "grad_norm": 7.710269812484548, + "learning_rate": 9.990100269726142e-06, + "loss": 1.2457, + "step": 6285 + }, + { + "epoch": 0.35, + "grad_norm": 5.886033872146542, + "learning_rate": 9.989996913476365e-06, + "loss": 1.2653, + "step": 6290 + }, + { + "epoch": 0.35, + "grad_norm": 6.875680118840527, + "learning_rate": 9.989893021029397e-06, + "loss": 1.2257, + "step": 6295 + }, + { + "epoch": 0.35, + "grad_norm": 13.031372531354194, + "learning_rate": 9.989788592396403e-06, + "loss": 1.2417, + "step": 6300 + }, + { + "epoch": 0.36, + "grad_norm": 6.029512932873606, + "learning_rate": 9.989683627588602e-06, + "loss": 1.2226, + "step": 6305 + }, + { + "epoch": 0.36, + "grad_norm": 10.366888709665293, + "learning_rate": 9.989578126617275e-06, + "loss": 1.2136, + "step": 6310 + }, + { + "epoch": 0.36, + "grad_norm": 6.228820263441918, + "learning_rate": 9.989472089493757e-06, + "loss": 1.2434, + "step": 6315 + }, + { + "epoch": 0.36, + "grad_norm": 8.872041024498126, + "learning_rate": 9.989365516229443e-06, + "loss": 1.1995, + "step": 6320 + }, + { + "epoch": 0.36, + "grad_norm": 11.195402871756778, + "learning_rate": 9.989258406835784e-06, + "loss": 1.26, + "step": 6325 + }, + { + "epoch": 0.36, + "grad_norm": 7.746412979279807, + "learning_rate": 9.989150761324291e-06, + "loss": 1.2123, + "step": 6330 + }, + { + "epoch": 0.36, + "grad_norm": 9.677966169342557, + "learning_rate": 9.989042579706529e-06, + "loss": 1.2162, + "step": 6335 + }, + { + "epoch": 0.36, + "grad_norm": 8.547617666929987, + "learning_rate": 9.988933861994126e-06, + "loss": 1.1848, + "step": 6340 + }, + { + "epoch": 0.36, + "grad_norm": 11.536929004723213, + "learning_rate": 9.988824608198759e-06, + "loss": 1.2127, + "step": 6345 + }, + { + "epoch": 0.36, + "grad_norm": 6.949556034222528, + "learning_rate": 9.98871481833217e-06, + "loss": 1.2518, + "step": 6350 + }, + { + "epoch": 0.36, + "grad_norm": 7.790578838003556, + "learning_rate": 9.988604492406158e-06, + "loss": 1.1982, + "step": 6355 + }, + { + "epoch": 0.36, + "grad_norm": 7.176996419741183, + "learning_rate": 9.988493630432577e-06, + "loss": 1.2282, + "step": 6360 + }, + { + "epoch": 0.36, + "grad_norm": 6.317991360196196, + "learning_rate": 9.988382232423338e-06, + "loss": 1.24, + "step": 6365 + }, + { + "epoch": 0.36, + "grad_norm": 6.271672244857991, + "learning_rate": 9.988270298390415e-06, + "loss": 1.2182, + "step": 6370 + }, + { + "epoch": 0.36, + "grad_norm": 8.743481707909398, + "learning_rate": 9.988157828345833e-06, + "loss": 1.2571, + "step": 6375 + }, + { + "epoch": 0.36, + "grad_norm": 10.888077544382435, + "learning_rate": 9.988044822301678e-06, + "loss": 1.2328, + "step": 6380 + }, + { + "epoch": 0.36, + "grad_norm": 7.160903762627039, + "learning_rate": 9.987931280270093e-06, + "loss": 1.1933, + "step": 6385 + }, + { + "epoch": 0.36, + "grad_norm": 15.00407774737748, + "learning_rate": 9.987817202263276e-06, + "loss": 1.2325, + "step": 6390 + }, + { + "epoch": 0.36, + "grad_norm": 6.029209848814627, + "learning_rate": 9.98770258829349e-06, + "loss": 1.1658, + "step": 6395 + }, + { + "epoch": 0.36, + "grad_norm": 26.30664677401515, + "learning_rate": 9.98758743837305e-06, + "loss": 1.1992, + "step": 6400 + }, + { + "epoch": 0.36, + "grad_norm": 18.125595188626505, + "learning_rate": 9.987471752514326e-06, + "loss": 1.2889, + "step": 6405 + }, + { + "epoch": 0.36, + "grad_norm": 20.211557792670988, + "learning_rate": 9.987355530729749e-06, + "loss": 1.2021, + "step": 6410 + }, + { + "epoch": 0.36, + "grad_norm": 12.557012792283704, + "learning_rate": 9.987238773031813e-06, + "loss": 1.1957, + "step": 6415 + }, + { + "epoch": 0.36, + "grad_norm": 10.691877776216604, + "learning_rate": 9.987121479433058e-06, + "loss": 1.2452, + "step": 6420 + }, + { + "epoch": 0.36, + "grad_norm": 7.072706922358888, + "learning_rate": 9.98700364994609e-06, + "loss": 1.2601, + "step": 6425 + }, + { + "epoch": 0.36, + "grad_norm": 26.229296615583486, + "learning_rate": 9.986885284583571e-06, + "loss": 1.2652, + "step": 6430 + }, + { + "epoch": 0.36, + "grad_norm": 20.646765147718657, + "learning_rate": 9.98676638335822e-06, + "loss": 1.3181, + "step": 6435 + }, + { + "epoch": 0.36, + "grad_norm": 19.255804620174242, + "learning_rate": 9.986646946282813e-06, + "loss": 1.414, + "step": 6440 + }, + { + "epoch": 0.36, + "grad_norm": 13.07395174063236, + "learning_rate": 9.986526973370183e-06, + "loss": 1.3736, + "step": 6445 + }, + { + "epoch": 0.36, + "grad_norm": 11.26355634975153, + "learning_rate": 9.986406464633223e-06, + "loss": 1.2844, + "step": 6450 + }, + { + "epoch": 0.36, + "grad_norm": 22.164598787057212, + "learning_rate": 9.986285420084882e-06, + "loss": 1.2579, + "step": 6455 + }, + { + "epoch": 0.36, + "grad_norm": 15.467313692602483, + "learning_rate": 9.986163839738166e-06, + "loss": 1.193, + "step": 6460 + }, + { + "epoch": 0.36, + "grad_norm": 6.508513702137008, + "learning_rate": 9.98604172360614e-06, + "loss": 1.2054, + "step": 6465 + }, + { + "epoch": 0.36, + "grad_norm": 23.286058715981014, + "learning_rate": 9.985919071701924e-06, + "loss": 1.2745, + "step": 6470 + }, + { + "epoch": 0.36, + "grad_norm": 13.995764903647617, + "learning_rate": 9.985795884038702e-06, + "loss": 1.2495, + "step": 6475 + }, + { + "epoch": 0.37, + "grad_norm": 21.336127048366873, + "learning_rate": 9.985672160629707e-06, + "loss": 1.2266, + "step": 6480 + }, + { + "epoch": 0.37, + "grad_norm": 18.059099352692346, + "learning_rate": 9.985547901488234e-06, + "loss": 1.2429, + "step": 6485 + }, + { + "epoch": 0.37, + "grad_norm": 15.811121913527234, + "learning_rate": 9.985423106627637e-06, + "loss": 1.2583, + "step": 6490 + }, + { + "epoch": 0.37, + "grad_norm": 17.09505169249471, + "learning_rate": 9.985297776061325e-06, + "loss": 1.1985, + "step": 6495 + }, + { + "epoch": 0.37, + "grad_norm": 8.115540489699223, + "learning_rate": 9.985171909802764e-06, + "loss": 1.1897, + "step": 6500 + }, + { + "epoch": 0.37, + "grad_norm": 7.421280212127511, + "learning_rate": 9.985045507865483e-06, + "loss": 1.1876, + "step": 6505 + }, + { + "epoch": 0.37, + "grad_norm": 8.42788968584019, + "learning_rate": 9.984918570263057e-06, + "loss": 1.2182, + "step": 6510 + }, + { + "epoch": 0.37, + "grad_norm": 17.183407227421018, + "learning_rate": 9.984791097009135e-06, + "loss": 1.1955, + "step": 6515 + }, + { + "epoch": 0.37, + "grad_norm": 7.311443818316873, + "learning_rate": 9.984663088117408e-06, + "loss": 1.2622, + "step": 6520 + }, + { + "epoch": 0.37, + "grad_norm": 5.801014423396312, + "learning_rate": 9.984534543601633e-06, + "loss": 1.1979, + "step": 6525 + }, + { + "epoch": 0.37, + "grad_norm": 6.702329591648484, + "learning_rate": 9.984405463475623e-06, + "loss": 1.2319, + "step": 6530 + }, + { + "epoch": 0.37, + "grad_norm": 6.6443117734074635, + "learning_rate": 9.984275847753249e-06, + "loss": 1.1722, + "step": 6535 + }, + { + "epoch": 0.37, + "grad_norm": 6.961602409349189, + "learning_rate": 9.984145696448436e-06, + "loss": 1.1836, + "step": 6540 + }, + { + "epoch": 0.37, + "grad_norm": 6.1218714975184945, + "learning_rate": 9.984015009575173e-06, + "loss": 1.2184, + "step": 6545 + }, + { + "epoch": 0.37, + "grad_norm": 8.770273509403772, + "learning_rate": 9.9838837871475e-06, + "loss": 1.2763, + "step": 6550 + }, + { + "epoch": 0.37, + "grad_norm": 6.412295504959258, + "learning_rate": 9.983752029179519e-06, + "loss": 1.2296, + "step": 6555 + }, + { + "epoch": 0.37, + "grad_norm": 7.005358612462016, + "learning_rate": 9.983619735685388e-06, + "loss": 1.2663, + "step": 6560 + }, + { + "epoch": 0.37, + "grad_norm": 12.303838874266845, + "learning_rate": 9.98348690667932e-06, + "loss": 1.3097, + "step": 6565 + }, + { + "epoch": 0.37, + "grad_norm": 9.482381417791832, + "learning_rate": 9.983353542175593e-06, + "loss": 1.1933, + "step": 6570 + }, + { + "epoch": 0.37, + "grad_norm": 7.8825771764517185, + "learning_rate": 9.983219642188534e-06, + "loss": 1.2223, + "step": 6575 + }, + { + "epoch": 0.37, + "grad_norm": 9.0285012987186, + "learning_rate": 9.98308520673253e-06, + "loss": 1.1972, + "step": 6580 + }, + { + "epoch": 0.37, + "grad_norm": 10.451173223452615, + "learning_rate": 9.982950235822029e-06, + "loss": 1.2012, + "step": 6585 + }, + { + "epoch": 0.37, + "grad_norm": 6.26244826372432, + "learning_rate": 9.982814729471534e-06, + "loss": 1.227, + "step": 6590 + }, + { + "epoch": 0.37, + "grad_norm": 6.762539474418195, + "learning_rate": 9.982678687695606e-06, + "loss": 1.1729, + "step": 6595 + }, + { + "epoch": 0.37, + "grad_norm": 5.948620552003382, + "learning_rate": 9.982542110508861e-06, + "loss": 1.2146, + "step": 6600 + }, + { + "epoch": 0.37, + "grad_norm": 5.969768118573688, + "learning_rate": 9.982404997925979e-06, + "loss": 1.1989, + "step": 6605 + }, + { + "epoch": 0.37, + "grad_norm": 5.409753394542134, + "learning_rate": 9.98226734996169e-06, + "loss": 1.186, + "step": 6610 + }, + { + "epoch": 0.37, + "grad_norm": 5.606136956491269, + "learning_rate": 9.982129166630784e-06, + "loss": 1.1183, + "step": 6615 + }, + { + "epoch": 0.37, + "grad_norm": 11.052044206488178, + "learning_rate": 9.981990447948115e-06, + "loss": 1.2006, + "step": 6620 + }, + { + "epoch": 0.37, + "grad_norm": 5.276486519041882, + "learning_rate": 9.981851193928582e-06, + "loss": 1.1967, + "step": 6625 + }, + { + "epoch": 0.37, + "grad_norm": 6.725980893011915, + "learning_rate": 9.981711404587154e-06, + "loss": 1.1951, + "step": 6630 + }, + { + "epoch": 0.37, + "grad_norm": 8.543421111360324, + "learning_rate": 9.981571079938848e-06, + "loss": 1.1752, + "step": 6635 + }, + { + "epoch": 0.37, + "grad_norm": 5.42120363304379, + "learning_rate": 9.981430219998745e-06, + "loss": 1.189, + "step": 6640 + }, + { + "epoch": 0.37, + "grad_norm": 6.31891174061556, + "learning_rate": 9.981288824781979e-06, + "loss": 1.179, + "step": 6645 + }, + { + "epoch": 0.37, + "grad_norm": 16.73574880272908, + "learning_rate": 9.981146894303746e-06, + "loss": 1.221, + "step": 6650 + }, + { + "epoch": 0.37, + "grad_norm": 6.625795137531074, + "learning_rate": 9.981004428579295e-06, + "loss": 1.2337, + "step": 6655 + }, + { + "epoch": 0.38, + "grad_norm": 11.926641347856304, + "learning_rate": 9.980861427623936e-06, + "loss": 1.1927, + "step": 6660 + }, + { + "epoch": 0.38, + "grad_norm": 10.027132323335643, + "learning_rate": 9.980717891453033e-06, + "loss": 1.229, + "step": 6665 + }, + { + "epoch": 0.38, + "grad_norm": 5.515724171980387, + "learning_rate": 9.98057382008201e-06, + "loss": 1.1977, + "step": 6670 + }, + { + "epoch": 0.38, + "grad_norm": 15.04002557342166, + "learning_rate": 9.980429213526353e-06, + "loss": 1.2101, + "step": 6675 + }, + { + "epoch": 0.38, + "grad_norm": 21.2277641416974, + "learning_rate": 9.980284071801593e-06, + "loss": 1.2256, + "step": 6680 + }, + { + "epoch": 0.38, + "grad_norm": 6.385543940602545, + "learning_rate": 9.980138394923332e-06, + "loss": 1.207, + "step": 6685 + }, + { + "epoch": 0.38, + "grad_norm": 6.407538260156455, + "learning_rate": 9.979992182907221e-06, + "loss": 1.2228, + "step": 6690 + }, + { + "epoch": 0.38, + "grad_norm": 7.668419958029871, + "learning_rate": 9.979845435768972e-06, + "loss": 1.1601, + "step": 6695 + }, + { + "epoch": 0.38, + "grad_norm": 9.056901129620707, + "learning_rate": 9.979698153524352e-06, + "loss": 1.1835, + "step": 6700 + }, + { + "epoch": 0.38, + "grad_norm": 7.809242809918025, + "learning_rate": 9.97955033618919e-06, + "loss": 1.1895, + "step": 6705 + }, + { + "epoch": 0.38, + "grad_norm": 6.7023950580463545, + "learning_rate": 9.979401983779365e-06, + "loss": 1.2104, + "step": 6710 + }, + { + "epoch": 0.38, + "grad_norm": 9.585746717283113, + "learning_rate": 9.979253096310823e-06, + "loss": 1.2242, + "step": 6715 + }, + { + "epoch": 0.38, + "grad_norm": 6.002910629314679, + "learning_rate": 9.97910367379956e-06, + "loss": 1.1486, + "step": 6720 + }, + { + "epoch": 0.38, + "grad_norm": 5.570248464117552, + "learning_rate": 9.978953716261635e-06, + "loss": 1.2567, + "step": 6725 + }, + { + "epoch": 0.38, + "grad_norm": 6.381962150302656, + "learning_rate": 9.978803223713156e-06, + "loss": 1.2097, + "step": 6730 + }, + { + "epoch": 0.38, + "grad_norm": 6.2509666853499715, + "learning_rate": 9.9786521961703e-06, + "loss": 1.1832, + "step": 6735 + }, + { + "epoch": 0.38, + "grad_norm": 5.526298456096228, + "learning_rate": 9.978500633649293e-06, + "loss": 1.1829, + "step": 6740 + }, + { + "epoch": 0.38, + "grad_norm": 6.6947050563226576, + "learning_rate": 9.97834853616642e-06, + "loss": 1.1662, + "step": 6745 + }, + { + "epoch": 0.38, + "grad_norm": 5.66838934819671, + "learning_rate": 9.978195903738025e-06, + "loss": 1.1819, + "step": 6750 + }, + { + "epoch": 0.38, + "grad_norm": 10.851742927177732, + "learning_rate": 9.978042736380512e-06, + "loss": 1.2557, + "step": 6755 + }, + { + "epoch": 0.38, + "grad_norm": 10.781792167301317, + "learning_rate": 9.977889034110335e-06, + "loss": 1.1885, + "step": 6760 + }, + { + "epoch": 0.38, + "grad_norm": 7.408764618901928, + "learning_rate": 9.977734796944014e-06, + "loss": 1.1605, + "step": 6765 + }, + { + "epoch": 0.38, + "grad_norm": 14.7193806777456, + "learning_rate": 9.97758002489812e-06, + "loss": 1.1567, + "step": 6770 + }, + { + "epoch": 0.38, + "grad_norm": 9.112051037999008, + "learning_rate": 9.977424717989283e-06, + "loss": 1.2238, + "step": 6775 + }, + { + "epoch": 0.38, + "grad_norm": 6.4197189642230095, + "learning_rate": 9.977268876234194e-06, + "loss": 1.1709, + "step": 6780 + }, + { + "epoch": 0.38, + "grad_norm": 7.464930167865641, + "learning_rate": 9.9771124996496e-06, + "loss": 1.1785, + "step": 6785 + }, + { + "epoch": 0.38, + "grad_norm": 7.589860503526791, + "learning_rate": 9.9769555882523e-06, + "loss": 1.2352, + "step": 6790 + }, + { + "epoch": 0.38, + "grad_norm": 15.092556393247447, + "learning_rate": 9.976798142059157e-06, + "loss": 1.2366, + "step": 6795 + }, + { + "epoch": 0.38, + "grad_norm": 9.813574251657148, + "learning_rate": 9.97664016108709e-06, + "loss": 1.197, + "step": 6800 + }, + { + "epoch": 0.38, + "grad_norm": 7.022672747157394, + "learning_rate": 9.976481645353072e-06, + "loss": 1.2248, + "step": 6805 + }, + { + "epoch": 0.38, + "grad_norm": 12.438375149101976, + "learning_rate": 9.976322594874142e-06, + "loss": 1.2111, + "step": 6810 + }, + { + "epoch": 0.38, + "grad_norm": 9.544602325550182, + "learning_rate": 9.976163009667385e-06, + "loss": 1.1698, + "step": 6815 + }, + { + "epoch": 0.38, + "grad_norm": 7.727415785040562, + "learning_rate": 9.976002889749952e-06, + "loss": 1.2037, + "step": 6820 + }, + { + "epoch": 0.38, + "grad_norm": 14.383405370863493, + "learning_rate": 9.975842235139047e-06, + "loss": 1.167, + "step": 6825 + }, + { + "epoch": 0.38, + "grad_norm": 5.669874700594074, + "learning_rate": 9.975681045851935e-06, + "loss": 1.2254, + "step": 6830 + }, + { + "epoch": 0.39, + "grad_norm": 10.957958807511885, + "learning_rate": 9.975519321905934e-06, + "loss": 1.1434, + "step": 6835 + }, + { + "epoch": 0.39, + "grad_norm": 7.459036766792778, + "learning_rate": 9.975357063318423e-06, + "loss": 1.2177, + "step": 6840 + }, + { + "epoch": 0.39, + "grad_norm": 9.745614678291854, + "learning_rate": 9.975194270106838e-06, + "loss": 1.192, + "step": 6845 + }, + { + "epoch": 0.39, + "grad_norm": 10.764960663693426, + "learning_rate": 9.97503094228867e-06, + "loss": 1.1663, + "step": 6850 + }, + { + "epoch": 0.39, + "grad_norm": 11.76346246690597, + "learning_rate": 9.974867079881473e-06, + "loss": 1.2545, + "step": 6855 + }, + { + "epoch": 0.39, + "grad_norm": 6.340690318549726, + "learning_rate": 9.974702682902853e-06, + "loss": 1.1791, + "step": 6860 + }, + { + "epoch": 0.39, + "grad_norm": 5.636852289243506, + "learning_rate": 9.974537751370473e-06, + "loss": 1.1937, + "step": 6865 + }, + { + "epoch": 0.39, + "grad_norm": 13.576039674025727, + "learning_rate": 9.974372285302056e-06, + "loss": 1.1739, + "step": 6870 + }, + { + "epoch": 0.39, + "grad_norm": 6.079931641374136, + "learning_rate": 9.974206284715387e-06, + "loss": 1.2428, + "step": 6875 + }, + { + "epoch": 0.39, + "grad_norm": 6.4279107914430185, + "learning_rate": 9.974039749628297e-06, + "loss": 1.1797, + "step": 6880 + }, + { + "epoch": 0.39, + "grad_norm": 8.246179295928092, + "learning_rate": 9.973872680058685e-06, + "loss": 1.1558, + "step": 6885 + }, + { + "epoch": 0.39, + "grad_norm": 8.78910949664825, + "learning_rate": 9.973705076024503e-06, + "loss": 1.198, + "step": 6890 + }, + { + "epoch": 0.39, + "grad_norm": 7.0796416496715215, + "learning_rate": 9.973536937543758e-06, + "loss": 1.1151, + "step": 6895 + }, + { + "epoch": 0.39, + "grad_norm": 11.01726678119124, + "learning_rate": 9.97336826463452e-06, + "loss": 1.1033, + "step": 6900 + }, + { + "epoch": 0.39, + "grad_norm": 16.893076369727662, + "learning_rate": 9.973199057314913e-06, + "loss": 1.1359, + "step": 6905 + }, + { + "epoch": 0.39, + "grad_norm": 6.065964614117026, + "learning_rate": 9.973029315603118e-06, + "loss": 1.1998, + "step": 6910 + }, + { + "epoch": 0.39, + "grad_norm": 7.56045272028396, + "learning_rate": 9.972859039517377e-06, + "loss": 1.1522, + "step": 6915 + }, + { + "epoch": 0.39, + "grad_norm": 5.657183858925124, + "learning_rate": 9.972688229075984e-06, + "loss": 1.1471, + "step": 6920 + }, + { + "epoch": 0.39, + "grad_norm": 10.611161738437362, + "learning_rate": 9.972516884297295e-06, + "loss": 1.1458, + "step": 6925 + }, + { + "epoch": 0.39, + "grad_norm": 6.304912127698508, + "learning_rate": 9.972345005199721e-06, + "loss": 1.2101, + "step": 6930 + }, + { + "epoch": 0.39, + "grad_norm": 6.495415165322395, + "learning_rate": 9.972172591801731e-06, + "loss": 1.2116, + "step": 6935 + }, + { + "epoch": 0.39, + "grad_norm": 6.805608571776813, + "learning_rate": 9.971999644121855e-06, + "loss": 1.1854, + "step": 6940 + }, + { + "epoch": 0.39, + "grad_norm": 6.142397334356656, + "learning_rate": 9.971826162178672e-06, + "loss": 1.1331, + "step": 6945 + }, + { + "epoch": 0.39, + "grad_norm": 5.769205911723637, + "learning_rate": 9.971652145990826e-06, + "loss": 1.1581, + "step": 6950 + }, + { + "epoch": 0.39, + "grad_norm": 6.606059039236939, + "learning_rate": 9.971477595577015e-06, + "loss": 1.1466, + "step": 6955 + }, + { + "epoch": 0.39, + "grad_norm": 5.909953088389111, + "learning_rate": 9.971302510955996e-06, + "loss": 1.1509, + "step": 6960 + }, + { + "epoch": 0.39, + "grad_norm": 5.326337758290452, + "learning_rate": 9.97112689214658e-06, + "loss": 1.156, + "step": 6965 + }, + { + "epoch": 0.39, + "grad_norm": 5.277565895609338, + "learning_rate": 9.970950739167645e-06, + "loss": 1.1802, + "step": 6970 + }, + { + "epoch": 0.39, + "grad_norm": 5.777569819012663, + "learning_rate": 9.97077405203811e-06, + "loss": 1.2291, + "step": 6975 + }, + { + "epoch": 0.39, + "grad_norm": 5.434539473523682, + "learning_rate": 9.970596830776967e-06, + "loss": 1.1833, + "step": 6980 + }, + { + "epoch": 0.39, + "grad_norm": 8.67024664056154, + "learning_rate": 9.970419075403256e-06, + "loss": 1.1416, + "step": 6985 + }, + { + "epoch": 0.39, + "grad_norm": 7.134607097455263, + "learning_rate": 9.970240785936082e-06, + "loss": 1.1764, + "step": 6990 + }, + { + "epoch": 0.39, + "grad_norm": 6.460788448567936, + "learning_rate": 9.970061962394599e-06, + "loss": 1.1595, + "step": 6995 + }, + { + "epoch": 0.39, + "grad_norm": 10.894085190018938, + "learning_rate": 9.969882604798025e-06, + "loss": 1.1614, + "step": 7000 + }, + { + "epoch": 0.39, + "grad_norm": 8.99500506951188, + "learning_rate": 9.96970271316563e-06, + "loss": 1.127, + "step": 7005 + }, + { + "epoch": 0.39, + "grad_norm": 5.801642701082123, + "learning_rate": 9.969522287516744e-06, + "loss": 1.1715, + "step": 7010 + }, + { + "epoch": 0.4, + "grad_norm": 6.321390118992963, + "learning_rate": 9.969341327870758e-06, + "loss": 1.1757, + "step": 7015 + }, + { + "epoch": 0.4, + "grad_norm": 6.9823147197924635, + "learning_rate": 9.969159834247116e-06, + "loss": 1.0909, + "step": 7020 + }, + { + "epoch": 0.4, + "grad_norm": 7.693311659686022, + "learning_rate": 9.968977806665316e-06, + "loss": 1.1536, + "step": 7025 + }, + { + "epoch": 0.4, + "grad_norm": 8.082207239457722, + "learning_rate": 9.968795245144923e-06, + "loss": 1.1878, + "step": 7030 + }, + { + "epoch": 0.4, + "grad_norm": 11.624350030905093, + "learning_rate": 9.968612149705553e-06, + "loss": 1.2021, + "step": 7035 + }, + { + "epoch": 0.4, + "grad_norm": 6.263920645404605, + "learning_rate": 9.968428520366877e-06, + "loss": 1.1464, + "step": 7040 + }, + { + "epoch": 0.4, + "grad_norm": 8.201694449389493, + "learning_rate": 9.96824435714863e-06, + "loss": 1.2044, + "step": 7045 + }, + { + "epoch": 0.4, + "grad_norm": 15.152478309158454, + "learning_rate": 9.9680596600706e-06, + "loss": 1.207, + "step": 7050 + }, + { + "epoch": 0.4, + "grad_norm": 12.621848561179025, + "learning_rate": 9.967874429152635e-06, + "loss": 1.1679, + "step": 7055 + }, + { + "epoch": 0.4, + "grad_norm": 13.26393855465338, + "learning_rate": 9.967688664414636e-06, + "loss": 1.1895, + "step": 7060 + }, + { + "epoch": 0.4, + "grad_norm": 5.808273309055244, + "learning_rate": 9.967502365876567e-06, + "loss": 1.136, + "step": 7065 + }, + { + "epoch": 0.4, + "grad_norm": 7.1364997458048345, + "learning_rate": 9.967315533558445e-06, + "loss": 1.1617, + "step": 7070 + }, + { + "epoch": 0.4, + "grad_norm": 9.255141481393444, + "learning_rate": 9.967128167480346e-06, + "loss": 1.1395, + "step": 7075 + }, + { + "epoch": 0.4, + "grad_norm": 7.404988482522825, + "learning_rate": 9.966940267662407e-06, + "loss": 1.1422, + "step": 7080 + }, + { + "epoch": 0.4, + "grad_norm": 9.162902349791633, + "learning_rate": 9.966751834124812e-06, + "loss": 1.2098, + "step": 7085 + }, + { + "epoch": 0.4, + "grad_norm": 7.472829080961775, + "learning_rate": 9.966562866887813e-06, + "loss": 1.1637, + "step": 7090 + }, + { + "epoch": 0.4, + "grad_norm": 12.065553020026686, + "learning_rate": 9.966373365971716e-06, + "loss": 1.1489, + "step": 7095 + }, + { + "epoch": 0.4, + "grad_norm": 7.926763823743649, + "learning_rate": 9.966183331396882e-06, + "loss": 1.1942, + "step": 7100 + }, + { + "epoch": 0.4, + "grad_norm": 7.670843399481209, + "learning_rate": 9.965992763183732e-06, + "loss": 1.1627, + "step": 7105 + }, + { + "epoch": 0.4, + "grad_norm": 6.252928513189513, + "learning_rate": 9.965801661352742e-06, + "loss": 1.1862, + "step": 7110 + }, + { + "epoch": 0.4, + "grad_norm": 10.935201089724359, + "learning_rate": 9.965610025924449e-06, + "loss": 1.1522, + "step": 7115 + }, + { + "epoch": 0.4, + "grad_norm": 9.763877416311463, + "learning_rate": 9.965417856919442e-06, + "loss": 1.1204, + "step": 7120 + }, + { + "epoch": 0.4, + "grad_norm": 8.404757273478117, + "learning_rate": 9.965225154358374e-06, + "loss": 1.1906, + "step": 7125 + }, + { + "epoch": 0.4, + "grad_norm": 7.131156521871148, + "learning_rate": 9.965031918261949e-06, + "loss": 1.1561, + "step": 7130 + }, + { + "epoch": 0.4, + "grad_norm": 7.646992235178871, + "learning_rate": 9.964838148650932e-06, + "loss": 1.1779, + "step": 7135 + }, + { + "epoch": 0.4, + "grad_norm": 6.351613934878443, + "learning_rate": 9.964643845546144e-06, + "loss": 1.2187, + "step": 7140 + }, + { + "epoch": 0.4, + "grad_norm": 6.524885418216677, + "learning_rate": 9.964449008968466e-06, + "loss": 1.1756, + "step": 7145 + }, + { + "epoch": 0.4, + "grad_norm": 6.151302393127044, + "learning_rate": 9.96425363893883e-06, + "loss": 1.1154, + "step": 7150 + }, + { + "epoch": 0.4, + "grad_norm": 6.447716616076492, + "learning_rate": 9.964057735478233e-06, + "loss": 1.2077, + "step": 7155 + }, + { + "epoch": 0.4, + "grad_norm": 6.235925532608152, + "learning_rate": 9.963861298607725e-06, + "loss": 1.1647, + "step": 7160 + }, + { + "epoch": 0.4, + "grad_norm": 7.19404020149289, + "learning_rate": 9.963664328348411e-06, + "loss": 1.1872, + "step": 7165 + }, + { + "epoch": 0.4, + "grad_norm": 10.228989315369796, + "learning_rate": 9.96346682472146e-06, + "loss": 1.1619, + "step": 7170 + }, + { + "epoch": 0.4, + "grad_norm": 19.338968868833145, + "learning_rate": 9.963268787748092e-06, + "loss": 1.1989, + "step": 7175 + }, + { + "epoch": 0.4, + "grad_norm": 7.038334328549363, + "learning_rate": 9.963070217449591e-06, + "loss": 1.1683, + "step": 7180 + }, + { + "epoch": 0.4, + "grad_norm": 7.920020476923544, + "learning_rate": 9.962871113847288e-06, + "loss": 1.1962, + "step": 7185 + }, + { + "epoch": 0.41, + "grad_norm": 6.116495517138593, + "learning_rate": 9.962671476962583e-06, + "loss": 1.1286, + "step": 7190 + }, + { + "epoch": 0.41, + "grad_norm": 6.209402600116505, + "learning_rate": 9.962471306816925e-06, + "loss": 1.1648, + "step": 7195 + }, + { + "epoch": 0.41, + "grad_norm": 10.265572653210617, + "learning_rate": 9.962270603431826e-06, + "loss": 1.1683, + "step": 7200 + }, + { + "epoch": 0.41, + "grad_norm": 6.246673041635199, + "learning_rate": 9.962069366828849e-06, + "loss": 1.1302, + "step": 7205 + }, + { + "epoch": 0.41, + "grad_norm": 6.521403920817779, + "learning_rate": 9.961867597029618e-06, + "loss": 1.1689, + "step": 7210 + }, + { + "epoch": 0.41, + "grad_norm": 6.598069555056234, + "learning_rate": 9.961665294055816e-06, + "loss": 1.1613, + "step": 7215 + }, + { + "epoch": 0.41, + "grad_norm": 7.443138575493562, + "learning_rate": 9.961462457929182e-06, + "loss": 1.1608, + "step": 7220 + }, + { + "epoch": 0.41, + "grad_norm": 5.528161260282685, + "learning_rate": 9.961259088671509e-06, + "loss": 1.1756, + "step": 7225 + }, + { + "epoch": 0.41, + "grad_norm": 5.732843000819472, + "learning_rate": 9.961055186304652e-06, + "loss": 1.1278, + "step": 7230 + }, + { + "epoch": 0.41, + "grad_norm": 9.928199552615537, + "learning_rate": 9.960850750850521e-06, + "loss": 1.1876, + "step": 7235 + }, + { + "epoch": 0.41, + "grad_norm": 5.7538052857280295, + "learning_rate": 9.960645782331083e-06, + "loss": 1.1372, + "step": 7240 + }, + { + "epoch": 0.41, + "grad_norm": 13.7527271250132, + "learning_rate": 9.96044028076836e-06, + "loss": 1.1318, + "step": 7245 + }, + { + "epoch": 0.41, + "grad_norm": 9.569642847177645, + "learning_rate": 9.960234246184439e-06, + "loss": 1.1291, + "step": 7250 + }, + { + "epoch": 0.41, + "grad_norm": 11.466131806616952, + "learning_rate": 9.960027678601455e-06, + "loss": 1.1659, + "step": 7255 + }, + { + "epoch": 0.41, + "grad_norm": 5.726398135012966, + "learning_rate": 9.95982057804161e-06, + "loss": 1.1753, + "step": 7260 + }, + { + "epoch": 0.41, + "grad_norm": 9.759427658793014, + "learning_rate": 9.959612944527152e-06, + "loss": 1.1702, + "step": 7265 + }, + { + "epoch": 0.41, + "grad_norm": 20.03926074299141, + "learning_rate": 9.959404778080395e-06, + "loss": 1.1924, + "step": 7270 + }, + { + "epoch": 0.41, + "grad_norm": 15.511034051194109, + "learning_rate": 9.959196078723708e-06, + "loss": 1.1421, + "step": 7275 + }, + { + "epoch": 0.41, + "grad_norm": 16.710780995117776, + "learning_rate": 9.958986846479515e-06, + "loss": 1.1799, + "step": 7280 + }, + { + "epoch": 0.41, + "grad_norm": 25.475026249973894, + "learning_rate": 9.9587770813703e-06, + "loss": 1.2095, + "step": 7285 + }, + { + "epoch": 0.41, + "grad_norm": 10.379179706896757, + "learning_rate": 9.958566783418603e-06, + "loss": 1.1971, + "step": 7290 + }, + { + "epoch": 0.41, + "grad_norm": 20.930843043565, + "learning_rate": 9.95835595264702e-06, + "loss": 1.1745, + "step": 7295 + }, + { + "epoch": 0.41, + "grad_norm": 13.1184329702384, + "learning_rate": 9.958144589078209e-06, + "loss": 1.1448, + "step": 7300 + }, + { + "epoch": 0.41, + "grad_norm": 13.214396997144293, + "learning_rate": 9.957932692734879e-06, + "loss": 1.1173, + "step": 7305 + }, + { + "epoch": 0.41, + "grad_norm": 9.193770064250026, + "learning_rate": 9.9577202636398e-06, + "loss": 1.1487, + "step": 7310 + }, + { + "epoch": 0.41, + "grad_norm": 16.036156338067784, + "learning_rate": 9.957507301815798e-06, + "loss": 1.1727, + "step": 7315 + }, + { + "epoch": 0.41, + "grad_norm": 17.139115079392365, + "learning_rate": 9.957293807285758e-06, + "loss": 1.1694, + "step": 7320 + }, + { + "epoch": 0.41, + "grad_norm": 6.630935230906257, + "learning_rate": 9.95707978007262e-06, + "loss": 1.1421, + "step": 7325 + }, + { + "epoch": 0.41, + "grad_norm": 20.386875452735694, + "learning_rate": 9.956865220199383e-06, + "loss": 1.182, + "step": 7330 + }, + { + "epoch": 0.41, + "grad_norm": 5.897287158181644, + "learning_rate": 9.956650127689101e-06, + "loss": 1.1908, + "step": 7335 + }, + { + "epoch": 0.41, + "grad_norm": 22.42602733433497, + "learning_rate": 9.956434502564888e-06, + "loss": 1.1796, + "step": 7340 + }, + { + "epoch": 0.41, + "grad_norm": 5.429001917172855, + "learning_rate": 9.956218344849912e-06, + "loss": 1.1795, + "step": 7345 + }, + { + "epoch": 0.41, + "grad_norm": 11.039869565325827, + "learning_rate": 9.956001654567404e-06, + "loss": 1.1654, + "step": 7350 + }, + { + "epoch": 0.41, + "grad_norm": 11.339977479867002, + "learning_rate": 9.955784431740643e-06, + "loss": 1.1704, + "step": 7355 + }, + { + "epoch": 0.41, + "grad_norm": 14.89971002783044, + "learning_rate": 9.955566676392975e-06, + "loss": 1.1557, + "step": 7360 + }, + { + "epoch": 0.41, + "grad_norm": 15.37787907433655, + "learning_rate": 9.955348388547797e-06, + "loss": 1.1279, + "step": 7365 + }, + { + "epoch": 0.42, + "grad_norm": 58.86646691267968, + "learning_rate": 9.955129568228563e-06, + "loss": 1.1859, + "step": 7370 + }, + { + "epoch": 0.42, + "grad_norm": 29.512956198196843, + "learning_rate": 9.95491021545879e-06, + "loss": 1.1538, + "step": 7375 + }, + { + "epoch": 0.42, + "grad_norm": 35.16903316999807, + "learning_rate": 9.954690330262047e-06, + "loss": 1.1339, + "step": 7380 + }, + { + "epoch": 0.42, + "grad_norm": 13.539298402278382, + "learning_rate": 9.95446991266196e-06, + "loss": 1.1266, + "step": 7385 + }, + { + "epoch": 0.42, + "grad_norm": 28.489479868594366, + "learning_rate": 9.954248962682216e-06, + "loss": 1.1918, + "step": 7390 + }, + { + "epoch": 0.42, + "grad_norm": 27.764903782860227, + "learning_rate": 9.954027480346557e-06, + "loss": 1.1717, + "step": 7395 + }, + { + "epoch": 0.42, + "grad_norm": 13.632048537372299, + "learning_rate": 9.95380546567878e-06, + "loss": 1.1579, + "step": 7400 + }, + { + "epoch": 0.42, + "grad_norm": 8.585556934143545, + "learning_rate": 9.953582918702743e-06, + "loss": 1.1804, + "step": 7405 + }, + { + "epoch": 0.42, + "grad_norm": 6.3839843334155795, + "learning_rate": 9.95335983944236e-06, + "loss": 1.1692, + "step": 7410 + }, + { + "epoch": 0.42, + "grad_norm": 17.115502943063046, + "learning_rate": 9.9531362279216e-06, + "loss": 1.1288, + "step": 7415 + }, + { + "epoch": 0.42, + "grad_norm": 7.697219157530452, + "learning_rate": 9.952912084164494e-06, + "loss": 1.1831, + "step": 7420 + }, + { + "epoch": 0.42, + "grad_norm": 19.229541416735028, + "learning_rate": 9.952687408195126e-06, + "loss": 1.1768, + "step": 7425 + }, + { + "epoch": 0.42, + "grad_norm": 8.249622746628285, + "learning_rate": 9.952462200037638e-06, + "loss": 1.1885, + "step": 7430 + }, + { + "epoch": 0.42, + "grad_norm": 6.0974162297522865, + "learning_rate": 9.952236459716229e-06, + "loss": 1.2087, + "step": 7435 + }, + { + "epoch": 0.42, + "grad_norm": 6.0724449592560745, + "learning_rate": 9.952010187255157e-06, + "loss": 1.1495, + "step": 7440 + }, + { + "epoch": 0.42, + "grad_norm": 5.3183452017936395, + "learning_rate": 9.951783382678735e-06, + "loss": 1.2043, + "step": 7445 + }, + { + "epoch": 0.42, + "grad_norm": 8.136328602147941, + "learning_rate": 9.951556046011336e-06, + "loss": 1.1497, + "step": 7450 + }, + { + "epoch": 0.42, + "grad_norm": 11.902056875987189, + "learning_rate": 9.951328177277385e-06, + "loss": 1.1182, + "step": 7455 + }, + { + "epoch": 0.42, + "grad_norm": 5.775983601548823, + "learning_rate": 9.95109977650137e-06, + "loss": 1.0865, + "step": 7460 + }, + { + "epoch": 0.42, + "grad_norm": 8.015998979312961, + "learning_rate": 9.950870843707834e-06, + "loss": 1.1436, + "step": 7465 + }, + { + "epoch": 0.42, + "grad_norm": 13.719815788065228, + "learning_rate": 9.950641378921375e-06, + "loss": 1.1483, + "step": 7470 + }, + { + "epoch": 0.42, + "grad_norm": 11.832901041077575, + "learning_rate": 9.950411382166652e-06, + "loss": 1.1446, + "step": 7475 + }, + { + "epoch": 0.42, + "grad_norm": 6.117160855859674, + "learning_rate": 9.950180853468377e-06, + "loss": 1.16, + "step": 7480 + }, + { + "epoch": 0.42, + "grad_norm": 7.2973352722122415, + "learning_rate": 9.949949792851323e-06, + "loss": 1.1186, + "step": 7485 + }, + { + "epoch": 0.42, + "grad_norm": 7.620669119265208, + "learning_rate": 9.949718200340319e-06, + "loss": 1.0882, + "step": 7490 + }, + { + "epoch": 0.42, + "grad_norm": 9.0780256771354, + "learning_rate": 9.949486075960248e-06, + "loss": 1.1225, + "step": 7495 + }, + { + "epoch": 0.42, + "grad_norm": 17.618159635632885, + "learning_rate": 9.949253419736055e-06, + "loss": 1.1239, + "step": 7500 + }, + { + "epoch": 0.42, + "grad_norm": 10.298895161700539, + "learning_rate": 9.94902023169274e-06, + "loss": 1.1855, + "step": 7505 + }, + { + "epoch": 0.42, + "grad_norm": 6.658594732022927, + "learning_rate": 9.948786511855358e-06, + "loss": 1.1472, + "step": 7510 + }, + { + "epoch": 0.42, + "grad_norm": 7.893464538077141, + "learning_rate": 9.948552260249025e-06, + "loss": 1.1582, + "step": 7515 + }, + { + "epoch": 0.42, + "grad_norm": 7.1407490289993705, + "learning_rate": 9.948317476898913e-06, + "loss": 1.149, + "step": 7520 + }, + { + "epoch": 0.42, + "grad_norm": 11.917156613629743, + "learning_rate": 9.948082161830249e-06, + "loss": 1.1368, + "step": 7525 + }, + { + "epoch": 0.42, + "grad_norm": 15.888994387264798, + "learning_rate": 9.947846315068319e-06, + "loss": 1.1887, + "step": 7530 + }, + { + "epoch": 0.42, + "grad_norm": 6.561917198848331, + "learning_rate": 9.947609936638468e-06, + "loss": 1.1206, + "step": 7535 + }, + { + "epoch": 0.42, + "grad_norm": 21.849109167201615, + "learning_rate": 9.947373026566089e-06, + "loss": 1.1509, + "step": 7540 + }, + { + "epoch": 0.43, + "grad_norm": 6.52580304317239, + "learning_rate": 9.947135584876649e-06, + "loss": 1.155, + "step": 7545 + }, + { + "epoch": 0.43, + "grad_norm": 14.229866299809071, + "learning_rate": 9.946897611595653e-06, + "loss": 1.201, + "step": 7550 + }, + { + "epoch": 0.43, + "grad_norm": 11.462107315609448, + "learning_rate": 9.94665910674868e-06, + "loss": 1.1297, + "step": 7555 + }, + { + "epoch": 0.43, + "grad_norm": 9.331772324871634, + "learning_rate": 9.946420070361351e-06, + "loss": 1.1619, + "step": 7560 + }, + { + "epoch": 0.43, + "grad_norm": 11.34909614633429, + "learning_rate": 9.946180502459357e-06, + "loss": 1.0676, + "step": 7565 + }, + { + "epoch": 0.43, + "grad_norm": 8.632008971359138, + "learning_rate": 9.94594040306844e-06, + "loss": 1.1648, + "step": 7570 + }, + { + "epoch": 0.43, + "grad_norm": 7.9404498847889675, + "learning_rate": 9.945699772214396e-06, + "loss": 1.0979, + "step": 7575 + }, + { + "epoch": 0.43, + "grad_norm": 7.043731618251074, + "learning_rate": 9.945458609923087e-06, + "loss": 1.1777, + "step": 7580 + }, + { + "epoch": 0.43, + "grad_norm": 6.426535997154359, + "learning_rate": 9.945216916220424e-06, + "loss": 1.1692, + "step": 7585 + }, + { + "epoch": 0.43, + "grad_norm": 6.159336716140868, + "learning_rate": 9.944974691132377e-06, + "loss": 1.1385, + "step": 7590 + }, + { + "epoch": 0.43, + "grad_norm": 16.590672728725128, + "learning_rate": 9.944731934684977e-06, + "loss": 1.1692, + "step": 7595 + }, + { + "epoch": 0.43, + "grad_norm": 7.229077313744524, + "learning_rate": 9.944488646904307e-06, + "loss": 1.18, + "step": 7600 + }, + { + "epoch": 0.43, + "grad_norm": 5.825320467379788, + "learning_rate": 9.94424482781651e-06, + "loss": 1.1294, + "step": 7605 + }, + { + "epoch": 0.43, + "grad_norm": 5.643517762309204, + "learning_rate": 9.944000477447786e-06, + "loss": 1.1281, + "step": 7610 + }, + { + "epoch": 0.43, + "grad_norm": 7.815094794970016, + "learning_rate": 9.943755595824391e-06, + "loss": 1.1863, + "step": 7615 + }, + { + "epoch": 0.43, + "grad_norm": 6.291662113231529, + "learning_rate": 9.94351018297264e-06, + "loss": 1.1704, + "step": 7620 + }, + { + "epoch": 0.43, + "grad_norm": 8.313163076129426, + "learning_rate": 9.943264238918902e-06, + "loss": 1.1465, + "step": 7625 + }, + { + "epoch": 0.43, + "grad_norm": 6.646545555699191, + "learning_rate": 9.943017763689604e-06, + "loss": 1.0835, + "step": 7630 + }, + { + "epoch": 0.43, + "grad_norm": 7.405593689627591, + "learning_rate": 9.942770757311233e-06, + "loss": 1.1047, + "step": 7635 + }, + { + "epoch": 0.43, + "grad_norm": 10.37489587592504, + "learning_rate": 9.94252321981033e-06, + "loss": 1.133, + "step": 7640 + }, + { + "epoch": 0.43, + "grad_norm": 6.762815622953501, + "learning_rate": 9.942275151213494e-06, + "loss": 1.1666, + "step": 7645 + }, + { + "epoch": 0.43, + "grad_norm": 8.110801366127728, + "learning_rate": 9.942026551547379e-06, + "loss": 1.1827, + "step": 7650 + }, + { + "epoch": 0.43, + "grad_norm": 8.930267312022218, + "learning_rate": 9.941777420838703e-06, + "loss": 1.1684, + "step": 7655 + }, + { + "epoch": 0.43, + "grad_norm": 7.520164914312182, + "learning_rate": 9.941527759114233e-06, + "loss": 1.1296, + "step": 7660 + }, + { + "epoch": 0.43, + "grad_norm": 10.823430279904931, + "learning_rate": 9.941277566400796e-06, + "loss": 1.1771, + "step": 7665 + }, + { + "epoch": 0.43, + "grad_norm": 8.016326789857857, + "learning_rate": 9.941026842725276e-06, + "loss": 1.1328, + "step": 7670 + }, + { + "epoch": 0.43, + "grad_norm": 5.560869699096651, + "learning_rate": 9.940775588114618e-06, + "loss": 1.1103, + "step": 7675 + }, + { + "epoch": 0.43, + "grad_norm": 8.34295283052181, + "learning_rate": 9.940523802595815e-06, + "loss": 1.1335, + "step": 7680 + }, + { + "epoch": 0.43, + "grad_norm": 7.5167217444197965, + "learning_rate": 9.940271486195927e-06, + "loss": 1.1454, + "step": 7685 + }, + { + "epoch": 0.43, + "grad_norm": 14.396138579977828, + "learning_rate": 9.940018638942064e-06, + "loss": 1.1313, + "step": 7690 + }, + { + "epoch": 0.43, + "grad_norm": 11.22222680824031, + "learning_rate": 9.939765260861396e-06, + "loss": 1.15, + "step": 7695 + }, + { + "epoch": 0.43, + "grad_norm": 6.151215904216118, + "learning_rate": 9.93951135198115e-06, + "loss": 1.1359, + "step": 7700 + }, + { + "epoch": 0.43, + "grad_norm": 13.737205761509378, + "learning_rate": 9.93925691232861e-06, + "loss": 1.1412, + "step": 7705 + }, + { + "epoch": 0.43, + "grad_norm": 7.671639300887087, + "learning_rate": 9.939001941931117e-06, + "loss": 1.1733, + "step": 7710 + }, + { + "epoch": 0.43, + "grad_norm": 5.5574397761620045, + "learning_rate": 9.938746440816066e-06, + "loss": 1.1836, + "step": 7715 + }, + { + "epoch": 0.43, + "grad_norm": 7.85283106925243, + "learning_rate": 9.938490409010915e-06, + "loss": 1.1481, + "step": 7720 + }, + { + "epoch": 0.44, + "grad_norm": 6.541983204590011, + "learning_rate": 9.938233846543172e-06, + "loss": 1.1624, + "step": 7725 + }, + { + "epoch": 0.44, + "grad_norm": 6.380153215234209, + "learning_rate": 9.93797675344041e-06, + "loss": 1.1082, + "step": 7730 + }, + { + "epoch": 0.44, + "grad_norm": 6.2240106442182075, + "learning_rate": 9.937719129730255e-06, + "loss": 1.1443, + "step": 7735 + }, + { + "epoch": 0.44, + "grad_norm": 6.153292924266712, + "learning_rate": 9.937460975440386e-06, + "loss": 1.1028, + "step": 7740 + }, + { + "epoch": 0.44, + "grad_norm": 6.5515823330429805, + "learning_rate": 9.937202290598541e-06, + "loss": 1.1233, + "step": 7745 + }, + { + "epoch": 0.44, + "grad_norm": 6.039659230667111, + "learning_rate": 9.936943075232525e-06, + "loss": 1.163, + "step": 7750 + }, + { + "epoch": 0.44, + "grad_norm": 9.662962324633087, + "learning_rate": 9.936683329370184e-06, + "loss": 1.1696, + "step": 7755 + }, + { + "epoch": 0.44, + "grad_norm": 6.239963909223691, + "learning_rate": 9.936423053039434e-06, + "loss": 1.1282, + "step": 7760 + }, + { + "epoch": 0.44, + "grad_norm": 13.721532366769768, + "learning_rate": 9.936162246268242e-06, + "loss": 1.1587, + "step": 7765 + }, + { + "epoch": 0.44, + "grad_norm": 5.721408140815109, + "learning_rate": 9.935900909084629e-06, + "loss": 1.0901, + "step": 7770 + }, + { + "epoch": 0.44, + "grad_norm": 6.135733225197569, + "learning_rate": 9.93563904151668e-06, + "loss": 1.1181, + "step": 7775 + }, + { + "epoch": 0.44, + "grad_norm": 6.593760239533519, + "learning_rate": 9.935376643592534e-06, + "loss": 1.1923, + "step": 7780 + }, + { + "epoch": 0.44, + "grad_norm": 8.112961542721804, + "learning_rate": 9.935113715340386e-06, + "loss": 1.1687, + "step": 7785 + }, + { + "epoch": 0.44, + "grad_norm": 6.566380989507593, + "learning_rate": 9.934850256788489e-06, + "loss": 1.1062, + "step": 7790 + }, + { + "epoch": 0.44, + "grad_norm": 14.984241345080738, + "learning_rate": 9.934586267965154e-06, + "loss": 1.2385, + "step": 7795 + }, + { + "epoch": 0.44, + "grad_norm": 7.231142467968554, + "learning_rate": 9.934321748898746e-06, + "loss": 1.1334, + "step": 7800 + }, + { + "epoch": 0.44, + "grad_norm": 8.889388377549382, + "learning_rate": 9.934056699617686e-06, + "loss": 1.1132, + "step": 7805 + }, + { + "epoch": 0.44, + "grad_norm": 13.186228712007964, + "learning_rate": 9.933791120150461e-06, + "loss": 1.1427, + "step": 7810 + }, + { + "epoch": 0.44, + "grad_norm": 10.151427183839397, + "learning_rate": 9.933525010525607e-06, + "loss": 1.1325, + "step": 7815 + }, + { + "epoch": 0.44, + "grad_norm": 7.792545697946491, + "learning_rate": 9.933258370771714e-06, + "loss": 1.1537, + "step": 7820 + }, + { + "epoch": 0.44, + "grad_norm": 13.400827415408557, + "learning_rate": 9.932991200917438e-06, + "loss": 1.1571, + "step": 7825 + }, + { + "epoch": 0.44, + "grad_norm": 7.807104821525981, + "learning_rate": 9.932723500991488e-06, + "loss": 1.1225, + "step": 7830 + }, + { + "epoch": 0.44, + "grad_norm": 8.30671607188519, + "learning_rate": 9.932455271022627e-06, + "loss": 1.1676, + "step": 7835 + }, + { + "epoch": 0.44, + "grad_norm": 6.924940762504854, + "learning_rate": 9.93218651103968e-06, + "loss": 1.1827, + "step": 7840 + }, + { + "epoch": 0.44, + "grad_norm": 8.50411716805518, + "learning_rate": 9.931917221071522e-06, + "loss": 1.1229, + "step": 7845 + }, + { + "epoch": 0.44, + "grad_norm": 7.676640545210377, + "learning_rate": 9.931647401147093e-06, + "loss": 1.1518, + "step": 7850 + }, + { + "epoch": 0.44, + "grad_norm": 16.80959535965297, + "learning_rate": 9.931377051295389e-06, + "loss": 1.1297, + "step": 7855 + }, + { + "epoch": 0.44, + "grad_norm": 12.811323956525596, + "learning_rate": 9.931106171545453e-06, + "loss": 1.1339, + "step": 7860 + }, + { + "epoch": 0.44, + "grad_norm": 5.548378617609879, + "learning_rate": 9.930834761926398e-06, + "loss": 1.0706, + "step": 7865 + }, + { + "epoch": 0.44, + "grad_norm": 11.138696070070708, + "learning_rate": 9.930562822467386e-06, + "loss": 1.1457, + "step": 7870 + }, + { + "epoch": 0.44, + "grad_norm": 10.851137940988485, + "learning_rate": 9.93029035319764e-06, + "loss": 1.1849, + "step": 7875 + }, + { + "epoch": 0.44, + "grad_norm": 6.246615709232681, + "learning_rate": 9.930017354146435e-06, + "loss": 1.2075, + "step": 7880 + }, + { + "epoch": 0.44, + "grad_norm": 7.004687896127214, + "learning_rate": 9.929743825343109e-06, + "loss": 1.1319, + "step": 7885 + }, + { + "epoch": 0.44, + "grad_norm": 8.31333613381446, + "learning_rate": 9.929469766817049e-06, + "loss": 1.1473, + "step": 7890 + }, + { + "epoch": 0.44, + "grad_norm": 5.947978349580737, + "learning_rate": 9.929195178597711e-06, + "loss": 1.1454, + "step": 7895 + }, + { + "epoch": 0.45, + "grad_norm": 6.596565430355072, + "learning_rate": 9.928920060714594e-06, + "loss": 1.1403, + "step": 7900 + }, + { + "epoch": 0.45, + "grad_norm": 10.853373502977693, + "learning_rate": 9.928644413197263e-06, + "loss": 1.2014, + "step": 7905 + }, + { + "epoch": 0.45, + "grad_norm": 6.177412393431067, + "learning_rate": 9.92836823607534e-06, + "loss": 1.1609, + "step": 7910 + }, + { + "epoch": 0.45, + "grad_norm": 7.7587644267845794, + "learning_rate": 9.9280915293785e-06, + "loss": 1.1264, + "step": 7915 + }, + { + "epoch": 0.45, + "grad_norm": 7.568438535524381, + "learning_rate": 9.927814293136475e-06, + "loss": 1.1654, + "step": 7920 + }, + { + "epoch": 0.45, + "grad_norm": 9.808904676175661, + "learning_rate": 9.927536527379055e-06, + "loss": 1.112, + "step": 7925 + }, + { + "epoch": 0.45, + "grad_norm": 9.428016435169493, + "learning_rate": 9.92725823213609e-06, + "loss": 1.1196, + "step": 7930 + }, + { + "epoch": 0.45, + "grad_norm": 7.161113797198686, + "learning_rate": 9.926979407437482e-06, + "loss": 1.1217, + "step": 7935 + }, + { + "epoch": 0.45, + "grad_norm": 7.182527071175802, + "learning_rate": 9.926700053313192e-06, + "loss": 1.1347, + "step": 7940 + }, + { + "epoch": 0.45, + "grad_norm": 7.627434994800816, + "learning_rate": 9.926420169793238e-06, + "loss": 1.1128, + "step": 7945 + }, + { + "epoch": 0.45, + "grad_norm": 6.476428644298823, + "learning_rate": 9.926139756907696e-06, + "loss": 1.1408, + "step": 7950 + }, + { + "epoch": 0.45, + "grad_norm": 6.018142300408826, + "learning_rate": 9.925858814686695e-06, + "loss": 1.1461, + "step": 7955 + }, + { + "epoch": 0.45, + "grad_norm": 8.01193884729803, + "learning_rate": 9.925577343160424e-06, + "loss": 1.1665, + "step": 7960 + }, + { + "epoch": 0.45, + "grad_norm": 7.862190238650837, + "learning_rate": 9.925295342359131e-06, + "loss": 1.1839, + "step": 7965 + }, + { + "epoch": 0.45, + "grad_norm": 6.228009776446208, + "learning_rate": 9.925012812313114e-06, + "loss": 1.1357, + "step": 7970 + }, + { + "epoch": 0.45, + "grad_norm": 5.8908933139796895, + "learning_rate": 9.924729753052737e-06, + "loss": 1.1461, + "step": 7975 + }, + { + "epoch": 0.45, + "grad_norm": 9.978023474287168, + "learning_rate": 9.924446164608414e-06, + "loss": 1.1718, + "step": 7980 + }, + { + "epoch": 0.45, + "grad_norm": 9.084048319902944, + "learning_rate": 9.924162047010617e-06, + "loss": 1.1096, + "step": 7985 + }, + { + "epoch": 0.45, + "grad_norm": 10.463896890378928, + "learning_rate": 9.923877400289873e-06, + "loss": 1.1951, + "step": 7990 + }, + { + "epoch": 0.45, + "grad_norm": 7.068209068736689, + "learning_rate": 9.923592224476776e-06, + "loss": 1.1646, + "step": 7995 + }, + { + "epoch": 0.45, + "grad_norm": 11.88910232611772, + "learning_rate": 9.923306519601963e-06, + "loss": 1.1643, + "step": 8000 + }, + { + "epoch": 0.45, + "grad_norm": 6.812356468825577, + "learning_rate": 9.923020285696135e-06, + "loss": 1.1443, + "step": 8005 + }, + { + "epoch": 0.45, + "grad_norm": 7.527069223093425, + "learning_rate": 9.922733522790052e-06, + "loss": 1.1663, + "step": 8010 + }, + { + "epoch": 0.45, + "grad_norm": 6.085587139001899, + "learning_rate": 9.922446230914526e-06, + "loss": 1.1198, + "step": 8015 + }, + { + "epoch": 0.45, + "grad_norm": 7.178732078404221, + "learning_rate": 9.922158410100429e-06, + "loss": 1.1256, + "step": 8020 + }, + { + "epoch": 0.45, + "grad_norm": 9.751355985385874, + "learning_rate": 9.921870060378687e-06, + "loss": 1.1237, + "step": 8025 + }, + { + "epoch": 0.45, + "grad_norm": 6.166352451055253, + "learning_rate": 9.921581181780284e-06, + "loss": 1.1568, + "step": 8030 + }, + { + "epoch": 0.45, + "grad_norm": 8.750446785875491, + "learning_rate": 9.921291774336263e-06, + "loss": 1.1299, + "step": 8035 + }, + { + "epoch": 0.45, + "grad_norm": 7.322602400290547, + "learning_rate": 9.921001838077722e-06, + "loss": 1.1105, + "step": 8040 + }, + { + "epoch": 0.45, + "grad_norm": 6.597169205942858, + "learning_rate": 9.920711373035815e-06, + "loss": 1.1518, + "step": 8045 + }, + { + "epoch": 0.45, + "grad_norm": 13.550861867124471, + "learning_rate": 9.920420379241755e-06, + "loss": 1.1043, + "step": 8050 + }, + { + "epoch": 0.45, + "grad_norm": 19.414144737746152, + "learning_rate": 9.920128856726809e-06, + "loss": 1.1357, + "step": 8055 + }, + { + "epoch": 0.45, + "grad_norm": 17.7879234758635, + "learning_rate": 9.919836805522303e-06, + "loss": 1.181, + "step": 8060 + }, + { + "epoch": 0.45, + "grad_norm": 6.935363874803764, + "learning_rate": 9.91954422565962e-06, + "loss": 1.1308, + "step": 8065 + }, + { + "epoch": 0.45, + "grad_norm": 15.197737171064103, + "learning_rate": 9.919251117170197e-06, + "loss": 1.1213, + "step": 8070 + }, + { + "epoch": 0.45, + "grad_norm": 10.491328310522661, + "learning_rate": 9.918957480085534e-06, + "loss": 1.1616, + "step": 8075 + }, + { + "epoch": 0.46, + "grad_norm": 8.186871938635413, + "learning_rate": 9.918663314437178e-06, + "loss": 1.1603, + "step": 8080 + }, + { + "epoch": 0.46, + "grad_norm": 12.805034098385828, + "learning_rate": 9.918368620256742e-06, + "loss": 1.1597, + "step": 8085 + }, + { + "epoch": 0.46, + "grad_norm": 16.897184300130437, + "learning_rate": 9.918073397575892e-06, + "loss": 1.1258, + "step": 8090 + }, + { + "epoch": 0.46, + "grad_norm": 5.925310029747384, + "learning_rate": 9.91777764642635e-06, + "loss": 1.1098, + "step": 8095 + }, + { + "epoch": 0.46, + "grad_norm": 8.66010055733957, + "learning_rate": 9.917481366839896e-06, + "loss": 1.1646, + "step": 8100 + }, + { + "epoch": 0.46, + "grad_norm": 12.550626296738047, + "learning_rate": 9.917184558848366e-06, + "loss": 1.1764, + "step": 8105 + }, + { + "epoch": 0.46, + "grad_norm": 8.349916065932398, + "learning_rate": 9.916887222483654e-06, + "loss": 1.1355, + "step": 8110 + }, + { + "epoch": 0.46, + "grad_norm": 18.41914856625254, + "learning_rate": 9.91658935777771e-06, + "loss": 1.1179, + "step": 8115 + }, + { + "epoch": 0.46, + "grad_norm": 20.949966070018647, + "learning_rate": 9.916290964762542e-06, + "loss": 1.0995, + "step": 8120 + }, + { + "epoch": 0.46, + "grad_norm": 13.116656904071101, + "learning_rate": 9.915992043470213e-06, + "loss": 1.1451, + "step": 8125 + }, + { + "epoch": 0.46, + "grad_norm": 11.57665638720786, + "learning_rate": 9.915692593932842e-06, + "loss": 1.0656, + "step": 8130 + }, + { + "epoch": 0.46, + "grad_norm": 7.121901944957988, + "learning_rate": 9.915392616182606e-06, + "loss": 1.1285, + "step": 8135 + }, + { + "epoch": 0.46, + "grad_norm": 7.763329818073514, + "learning_rate": 9.915092110251742e-06, + "loss": 1.1415, + "step": 8140 + }, + { + "epoch": 0.46, + "grad_norm": 21.174463104279873, + "learning_rate": 9.914791076172539e-06, + "loss": 1.1603, + "step": 8145 + }, + { + "epoch": 0.46, + "grad_norm": 8.687202572936014, + "learning_rate": 9.914489513977344e-06, + "loss": 1.173, + "step": 8150 + }, + { + "epoch": 0.46, + "grad_norm": 10.025317968688368, + "learning_rate": 9.91418742369856e-06, + "loss": 1.1799, + "step": 8155 + }, + { + "epoch": 0.46, + "grad_norm": 10.580685167669746, + "learning_rate": 9.91388480536865e-06, + "loss": 1.1417, + "step": 8160 + }, + { + "epoch": 0.46, + "grad_norm": 12.427871347485707, + "learning_rate": 9.913581659020131e-06, + "loss": 1.1431, + "step": 8165 + }, + { + "epoch": 0.46, + "grad_norm": 7.919016707093965, + "learning_rate": 9.913277984685577e-06, + "loss": 1.1343, + "step": 8170 + }, + { + "epoch": 0.46, + "grad_norm": 16.0728116259922, + "learning_rate": 9.91297378239762e-06, + "loss": 1.1714, + "step": 8175 + }, + { + "epoch": 0.46, + "grad_norm": 5.784404388459732, + "learning_rate": 9.912669052188948e-06, + "loss": 1.1071, + "step": 8180 + }, + { + "epoch": 0.46, + "grad_norm": 7.867643724095714, + "learning_rate": 9.912363794092306e-06, + "loss": 1.1165, + "step": 8185 + }, + { + "epoch": 0.46, + "grad_norm": 7.227876275716403, + "learning_rate": 9.912058008140493e-06, + "loss": 1.1179, + "step": 8190 + }, + { + "epoch": 0.46, + "grad_norm": 7.926626840265723, + "learning_rate": 9.911751694366368e-06, + "loss": 1.1547, + "step": 8195 + }, + { + "epoch": 0.46, + "grad_norm": 7.251534839962707, + "learning_rate": 9.911444852802848e-06, + "loss": 1.1411, + "step": 8200 + }, + { + "epoch": 0.46, + "grad_norm": 6.8221131575983085, + "learning_rate": 9.911137483482902e-06, + "loss": 1.1618, + "step": 8205 + }, + { + "epoch": 0.46, + "grad_norm": 14.951383048318903, + "learning_rate": 9.910829586439557e-06, + "loss": 1.1901, + "step": 8210 + }, + { + "epoch": 0.46, + "grad_norm": 5.394284096549184, + "learning_rate": 9.910521161705901e-06, + "loss": 1.1189, + "step": 8215 + }, + { + "epoch": 0.46, + "grad_norm": 7.058500387595784, + "learning_rate": 9.910212209315075e-06, + "loss": 1.1217, + "step": 8220 + }, + { + "epoch": 0.46, + "grad_norm": 5.475745755358913, + "learning_rate": 9.909902729300276e-06, + "loss": 1.1029, + "step": 8225 + }, + { + "epoch": 0.46, + "grad_norm": 7.321152476451876, + "learning_rate": 9.90959272169476e-06, + "loss": 1.165, + "step": 8230 + }, + { + "epoch": 0.46, + "grad_norm": 7.709737407683546, + "learning_rate": 9.909282186531838e-06, + "loss": 1.1298, + "step": 8235 + }, + { + "epoch": 0.46, + "grad_norm": 6.942039054846149, + "learning_rate": 9.90897112384488e-06, + "loss": 1.1079, + "step": 8240 + }, + { + "epoch": 0.46, + "grad_norm": 7.523822588201016, + "learning_rate": 9.908659533667308e-06, + "loss": 1.0914, + "step": 8245 + }, + { + "epoch": 0.46, + "grad_norm": 10.18599056710619, + "learning_rate": 9.908347416032607e-06, + "loss": 1.131, + "step": 8250 + }, + { + "epoch": 0.47, + "grad_norm": 12.298366837145794, + "learning_rate": 9.908034770974314e-06, + "loss": 1.1187, + "step": 8255 + }, + { + "epoch": 0.47, + "grad_norm": 5.266402219942813, + "learning_rate": 9.907721598526023e-06, + "loss": 1.0883, + "step": 8260 + }, + { + "epoch": 0.47, + "grad_norm": 16.441723810002678, + "learning_rate": 9.907407898721388e-06, + "loss": 1.176, + "step": 8265 + }, + { + "epoch": 0.47, + "grad_norm": 7.332685159253233, + "learning_rate": 9.907093671594116e-06, + "loss": 1.1322, + "step": 8270 + }, + { + "epoch": 0.47, + "grad_norm": 8.278492210164009, + "learning_rate": 9.906778917177971e-06, + "loss": 1.1418, + "step": 8275 + }, + { + "epoch": 0.47, + "grad_norm": 6.144336594304728, + "learning_rate": 9.906463635506778e-06, + "loss": 1.1232, + "step": 8280 + }, + { + "epoch": 0.47, + "grad_norm": 8.062488243792572, + "learning_rate": 9.906147826614412e-06, + "loss": 1.1556, + "step": 8285 + }, + { + "epoch": 0.47, + "grad_norm": 5.890762453851772, + "learning_rate": 9.90583149053481e-06, + "loss": 1.1198, + "step": 8290 + }, + { + "epoch": 0.47, + "grad_norm": 6.562691160318177, + "learning_rate": 9.905514627301964e-06, + "loss": 1.1202, + "step": 8295 + }, + { + "epoch": 0.47, + "grad_norm": 6.883518091845976, + "learning_rate": 9.90519723694992e-06, + "loss": 1.1748, + "step": 8300 + }, + { + "epoch": 0.47, + "grad_norm": 6.3553947016629015, + "learning_rate": 9.904879319512786e-06, + "loss": 1.1136, + "step": 8305 + }, + { + "epoch": 0.47, + "grad_norm": 9.529370012571578, + "learning_rate": 9.904560875024722e-06, + "loss": 1.1744, + "step": 8310 + }, + { + "epoch": 0.47, + "grad_norm": 7.86046118665941, + "learning_rate": 9.904241903519946e-06, + "loss": 1.1164, + "step": 8315 + }, + { + "epoch": 0.47, + "grad_norm": 19.259825004498953, + "learning_rate": 9.903922405032733e-06, + "loss": 1.1582, + "step": 8320 + }, + { + "epoch": 0.47, + "grad_norm": 19.898525371631383, + "learning_rate": 9.903602379597417e-06, + "loss": 1.1757, + "step": 8325 + }, + { + "epoch": 0.47, + "grad_norm": 10.642737075991965, + "learning_rate": 9.90328182724838e-06, + "loss": 1.1861, + "step": 8330 + }, + { + "epoch": 0.47, + "grad_norm": 5.60510125662481, + "learning_rate": 9.902960748020075e-06, + "loss": 1.1455, + "step": 8335 + }, + { + "epoch": 0.47, + "grad_norm": 9.650827903465728, + "learning_rate": 9.902639141946998e-06, + "loss": 1.1599, + "step": 8340 + }, + { + "epoch": 0.47, + "grad_norm": 20.65537674984556, + "learning_rate": 9.902317009063708e-06, + "loss": 1.1387, + "step": 8345 + }, + { + "epoch": 0.47, + "grad_norm": 7.416947762342985, + "learning_rate": 9.901994349404819e-06, + "loss": 1.1795, + "step": 8350 + }, + { + "epoch": 0.47, + "grad_norm": 6.1507077578132, + "learning_rate": 9.901671163005005e-06, + "loss": 1.149, + "step": 8355 + }, + { + "epoch": 0.47, + "grad_norm": 10.715233692577106, + "learning_rate": 9.90134744989899e-06, + "loss": 1.1451, + "step": 8360 + }, + { + "epoch": 0.47, + "grad_norm": 8.030405268061639, + "learning_rate": 9.901023210121561e-06, + "loss": 1.1271, + "step": 8365 + }, + { + "epoch": 0.47, + "grad_norm": 5.825841721827905, + "learning_rate": 9.900698443707557e-06, + "loss": 1.1039, + "step": 8370 + }, + { + "epoch": 0.47, + "grad_norm": 9.09960502605246, + "learning_rate": 9.900373150691878e-06, + "loss": 1.1529, + "step": 8375 + }, + { + "epoch": 0.47, + "grad_norm": 15.932854658669767, + "learning_rate": 9.900047331109477e-06, + "loss": 1.1195, + "step": 8380 + }, + { + "epoch": 0.47, + "grad_norm": 12.611839452996128, + "learning_rate": 9.899720984995364e-06, + "loss": 1.1164, + "step": 8385 + }, + { + "epoch": 0.47, + "grad_norm": 19.143709420338695, + "learning_rate": 9.899394112384607e-06, + "loss": 1.1199, + "step": 8390 + }, + { + "epoch": 0.47, + "grad_norm": 19.93958495130495, + "learning_rate": 9.89906671331233e-06, + "loss": 1.1552, + "step": 8395 + }, + { + "epoch": 0.47, + "grad_norm": 15.113234791150843, + "learning_rate": 9.898738787813714e-06, + "loss": 1.098, + "step": 8400 + }, + { + "epoch": 0.47, + "grad_norm": 10.88229020201969, + "learning_rate": 9.898410335923996e-06, + "loss": 1.1889, + "step": 8405 + }, + { + "epoch": 0.47, + "grad_norm": 21.719444611762174, + "learning_rate": 9.898081357678468e-06, + "loss": 1.1312, + "step": 8410 + }, + { + "epoch": 0.47, + "grad_norm": 21.299351015474322, + "learning_rate": 9.89775185311248e-06, + "loss": 1.0726, + "step": 8415 + }, + { + "epoch": 0.47, + "grad_norm": 14.398252044924147, + "learning_rate": 9.897421822261442e-06, + "loss": 1.166, + "step": 8420 + }, + { + "epoch": 0.47, + "grad_norm": 8.370785867700546, + "learning_rate": 9.897091265160816e-06, + "loss": 1.136, + "step": 8425 + }, + { + "epoch": 0.47, + "grad_norm": 7.826027373171965, + "learning_rate": 9.896760181846119e-06, + "loss": 1.0938, + "step": 8430 + }, + { + "epoch": 0.48, + "grad_norm": 14.512961897362741, + "learning_rate": 9.896428572352931e-06, + "loss": 1.1433, + "step": 8435 + }, + { + "epoch": 0.48, + "grad_norm": 10.732017075522059, + "learning_rate": 9.896096436716881e-06, + "loss": 1.1107, + "step": 8440 + }, + { + "epoch": 0.48, + "grad_norm": 18.09926900760528, + "learning_rate": 9.895763774973664e-06, + "loss": 1.1526, + "step": 8445 + }, + { + "epoch": 0.48, + "grad_norm": 10.859586133410717, + "learning_rate": 9.895430587159021e-06, + "loss": 1.113, + "step": 8450 + }, + { + "epoch": 0.48, + "grad_norm": 8.243516666618795, + "learning_rate": 9.895096873308757e-06, + "loss": 1.1483, + "step": 8455 + }, + { + "epoch": 0.48, + "grad_norm": 7.918660122367753, + "learning_rate": 9.894762633458728e-06, + "loss": 1.124, + "step": 8460 + }, + { + "epoch": 0.48, + "grad_norm": 14.463828178244237, + "learning_rate": 9.894427867644854e-06, + "loss": 1.1644, + "step": 8465 + }, + { + "epoch": 0.48, + "grad_norm": 9.938550384195903, + "learning_rate": 9.894092575903102e-06, + "loss": 1.1308, + "step": 8470 + }, + { + "epoch": 0.48, + "grad_norm": 20.086290071419377, + "learning_rate": 9.893756758269506e-06, + "loss": 1.2175, + "step": 8475 + }, + { + "epoch": 0.48, + "grad_norm": 28.692590544152967, + "learning_rate": 9.893420414780148e-06, + "loss": 1.1635, + "step": 8480 + }, + { + "epoch": 0.48, + "grad_norm": 6.991419829713242, + "learning_rate": 9.893083545471172e-06, + "loss": 1.1577, + "step": 8485 + }, + { + "epoch": 0.48, + "grad_norm": 12.811681583399924, + "learning_rate": 9.892746150378772e-06, + "loss": 1.1738, + "step": 8490 + }, + { + "epoch": 0.48, + "grad_norm": 8.16934930585563, + "learning_rate": 9.892408229539206e-06, + "loss": 1.0848, + "step": 8495 + }, + { + "epoch": 0.48, + "grad_norm": 9.418373610077596, + "learning_rate": 9.892069782988783e-06, + "loss": 1.1275, + "step": 8500 + }, + { + "epoch": 0.48, + "grad_norm": 7.7309271487165825, + "learning_rate": 9.891730810763872e-06, + "loss": 1.1448, + "step": 8505 + }, + { + "epoch": 0.48, + "grad_norm": 7.856025700983017, + "learning_rate": 9.891391312900895e-06, + "loss": 1.1658, + "step": 8510 + }, + { + "epoch": 0.48, + "grad_norm": 7.637788056262348, + "learning_rate": 9.891051289436335e-06, + "loss": 1.1244, + "step": 8515 + }, + { + "epoch": 0.48, + "grad_norm": 6.527743712569118, + "learning_rate": 9.890710740406728e-06, + "loss": 1.1124, + "step": 8520 + }, + { + "epoch": 0.48, + "grad_norm": 9.382154647775197, + "learning_rate": 9.890369665848667e-06, + "loss": 1.1388, + "step": 8525 + }, + { + "epoch": 0.48, + "grad_norm": 22.201215017275665, + "learning_rate": 9.890028065798805e-06, + "loss": 1.1463, + "step": 8530 + }, + { + "epoch": 0.48, + "grad_norm": 16.946811844953196, + "learning_rate": 9.889685940293843e-06, + "loss": 1.1496, + "step": 8535 + }, + { + "epoch": 0.48, + "grad_norm": 8.018285658393802, + "learning_rate": 9.889343289370549e-06, + "loss": 1.1317, + "step": 8540 + }, + { + "epoch": 0.48, + "grad_norm": 6.327708999852175, + "learning_rate": 9.88900011306574e-06, + "loss": 1.1272, + "step": 8545 + }, + { + "epoch": 0.48, + "grad_norm": 6.334283488082707, + "learning_rate": 9.888656411416292e-06, + "loss": 1.1236, + "step": 8550 + }, + { + "epoch": 0.48, + "grad_norm": 7.873331119329881, + "learning_rate": 9.888312184459137e-06, + "loss": 1.151, + "step": 8555 + }, + { + "epoch": 0.48, + "grad_norm": 5.367691852507198, + "learning_rate": 9.887967432231264e-06, + "loss": 1.0753, + "step": 8560 + }, + { + "epoch": 0.48, + "grad_norm": 13.632051776820594, + "learning_rate": 9.887622154769718e-06, + "loss": 1.0876, + "step": 8565 + }, + { + "epoch": 0.48, + "grad_norm": 12.774701151149621, + "learning_rate": 9.8872763521116e-06, + "loss": 1.2065, + "step": 8570 + }, + { + "epoch": 0.48, + "grad_norm": 12.0813360645562, + "learning_rate": 9.886930024294071e-06, + "loss": 1.1686, + "step": 8575 + }, + { + "epoch": 0.48, + "grad_norm": 31.83631671917243, + "learning_rate": 9.88658317135434e-06, + "loss": 1.1813, + "step": 8580 + }, + { + "epoch": 0.48, + "grad_norm": 12.237608240177158, + "learning_rate": 9.886235793329684e-06, + "loss": 1.1262, + "step": 8585 + }, + { + "epoch": 0.48, + "grad_norm": 5.974889676681796, + "learning_rate": 9.885887890257426e-06, + "loss": 1.0847, + "step": 8590 + }, + { + "epoch": 0.48, + "grad_norm": 7.314712493839495, + "learning_rate": 9.885539462174951e-06, + "loss": 1.1294, + "step": 8595 + }, + { + "epoch": 0.48, + "grad_norm": 6.084275999170101, + "learning_rate": 9.8851905091197e-06, + "loss": 1.1343, + "step": 8600 + }, + { + "epoch": 0.48, + "grad_norm": 7.725744409281265, + "learning_rate": 9.884841031129168e-06, + "loss": 1.104, + "step": 8605 + }, + { + "epoch": 0.49, + "grad_norm": 8.49318806526292, + "learning_rate": 9.88449102824091e-06, + "loss": 1.1604, + "step": 8610 + }, + { + "epoch": 0.49, + "grad_norm": 8.260404077681832, + "learning_rate": 9.884140500492534e-06, + "loss": 1.1262, + "step": 8615 + }, + { + "epoch": 0.49, + "grad_norm": 12.042390101029518, + "learning_rate": 9.883789447921704e-06, + "loss": 1.1761, + "step": 8620 + }, + { + "epoch": 0.49, + "grad_norm": 5.651268719845782, + "learning_rate": 9.883437870566145e-06, + "loss": 1.1607, + "step": 8625 + }, + { + "epoch": 0.49, + "grad_norm": 5.810416629282749, + "learning_rate": 9.883085768463636e-06, + "loss": 1.0684, + "step": 8630 + }, + { + "epoch": 0.49, + "grad_norm": 18.49464688357209, + "learning_rate": 9.88273314165201e-06, + "loss": 1.1466, + "step": 8635 + }, + { + "epoch": 0.49, + "grad_norm": 6.831790418939747, + "learning_rate": 9.882379990169157e-06, + "loss": 1.1272, + "step": 8640 + }, + { + "epoch": 0.49, + "grad_norm": 7.087714408267669, + "learning_rate": 9.882026314053028e-06, + "loss": 1.1517, + "step": 8645 + }, + { + "epoch": 0.49, + "grad_norm": 10.38482569343233, + "learning_rate": 9.881672113341625e-06, + "loss": 1.0987, + "step": 8650 + }, + { + "epoch": 0.49, + "grad_norm": 13.978551408443714, + "learning_rate": 9.881317388073008e-06, + "loss": 1.1682, + "step": 8655 + }, + { + "epoch": 0.49, + "grad_norm": 6.577930016224028, + "learning_rate": 9.880962138285296e-06, + "loss": 1.1361, + "step": 8660 + }, + { + "epoch": 0.49, + "grad_norm": 11.003460191777767, + "learning_rate": 9.88060636401666e-06, + "loss": 1.0991, + "step": 8665 + }, + { + "epoch": 0.49, + "grad_norm": 8.613147576585526, + "learning_rate": 9.880250065305333e-06, + "loss": 1.0562, + "step": 8670 + }, + { + "epoch": 0.49, + "grad_norm": 7.268487046062377, + "learning_rate": 9.879893242189596e-06, + "loss": 1.1133, + "step": 8675 + }, + { + "epoch": 0.49, + "grad_norm": 6.275443322415458, + "learning_rate": 9.879535894707793e-06, + "loss": 1.1094, + "step": 8680 + }, + { + "epoch": 0.49, + "grad_norm": 6.671029277425907, + "learning_rate": 9.879178022898325e-06, + "loss": 1.1468, + "step": 8685 + }, + { + "epoch": 0.49, + "grad_norm": 8.411401122550819, + "learning_rate": 9.878819626799644e-06, + "loss": 1.1176, + "step": 8690 + }, + { + "epoch": 0.49, + "grad_norm": 12.55076792576607, + "learning_rate": 9.878460706450261e-06, + "loss": 1.1391, + "step": 8695 + }, + { + "epoch": 0.49, + "grad_norm": 8.601832477972204, + "learning_rate": 9.878101261888747e-06, + "loss": 1.1068, + "step": 8700 + }, + { + "epoch": 0.49, + "grad_norm": 11.325638205613911, + "learning_rate": 9.877741293153721e-06, + "loss": 1.1427, + "step": 8705 + }, + { + "epoch": 0.49, + "grad_norm": 7.384084077786898, + "learning_rate": 9.877380800283869e-06, + "loss": 1.0938, + "step": 8710 + }, + { + "epoch": 0.49, + "grad_norm": 8.743329701469579, + "learning_rate": 9.877019783317924e-06, + "loss": 1.1259, + "step": 8715 + }, + { + "epoch": 0.49, + "grad_norm": 12.85590857224492, + "learning_rate": 9.876658242294679e-06, + "loss": 1.1482, + "step": 8720 + }, + { + "epoch": 0.49, + "grad_norm": 8.252324703638708, + "learning_rate": 9.876296177252982e-06, + "loss": 1.0955, + "step": 8725 + }, + { + "epoch": 0.49, + "grad_norm": 5.543101434848318, + "learning_rate": 9.87593358823174e-06, + "loss": 1.1265, + "step": 8730 + }, + { + "epoch": 0.49, + "grad_norm": 10.11810196707246, + "learning_rate": 9.875570475269918e-06, + "loss": 1.117, + "step": 8735 + }, + { + "epoch": 0.49, + "grad_norm": 9.870819290397085, + "learning_rate": 9.875206838406528e-06, + "loss": 1.1028, + "step": 8740 + }, + { + "epoch": 0.49, + "grad_norm": 6.127359370236454, + "learning_rate": 9.874842677680646e-06, + "loss": 1.1071, + "step": 8745 + }, + { + "epoch": 0.49, + "grad_norm": 13.461513470793973, + "learning_rate": 9.874477993131408e-06, + "loss": 1.1503, + "step": 8750 + }, + { + "epoch": 0.49, + "grad_norm": 5.90646484454837, + "learning_rate": 9.874112784797996e-06, + "loss": 1.1434, + "step": 8755 + }, + { + "epoch": 0.49, + "grad_norm": 8.401254201075805, + "learning_rate": 9.873747052719653e-06, + "loss": 1.1286, + "step": 8760 + }, + { + "epoch": 0.49, + "grad_norm": 6.199430274042413, + "learning_rate": 9.873380796935678e-06, + "loss": 1.0951, + "step": 8765 + }, + { + "epoch": 0.49, + "grad_norm": 6.247900062417915, + "learning_rate": 9.873014017485432e-06, + "loss": 1.102, + "step": 8770 + }, + { + "epoch": 0.49, + "grad_norm": 7.515539280285301, + "learning_rate": 9.87264671440832e-06, + "loss": 1.1216, + "step": 8775 + }, + { + "epoch": 0.49, + "grad_norm": 8.969039162871018, + "learning_rate": 9.872278887743817e-06, + "loss": 1.106, + "step": 8780 + }, + { + "epoch": 0.49, + "grad_norm": 8.045261243073863, + "learning_rate": 9.871910537531444e-06, + "loss": 1.1104, + "step": 8785 + }, + { + "epoch": 0.5, + "grad_norm": 9.103090105477524, + "learning_rate": 9.87154166381078e-06, + "loss": 1.1338, + "step": 8790 + }, + { + "epoch": 0.5, + "grad_norm": 6.637827420064879, + "learning_rate": 9.871172266621467e-06, + "loss": 1.089, + "step": 8795 + }, + { + "epoch": 0.5, + "grad_norm": 9.192056301224735, + "learning_rate": 9.870802346003194e-06, + "loss": 1.159, + "step": 8800 + }, + { + "epoch": 0.5, + "grad_norm": 15.177519571168098, + "learning_rate": 9.870431901995713e-06, + "loss": 1.1193, + "step": 8805 + }, + { + "epoch": 0.5, + "grad_norm": 10.03963120388044, + "learning_rate": 9.87006093463883e-06, + "loss": 1.1666, + "step": 8810 + }, + { + "epoch": 0.5, + "grad_norm": 9.5455509691663, + "learning_rate": 9.869689443972407e-06, + "loss": 1.1137, + "step": 8815 + }, + { + "epoch": 0.5, + "grad_norm": 12.175597327505168, + "learning_rate": 9.869317430036361e-06, + "loss": 1.1661, + "step": 8820 + }, + { + "epoch": 0.5, + "grad_norm": 7.13390041830503, + "learning_rate": 9.868944892870667e-06, + "loss": 1.161, + "step": 8825 + }, + { + "epoch": 0.5, + "grad_norm": 8.106778316679549, + "learning_rate": 9.868571832515356e-06, + "loss": 1.1145, + "step": 8830 + }, + { + "epoch": 0.5, + "grad_norm": 7.580016108080615, + "learning_rate": 9.868198249010515e-06, + "loss": 1.1323, + "step": 8835 + }, + { + "epoch": 0.5, + "grad_norm": 11.646913338222264, + "learning_rate": 9.867824142396287e-06, + "loss": 1.1246, + "step": 8840 + }, + { + "epoch": 0.5, + "grad_norm": 14.789300711494043, + "learning_rate": 9.86744951271287e-06, + "loss": 1.1512, + "step": 8845 + }, + { + "epoch": 0.5, + "grad_norm": 7.44934645700223, + "learning_rate": 9.867074360000524e-06, + "loss": 1.1336, + "step": 8850 + }, + { + "epoch": 0.5, + "grad_norm": 14.130068306637817, + "learning_rate": 9.866698684299556e-06, + "loss": 1.1355, + "step": 8855 + }, + { + "epoch": 0.5, + "grad_norm": 23.38322882222291, + "learning_rate": 9.866322485650338e-06, + "loss": 1.1629, + "step": 8860 + }, + { + "epoch": 0.5, + "grad_norm": 23.17977244139066, + "learning_rate": 9.86594576409329e-06, + "loss": 1.1563, + "step": 8865 + }, + { + "epoch": 0.5, + "grad_norm": 36.22213876931674, + "learning_rate": 9.865568519668896e-06, + "loss": 1.0806, + "step": 8870 + }, + { + "epoch": 0.5, + "grad_norm": 7.914144236726561, + "learning_rate": 9.865190752417691e-06, + "loss": 1.1389, + "step": 8875 + }, + { + "epoch": 0.5, + "grad_norm": 18.950702674673007, + "learning_rate": 9.864812462380267e-06, + "loss": 1.1194, + "step": 8880 + }, + { + "epoch": 0.5, + "grad_norm": 21.554579307891707, + "learning_rate": 9.864433649597274e-06, + "loss": 1.0886, + "step": 8885 + }, + { + "epoch": 0.5, + "grad_norm": 8.952643715967486, + "learning_rate": 9.864054314109417e-06, + "loss": 1.1137, + "step": 8890 + }, + { + "epoch": 0.5, + "grad_norm": 8.188043647866964, + "learning_rate": 9.863674455957457e-06, + "loss": 1.1232, + "step": 8895 + }, + { + "epoch": 0.5, + "grad_norm": 6.059623519784218, + "learning_rate": 9.863294075182212e-06, + "loss": 1.0934, + "step": 8900 + }, + { + "epoch": 0.5, + "grad_norm": 5.50108779868795, + "learning_rate": 9.862913171824557e-06, + "loss": 1.0984, + "step": 8905 + }, + { + "epoch": 0.5, + "grad_norm": 23.470460801032964, + "learning_rate": 9.862531745925419e-06, + "loss": 1.1431, + "step": 8910 + }, + { + "epoch": 0.5, + "grad_norm": 5.635266053974252, + "learning_rate": 9.862149797525785e-06, + "loss": 1.1121, + "step": 8915 + }, + { + "epoch": 0.5, + "grad_norm": 13.74285422981206, + "learning_rate": 9.861767326666695e-06, + "loss": 1.0956, + "step": 8920 + }, + { + "epoch": 0.5, + "grad_norm": 7.988992153452946, + "learning_rate": 9.86138433338925e-06, + "loss": 1.1517, + "step": 8925 + }, + { + "epoch": 0.5, + "grad_norm": 7.902158338876868, + "learning_rate": 9.861000817734605e-06, + "loss": 1.1701, + "step": 8930 + }, + { + "epoch": 0.5, + "grad_norm": 8.715489638432372, + "learning_rate": 9.860616779743968e-06, + "loss": 1.1776, + "step": 8935 + }, + { + "epoch": 0.5, + "grad_norm": 14.804661819538458, + "learning_rate": 9.860232219458607e-06, + "loss": 1.1341, + "step": 8940 + }, + { + "epoch": 0.5, + "grad_norm": 43.304049769680255, + "learning_rate": 9.859847136919842e-06, + "loss": 1.1533, + "step": 8945 + }, + { + "epoch": 0.5, + "grad_norm": 26.25033258227799, + "learning_rate": 9.859461532169057e-06, + "loss": 1.1089, + "step": 8950 + }, + { + "epoch": 0.5, + "grad_norm": 10.19941763309832, + "learning_rate": 9.859075405247682e-06, + "loss": 1.1551, + "step": 8955 + }, + { + "epoch": 0.5, + "grad_norm": 32.981446791884494, + "learning_rate": 9.85868875619721e-06, + "loss": 1.1759, + "step": 8960 + }, + { + "epoch": 0.51, + "grad_norm": 21.893948142009414, + "learning_rate": 9.858301585059188e-06, + "loss": 1.1261, + "step": 8965 + }, + { + "epoch": 0.51, + "grad_norm": 9.915641489935265, + "learning_rate": 9.857913891875222e-06, + "loss": 1.1577, + "step": 8970 + }, + { + "epoch": 0.51, + "grad_norm": 15.042933842324215, + "learning_rate": 9.857525676686966e-06, + "loss": 1.1192, + "step": 8975 + }, + { + "epoch": 0.51, + "grad_norm": 27.99212123543072, + "learning_rate": 9.857136939536137e-06, + "loss": 1.117, + "step": 8980 + }, + { + "epoch": 0.51, + "grad_norm": 18.57975891297702, + "learning_rate": 9.856747680464511e-06, + "loss": 1.1142, + "step": 8985 + }, + { + "epoch": 0.51, + "grad_norm": 44.46051231204827, + "learning_rate": 9.856357899513911e-06, + "loss": 1.1014, + "step": 8990 + }, + { + "epoch": 0.51, + "grad_norm": 8.635132820820676, + "learning_rate": 9.85596759672622e-06, + "loss": 1.0509, + "step": 8995 + }, + { + "epoch": 0.51, + "grad_norm": 22.916868614206592, + "learning_rate": 9.855576772143383e-06, + "loss": 1.1111, + "step": 9000 + }, + { + "epoch": 0.51, + "grad_norm": 13.594514080170606, + "learning_rate": 9.85518542580739e-06, + "loss": 1.1419, + "step": 9005 + }, + { + "epoch": 0.51, + "grad_norm": 20.252084074455954, + "learning_rate": 9.854793557760298e-06, + "loss": 1.0845, + "step": 9010 + }, + { + "epoch": 0.51, + "grad_norm": 25.571367513281952, + "learning_rate": 9.85440116804421e-06, + "loss": 1.1459, + "step": 9015 + }, + { + "epoch": 0.51, + "grad_norm": 16.735541352000926, + "learning_rate": 9.854008256701293e-06, + "loss": 1.1173, + "step": 9020 + }, + { + "epoch": 0.51, + "grad_norm": 12.966493016531478, + "learning_rate": 9.853614823773768e-06, + "loss": 1.1297, + "step": 9025 + }, + { + "epoch": 0.51, + "grad_norm": 8.44739383925974, + "learning_rate": 9.853220869303907e-06, + "loss": 1.1311, + "step": 9030 + }, + { + "epoch": 0.51, + "grad_norm": 6.261661916692985, + "learning_rate": 9.852826393334045e-06, + "loss": 1.1272, + "step": 9035 + }, + { + "epoch": 0.51, + "grad_norm": 8.115260549378783, + "learning_rate": 9.852431395906571e-06, + "loss": 1.1183, + "step": 9040 + }, + { + "epoch": 0.51, + "grad_norm": 6.911338588302018, + "learning_rate": 9.852035877063928e-06, + "loss": 1.1403, + "step": 9045 + }, + { + "epoch": 0.51, + "grad_norm": 6.65319648960198, + "learning_rate": 9.851639836848617e-06, + "loss": 1.1183, + "step": 9050 + }, + { + "epoch": 0.51, + "grad_norm": 5.598607649715242, + "learning_rate": 9.851243275303193e-06, + "loss": 1.1406, + "step": 9055 + }, + { + "epoch": 0.51, + "grad_norm": 14.302947523515613, + "learning_rate": 9.850846192470268e-06, + "loss": 1.1586, + "step": 9060 + }, + { + "epoch": 0.51, + "grad_norm": 6.521566317114554, + "learning_rate": 9.850448588392512e-06, + "loss": 1.0928, + "step": 9065 + }, + { + "epoch": 0.51, + "grad_norm": 9.635274741612458, + "learning_rate": 9.850050463112649e-06, + "loss": 1.1316, + "step": 9070 + }, + { + "epoch": 0.51, + "grad_norm": 23.599549137314003, + "learning_rate": 9.849651816673458e-06, + "loss": 1.1373, + "step": 9075 + }, + { + "epoch": 0.51, + "grad_norm": 9.554228488313795, + "learning_rate": 9.849252649117776e-06, + "loss": 1.1979, + "step": 9080 + }, + { + "epoch": 0.51, + "grad_norm": 5.777262147948352, + "learning_rate": 9.848852960488496e-06, + "loss": 1.1417, + "step": 9085 + }, + { + "epoch": 0.51, + "grad_norm": 10.990552729003635, + "learning_rate": 9.848452750828566e-06, + "loss": 1.1574, + "step": 9090 + }, + { + "epoch": 0.51, + "grad_norm": 10.988614664868656, + "learning_rate": 9.848052020180989e-06, + "loss": 1.1198, + "step": 9095 + }, + { + "epoch": 0.51, + "grad_norm": 8.631348102925648, + "learning_rate": 9.847650768588828e-06, + "loss": 1.1256, + "step": 9100 + }, + { + "epoch": 0.51, + "grad_norm": 8.998195523071896, + "learning_rate": 9.847248996095195e-06, + "loss": 1.1529, + "step": 9105 + }, + { + "epoch": 0.51, + "grad_norm": 7.971478396242029, + "learning_rate": 9.846846702743266e-06, + "loss": 1.1416, + "step": 9110 + }, + { + "epoch": 0.51, + "grad_norm": 5.8668717586872745, + "learning_rate": 9.84644388857627e-06, + "loss": 1.1337, + "step": 9115 + }, + { + "epoch": 0.51, + "grad_norm": 9.713458049355822, + "learning_rate": 9.846040553637488e-06, + "loss": 1.0957, + "step": 9120 + }, + { + "epoch": 0.51, + "grad_norm": 5.565561738164757, + "learning_rate": 9.84563669797026e-06, + "loss": 1.1209, + "step": 9125 + }, + { + "epoch": 0.51, + "grad_norm": 12.321301054180532, + "learning_rate": 9.845232321617984e-06, + "loss": 1.1173, + "step": 9130 + }, + { + "epoch": 0.51, + "grad_norm": 6.966828282551532, + "learning_rate": 9.844827424624113e-06, + "loss": 1.1384, + "step": 9135 + }, + { + "epoch": 0.51, + "grad_norm": 16.342561458763036, + "learning_rate": 9.844422007032151e-06, + "loss": 1.1627, + "step": 9140 + }, + { + "epoch": 0.52, + "grad_norm": 6.1135760028307615, + "learning_rate": 9.844016068885667e-06, + "loss": 1.1183, + "step": 9145 + }, + { + "epoch": 0.52, + "grad_norm": 9.567234212670838, + "learning_rate": 9.843609610228275e-06, + "loss": 1.0926, + "step": 9150 + }, + { + "epoch": 0.52, + "grad_norm": 7.844832749172317, + "learning_rate": 9.843202631103657e-06, + "loss": 1.101, + "step": 9155 + }, + { + "epoch": 0.52, + "grad_norm": 8.7398542540443, + "learning_rate": 9.84279513155554e-06, + "loss": 1.0889, + "step": 9160 + }, + { + "epoch": 0.52, + "grad_norm": 6.016156765500929, + "learning_rate": 9.842387111627713e-06, + "loss": 1.1259, + "step": 9165 + }, + { + "epoch": 0.52, + "grad_norm": 8.424652351047815, + "learning_rate": 9.84197857136402e-06, + "loss": 1.1377, + "step": 9170 + }, + { + "epoch": 0.52, + "grad_norm": 5.964914639726848, + "learning_rate": 9.84156951080836e-06, + "loss": 1.1055, + "step": 9175 + }, + { + "epoch": 0.52, + "grad_norm": 8.336882081168872, + "learning_rate": 9.84115993000469e-06, + "loss": 1.1356, + "step": 9180 + }, + { + "epoch": 0.52, + "grad_norm": 12.574342064183927, + "learning_rate": 9.840749828997017e-06, + "loss": 1.1032, + "step": 9185 + }, + { + "epoch": 0.52, + "grad_norm": 11.71087525020578, + "learning_rate": 9.840339207829413e-06, + "loss": 1.1026, + "step": 9190 + }, + { + "epoch": 0.52, + "grad_norm": 7.347434640965776, + "learning_rate": 9.839928066545999e-06, + "loss": 1.1359, + "step": 9195 + }, + { + "epoch": 0.52, + "grad_norm": 10.793520773613837, + "learning_rate": 9.839516405190955e-06, + "loss": 1.1373, + "step": 9200 + }, + { + "epoch": 0.52, + "grad_norm": 7.815179199001182, + "learning_rate": 9.839104223808512e-06, + "loss": 1.135, + "step": 9205 + }, + { + "epoch": 0.52, + "grad_norm": 6.861131066719821, + "learning_rate": 9.838691522442964e-06, + "loss": 1.1211, + "step": 9210 + }, + { + "epoch": 0.52, + "grad_norm": 7.0619283350116895, + "learning_rate": 9.83827830113866e-06, + "loss": 1.1362, + "step": 9215 + }, + { + "epoch": 0.52, + "grad_norm": 6.937969609258512, + "learning_rate": 9.837864559939998e-06, + "loss": 1.1229, + "step": 9220 + }, + { + "epoch": 0.52, + "grad_norm": 5.942605201843459, + "learning_rate": 9.837450298891437e-06, + "loss": 1.0998, + "step": 9225 + }, + { + "epoch": 0.52, + "grad_norm": 5.948647943576111, + "learning_rate": 9.837035518037495e-06, + "loss": 1.1823, + "step": 9230 + }, + { + "epoch": 0.52, + "grad_norm": 4.964748069626795, + "learning_rate": 9.836620217422735e-06, + "loss": 1.0749, + "step": 9235 + }, + { + "epoch": 0.52, + "grad_norm": 5.750341701906731, + "learning_rate": 9.83620439709179e-06, + "loss": 1.1699, + "step": 9240 + }, + { + "epoch": 0.52, + "grad_norm": 9.70908142575643, + "learning_rate": 9.835788057089337e-06, + "loss": 1.1368, + "step": 9245 + }, + { + "epoch": 0.52, + "grad_norm": 30.98794907794137, + "learning_rate": 9.835371197460117e-06, + "loss": 1.11, + "step": 9250 + }, + { + "epoch": 0.52, + "grad_norm": 25.88380382654531, + "learning_rate": 9.83495381824892e-06, + "loss": 1.1427, + "step": 9255 + }, + { + "epoch": 0.52, + "grad_norm": 10.790206155729912, + "learning_rate": 9.834535919500598e-06, + "loss": 1.1571, + "step": 9260 + }, + { + "epoch": 0.52, + "grad_norm": 10.644767137946909, + "learning_rate": 9.834117501260054e-06, + "loss": 1.1426, + "step": 9265 + }, + { + "epoch": 0.52, + "grad_norm": 9.418037283719267, + "learning_rate": 9.833698563572252e-06, + "loss": 1.1347, + "step": 9270 + }, + { + "epoch": 0.52, + "grad_norm": 20.399167830787746, + "learning_rate": 9.833279106482205e-06, + "loss": 1.0875, + "step": 9275 + }, + { + "epoch": 0.52, + "grad_norm": 19.52690583505267, + "learning_rate": 9.832859130034987e-06, + "loss": 1.1515, + "step": 9280 + }, + { + "epoch": 0.52, + "grad_norm": 11.841054786410181, + "learning_rate": 9.832438634275726e-06, + "loss": 1.0755, + "step": 9285 + }, + { + "epoch": 0.52, + "grad_norm": 14.689457248869871, + "learning_rate": 9.83201761924961e-06, + "loss": 1.1238, + "step": 9290 + }, + { + "epoch": 0.52, + "grad_norm": 6.5688380761419305, + "learning_rate": 9.83159608500187e-06, + "loss": 1.0593, + "step": 9295 + }, + { + "epoch": 0.52, + "grad_norm": 14.710014392324094, + "learning_rate": 9.831174031577812e-06, + "loss": 1.1344, + "step": 9300 + }, + { + "epoch": 0.52, + "grad_norm": 7.699248344298323, + "learning_rate": 9.83075145902278e-06, + "loss": 1.119, + "step": 9305 + }, + { + "epoch": 0.52, + "grad_norm": 6.59237732027422, + "learning_rate": 9.830328367382185e-06, + "loss": 1.0859, + "step": 9310 + }, + { + "epoch": 0.52, + "grad_norm": 18.493955902375863, + "learning_rate": 9.82990475670149e-06, + "loss": 1.1008, + "step": 9315 + }, + { + "epoch": 0.53, + "grad_norm": 16.571250308510237, + "learning_rate": 9.82948062702621e-06, + "loss": 1.1307, + "step": 9320 + }, + { + "epoch": 0.53, + "grad_norm": 16.61127247389848, + "learning_rate": 9.829055978401925e-06, + "loss": 1.1027, + "step": 9325 + }, + { + "epoch": 0.53, + "grad_norm": 15.779161856424208, + "learning_rate": 9.828630810874262e-06, + "loss": 1.0758, + "step": 9330 + }, + { + "epoch": 0.53, + "grad_norm": 8.012146361952567, + "learning_rate": 9.828205124488909e-06, + "loss": 1.1016, + "step": 9335 + }, + { + "epoch": 0.53, + "grad_norm": 9.38028933904781, + "learning_rate": 9.827778919291606e-06, + "loss": 1.181, + "step": 9340 + }, + { + "epoch": 0.53, + "grad_norm": 9.999416557246386, + "learning_rate": 9.827352195328151e-06, + "loss": 1.1356, + "step": 9345 + }, + { + "epoch": 0.53, + "grad_norm": 7.390646370920223, + "learning_rate": 9.8269249526444e-06, + "loss": 1.1025, + "step": 9350 + }, + { + "epoch": 0.53, + "grad_norm": 7.002946454465762, + "learning_rate": 9.826497191286257e-06, + "loss": 1.1567, + "step": 9355 + }, + { + "epoch": 0.53, + "grad_norm": 5.933892786820671, + "learning_rate": 9.826068911299692e-06, + "loss": 1.1228, + "step": 9360 + }, + { + "epoch": 0.53, + "grad_norm": 6.402605896074404, + "learning_rate": 9.825640112730723e-06, + "loss": 1.1722, + "step": 9365 + }, + { + "epoch": 0.53, + "grad_norm": 14.073746902114985, + "learning_rate": 9.825210795625426e-06, + "loss": 1.1494, + "step": 9370 + }, + { + "epoch": 0.53, + "grad_norm": 9.505770539857927, + "learning_rate": 9.824780960029934e-06, + "loss": 1.1371, + "step": 9375 + }, + { + "epoch": 0.53, + "grad_norm": 6.13806849938393, + "learning_rate": 9.824350605990434e-06, + "loss": 1.1668, + "step": 9380 + }, + { + "epoch": 0.53, + "grad_norm": 8.390959115620975, + "learning_rate": 9.823919733553171e-06, + "loss": 1.0758, + "step": 9385 + }, + { + "epoch": 0.53, + "grad_norm": 6.114862692148765, + "learning_rate": 9.823488342764442e-06, + "loss": 1.0858, + "step": 9390 + }, + { + "epoch": 0.53, + "grad_norm": 7.306169585772543, + "learning_rate": 9.823056433670603e-06, + "loss": 1.1119, + "step": 9395 + }, + { + "epoch": 0.53, + "grad_norm": 6.935150769462741, + "learning_rate": 9.822624006318065e-06, + "loss": 1.0986, + "step": 9400 + }, + { + "epoch": 0.53, + "grad_norm": 6.6496393432298495, + "learning_rate": 9.822191060753295e-06, + "loss": 1.1087, + "step": 9405 + }, + { + "epoch": 0.53, + "grad_norm": 10.943038279068391, + "learning_rate": 9.821757597022811e-06, + "loss": 1.121, + "step": 9410 + }, + { + "epoch": 0.53, + "grad_norm": 5.326836780816448, + "learning_rate": 9.821323615173195e-06, + "loss": 1.101, + "step": 9415 + }, + { + "epoch": 0.53, + "grad_norm": 5.551718225947079, + "learning_rate": 9.820889115251078e-06, + "loss": 1.0919, + "step": 9420 + }, + { + "epoch": 0.53, + "grad_norm": 6.127512681130131, + "learning_rate": 9.820454097303148e-06, + "loss": 1.132, + "step": 9425 + }, + { + "epoch": 0.53, + "grad_norm": 7.432004802350892, + "learning_rate": 9.820018561376153e-06, + "loss": 1.1897, + "step": 9430 + }, + { + "epoch": 0.53, + "grad_norm": 7.049844572848882, + "learning_rate": 9.81958250751689e-06, + "loss": 1.0832, + "step": 9435 + }, + { + "epoch": 0.53, + "grad_norm": 6.318701917149032, + "learning_rate": 9.819145935772217e-06, + "loss": 1.1318, + "step": 9440 + }, + { + "epoch": 0.53, + "grad_norm": 15.985454154216166, + "learning_rate": 9.818708846189041e-06, + "loss": 1.098, + "step": 9445 + }, + { + "epoch": 0.53, + "grad_norm": 7.19872846134024, + "learning_rate": 9.818271238814339e-06, + "loss": 1.1313, + "step": 9450 + }, + { + "epoch": 0.53, + "grad_norm": 5.521411137478754, + "learning_rate": 9.817833113695123e-06, + "loss": 1.132, + "step": 9455 + }, + { + "epoch": 0.53, + "grad_norm": 8.820409261480718, + "learning_rate": 9.817394470878479e-06, + "loss": 1.1095, + "step": 9460 + }, + { + "epoch": 0.53, + "grad_norm": 5.5678130983731124, + "learning_rate": 9.816955310411537e-06, + "loss": 1.0875, + "step": 9465 + }, + { + "epoch": 0.53, + "grad_norm": 6.667916390733254, + "learning_rate": 9.816515632341487e-06, + "loss": 1.1544, + "step": 9470 + }, + { + "epoch": 0.53, + "grad_norm": 8.102140788856543, + "learning_rate": 9.816075436715577e-06, + "loss": 1.1282, + "step": 9475 + }, + { + "epoch": 0.53, + "grad_norm": 6.432976660373403, + "learning_rate": 9.815634723581106e-06, + "loss": 1.0735, + "step": 9480 + }, + { + "epoch": 0.53, + "grad_norm": 5.8805646616392675, + "learning_rate": 9.815193492985432e-06, + "loss": 1.1147, + "step": 9485 + }, + { + "epoch": 0.53, + "grad_norm": 8.43628359747152, + "learning_rate": 9.814751744975964e-06, + "loss": 1.1482, + "step": 9490 + }, + { + "epoch": 0.53, + "grad_norm": 8.433577785768476, + "learning_rate": 9.814309479600173e-06, + "loss": 1.1606, + "step": 9495 + }, + { + "epoch": 0.54, + "grad_norm": 5.866263071903223, + "learning_rate": 9.813866696905581e-06, + "loss": 1.1712, + "step": 9500 + }, + { + "epoch": 0.54, + "grad_norm": 49.551941694609866, + "learning_rate": 9.813423396939767e-06, + "loss": 1.1682, + "step": 9505 + }, + { + "epoch": 0.54, + "grad_norm": 19.95562468305858, + "learning_rate": 9.812979579750367e-06, + "loss": 1.1726, + "step": 9510 + }, + { + "epoch": 0.54, + "grad_norm": 16.918281155827465, + "learning_rate": 9.812535245385069e-06, + "loss": 1.1331, + "step": 9515 + }, + { + "epoch": 0.54, + "grad_norm": 6.239698737640707, + "learning_rate": 9.81209039389162e-06, + "loss": 1.1467, + "step": 9520 + }, + { + "epoch": 0.54, + "grad_norm": 7.542450852206898, + "learning_rate": 9.811645025317819e-06, + "loss": 1.0984, + "step": 9525 + }, + { + "epoch": 0.54, + "grad_norm": 17.839824606202814, + "learning_rate": 9.811199139711528e-06, + "loss": 1.1459, + "step": 9530 + }, + { + "epoch": 0.54, + "grad_norm": 19.062707119434517, + "learning_rate": 9.810752737120652e-06, + "loss": 1.1725, + "step": 9535 + }, + { + "epoch": 0.54, + "grad_norm": 11.920835764199387, + "learning_rate": 9.810305817593167e-06, + "loss": 1.146, + "step": 9540 + }, + { + "epoch": 0.54, + "grad_norm": 9.823441097598744, + "learning_rate": 9.809858381177089e-06, + "loss": 1.1867, + "step": 9545 + }, + { + "epoch": 0.54, + "grad_norm": 18.483353058092547, + "learning_rate": 9.809410427920501e-06, + "loss": 1.1255, + "step": 9550 + }, + { + "epoch": 0.54, + "grad_norm": 9.792922139008056, + "learning_rate": 9.808961957871539e-06, + "loss": 1.0821, + "step": 9555 + }, + { + "epoch": 0.54, + "grad_norm": 6.5466522897197414, + "learning_rate": 9.80851297107839e-06, + "loss": 1.153, + "step": 9560 + }, + { + "epoch": 0.54, + "grad_norm": 8.5036408297158, + "learning_rate": 9.8080634675893e-06, + "loss": 1.1436, + "step": 9565 + }, + { + "epoch": 0.54, + "grad_norm": 30.775175095239852, + "learning_rate": 9.80761344745257e-06, + "loss": 1.1672, + "step": 9570 + }, + { + "epoch": 0.54, + "grad_norm": 7.275504767645358, + "learning_rate": 9.80716291071656e-06, + "loss": 1.1187, + "step": 9575 + }, + { + "epoch": 0.54, + "grad_norm": 18.957991468277854, + "learning_rate": 9.806711857429678e-06, + "loss": 1.1397, + "step": 9580 + }, + { + "epoch": 0.54, + "grad_norm": 17.726706035900417, + "learning_rate": 9.806260287640393e-06, + "loss": 1.0869, + "step": 9585 + }, + { + "epoch": 0.54, + "grad_norm": 23.00409452812483, + "learning_rate": 9.805808201397229e-06, + "loss": 1.0672, + "step": 9590 + }, + { + "epoch": 0.54, + "grad_norm": 8.166416366509448, + "learning_rate": 9.805355598748766e-06, + "loss": 1.1041, + "step": 9595 + }, + { + "epoch": 0.54, + "grad_norm": 22.53928607518761, + "learning_rate": 9.804902479743632e-06, + "loss": 1.1162, + "step": 9600 + }, + { + "epoch": 0.54, + "grad_norm": 34.38349793200656, + "learning_rate": 9.804448844430523e-06, + "loss": 1.129, + "step": 9605 + }, + { + "epoch": 0.54, + "grad_norm": 16.854031830090808, + "learning_rate": 9.803994692858184e-06, + "loss": 1.1163, + "step": 9610 + }, + { + "epoch": 0.54, + "grad_norm": 8.911815034854689, + "learning_rate": 9.80354002507541e-06, + "loss": 1.1344, + "step": 9615 + }, + { + "epoch": 0.54, + "grad_norm": 5.943951719386559, + "learning_rate": 9.803084841131063e-06, + "loss": 1.0573, + "step": 9620 + }, + { + "epoch": 0.54, + "grad_norm": 11.655449294062125, + "learning_rate": 9.802629141074053e-06, + "loss": 1.1198, + "step": 9625 + }, + { + "epoch": 0.54, + "grad_norm": 5.731686298770386, + "learning_rate": 9.802172924953344e-06, + "loss": 1.1135, + "step": 9630 + }, + { + "epoch": 0.54, + "grad_norm": 5.784653923093681, + "learning_rate": 9.801716192817962e-06, + "loss": 1.1518, + "step": 9635 + }, + { + "epoch": 0.54, + "grad_norm": 6.075680462367813, + "learning_rate": 9.801258944716982e-06, + "loss": 1.1319, + "step": 9640 + }, + { + "epoch": 0.54, + "grad_norm": 9.699910823794717, + "learning_rate": 9.800801180699542e-06, + "loss": 1.1177, + "step": 9645 + }, + { + "epoch": 0.54, + "grad_norm": 12.696372845431933, + "learning_rate": 9.800342900814825e-06, + "loss": 1.1271, + "step": 9650 + }, + { + "epoch": 0.54, + "grad_norm": 10.95683377412942, + "learning_rate": 9.799884105112078e-06, + "loss": 1.199, + "step": 9655 + }, + { + "epoch": 0.54, + "grad_norm": 9.02638592169939, + "learning_rate": 9.7994247936406e-06, + "loss": 1.1163, + "step": 9660 + }, + { + "epoch": 0.54, + "grad_norm": 14.014399656225942, + "learning_rate": 9.798964966449746e-06, + "loss": 1.1542, + "step": 9665 + }, + { + "epoch": 0.54, + "grad_norm": 12.85741257340364, + "learning_rate": 9.798504623588929e-06, + "loss": 1.1081, + "step": 9670 + }, + { + "epoch": 0.55, + "grad_norm": 28.312484526129953, + "learning_rate": 9.798043765107612e-06, + "loss": 1.1111, + "step": 9675 + }, + { + "epoch": 0.55, + "grad_norm": 16.087873458378628, + "learning_rate": 9.797582391055317e-06, + "loss": 1.1902, + "step": 9680 + }, + { + "epoch": 0.55, + "grad_norm": 16.109718740528173, + "learning_rate": 9.79712050148162e-06, + "loss": 1.1585, + "step": 9685 + }, + { + "epoch": 0.55, + "grad_norm": 6.241416079713798, + "learning_rate": 9.796658096436155e-06, + "loss": 1.1262, + "step": 9690 + }, + { + "epoch": 0.55, + "grad_norm": 6.099802194732407, + "learning_rate": 9.796195175968609e-06, + "loss": 1.1314, + "step": 9695 + }, + { + "epoch": 0.55, + "grad_norm": 5.806323165690559, + "learning_rate": 9.79573174012872e-06, + "loss": 1.1419, + "step": 9700 + }, + { + "epoch": 0.55, + "grad_norm": 5.84510943304211, + "learning_rate": 9.795267788966293e-06, + "loss": 1.079, + "step": 9705 + }, + { + "epoch": 0.55, + "grad_norm": 9.244004409262631, + "learning_rate": 9.79480332253118e-06, + "loss": 1.1117, + "step": 9710 + }, + { + "epoch": 0.55, + "grad_norm": 25.94214805244186, + "learning_rate": 9.794338340873288e-06, + "loss": 1.1416, + "step": 9715 + }, + { + "epoch": 0.55, + "grad_norm": 9.56855739260311, + "learning_rate": 9.793872844042582e-06, + "loss": 1.1155, + "step": 9720 + }, + { + "epoch": 0.55, + "grad_norm": 11.1216852701226, + "learning_rate": 9.793406832089084e-06, + "loss": 1.0934, + "step": 9725 + }, + { + "epoch": 0.55, + "grad_norm": 21.617542471609994, + "learning_rate": 9.792940305062863e-06, + "loss": 1.1141, + "step": 9730 + }, + { + "epoch": 0.55, + "grad_norm": 9.095356050481866, + "learning_rate": 9.792473263014056e-06, + "loss": 1.1001, + "step": 9735 + }, + { + "epoch": 0.55, + "grad_norm": 7.271670284567916, + "learning_rate": 9.792005705992847e-06, + "loss": 1.0696, + "step": 9740 + }, + { + "epoch": 0.55, + "grad_norm": 8.60935982925622, + "learning_rate": 9.791537634049473e-06, + "loss": 1.1412, + "step": 9745 + }, + { + "epoch": 0.55, + "grad_norm": 6.4281297841404506, + "learning_rate": 9.791069047234237e-06, + "loss": 1.126, + "step": 9750 + }, + { + "epoch": 0.55, + "grad_norm": 16.9394370256431, + "learning_rate": 9.790599945597484e-06, + "loss": 1.1089, + "step": 9755 + }, + { + "epoch": 0.55, + "grad_norm": 7.121878911508034, + "learning_rate": 9.790130329189626e-06, + "loss": 1.1086, + "step": 9760 + }, + { + "epoch": 0.55, + "grad_norm": 6.022861117918076, + "learning_rate": 9.789660198061124e-06, + "loss": 1.1442, + "step": 9765 + }, + { + "epoch": 0.55, + "grad_norm": 20.53097471180735, + "learning_rate": 9.789189552262494e-06, + "loss": 1.1686, + "step": 9770 + }, + { + "epoch": 0.55, + "grad_norm": 7.115874036538647, + "learning_rate": 9.788718391844312e-06, + "loss": 1.1327, + "step": 9775 + }, + { + "epoch": 0.55, + "grad_norm": 9.10906533014789, + "learning_rate": 9.788246716857203e-06, + "loss": 1.1365, + "step": 9780 + }, + { + "epoch": 0.55, + "grad_norm": 12.257897698293505, + "learning_rate": 9.787774527351853e-06, + "loss": 1.1272, + "step": 9785 + }, + { + "epoch": 0.55, + "grad_norm": 8.987710480713961, + "learning_rate": 9.787301823379e-06, + "loss": 1.1448, + "step": 9790 + }, + { + "epoch": 0.55, + "grad_norm": 6.36636842861665, + "learning_rate": 9.786828604989438e-06, + "loss": 1.1113, + "step": 9795 + }, + { + "epoch": 0.55, + "grad_norm": 15.169325055729368, + "learning_rate": 9.786354872234016e-06, + "loss": 1.1119, + "step": 9800 + }, + { + "epoch": 0.55, + "grad_norm": 5.567259582207006, + "learning_rate": 9.78588062516364e-06, + "loss": 1.0768, + "step": 9805 + }, + { + "epoch": 0.55, + "grad_norm": 5.523502957031439, + "learning_rate": 9.78540586382927e-06, + "loss": 1.0812, + "step": 9810 + }, + { + "epoch": 0.55, + "grad_norm": 6.818214437373952, + "learning_rate": 9.784930588281918e-06, + "loss": 1.1467, + "step": 9815 + }, + { + "epoch": 0.55, + "grad_norm": 5.5736218420296995, + "learning_rate": 9.784454798572658e-06, + "loss": 1.1653, + "step": 9820 + }, + { + "epoch": 0.55, + "grad_norm": 5.68106601324899, + "learning_rate": 9.783978494752613e-06, + "loss": 1.0969, + "step": 9825 + }, + { + "epoch": 0.55, + "grad_norm": 7.622379031185475, + "learning_rate": 9.783501676872968e-06, + "loss": 1.1296, + "step": 9830 + }, + { + "epoch": 0.55, + "grad_norm": 13.75996294680562, + "learning_rate": 9.783024344984956e-06, + "loss": 1.0826, + "step": 9835 + }, + { + "epoch": 0.55, + "grad_norm": 9.334303857518385, + "learning_rate": 9.782546499139867e-06, + "loss": 1.1397, + "step": 9840 + }, + { + "epoch": 0.55, + "grad_norm": 6.214488195724635, + "learning_rate": 9.782068139389051e-06, + "loss": 1.1237, + "step": 9845 + }, + { + "epoch": 0.55, + "grad_norm": 6.796337377225278, + "learning_rate": 9.781589265783909e-06, + "loss": 1.0843, + "step": 9850 + }, + { + "epoch": 0.56, + "grad_norm": 5.332913308661327, + "learning_rate": 9.781109878375897e-06, + "loss": 1.089, + "step": 9855 + }, + { + "epoch": 0.56, + "grad_norm": 7.006643164952569, + "learning_rate": 9.78062997721653e-06, + "loss": 1.1162, + "step": 9860 + }, + { + "epoch": 0.56, + "grad_norm": 8.827538006070318, + "learning_rate": 9.78014956235737e-06, + "loss": 1.0892, + "step": 9865 + }, + { + "epoch": 0.56, + "grad_norm": 7.829102172200375, + "learning_rate": 9.779668633850046e-06, + "loss": 1.0721, + "step": 9870 + }, + { + "epoch": 0.56, + "grad_norm": 11.863495499204848, + "learning_rate": 9.779187191746232e-06, + "loss": 1.0963, + "step": 9875 + }, + { + "epoch": 0.56, + "grad_norm": 8.755519011212401, + "learning_rate": 9.778705236097661e-06, + "loss": 1.1033, + "step": 9880 + }, + { + "epoch": 0.56, + "grad_norm": 9.485676461703726, + "learning_rate": 9.778222766956122e-06, + "loss": 1.1395, + "step": 9885 + }, + { + "epoch": 0.56, + "grad_norm": 7.514552674263018, + "learning_rate": 9.777739784373461e-06, + "loss": 1.1078, + "step": 9890 + }, + { + "epoch": 0.56, + "grad_norm": 8.024209150490186, + "learning_rate": 9.777256288401573e-06, + "loss": 1.1041, + "step": 9895 + }, + { + "epoch": 0.56, + "grad_norm": 6.417370515958933, + "learning_rate": 9.776772279092414e-06, + "loss": 1.1283, + "step": 9900 + }, + { + "epoch": 0.56, + "grad_norm": 14.681527551512538, + "learning_rate": 9.776287756497991e-06, + "loss": 1.0713, + "step": 9905 + }, + { + "epoch": 0.56, + "grad_norm": 13.332222561837927, + "learning_rate": 9.77580272067037e-06, + "loss": 1.1034, + "step": 9910 + }, + { + "epoch": 0.56, + "grad_norm": 6.079960594509294, + "learning_rate": 9.775317171661668e-06, + "loss": 1.0918, + "step": 9915 + }, + { + "epoch": 0.56, + "grad_norm": 18.95619231372345, + "learning_rate": 9.774831109524064e-06, + "loss": 1.1158, + "step": 9920 + }, + { + "epoch": 0.56, + "grad_norm": 8.754390005346306, + "learning_rate": 9.774344534309782e-06, + "loss": 1.0926, + "step": 9925 + }, + { + "epoch": 0.56, + "grad_norm": 5.839537890368287, + "learning_rate": 9.773857446071108e-06, + "loss": 1.1181, + "step": 9930 + }, + { + "epoch": 0.56, + "grad_norm": 6.620360785136541, + "learning_rate": 9.773369844860384e-06, + "loss": 1.127, + "step": 9935 + }, + { + "epoch": 0.56, + "grad_norm": 7.252806013880718, + "learning_rate": 9.772881730730007e-06, + "loss": 1.1544, + "step": 9940 + }, + { + "epoch": 0.56, + "grad_norm": 6.308373503259316, + "learning_rate": 9.772393103732417e-06, + "loss": 1.1641, + "step": 9945 + }, + { + "epoch": 0.56, + "grad_norm": 6.50809360555174, + "learning_rate": 9.771903963920132e-06, + "loss": 1.1238, + "step": 9950 + }, + { + "epoch": 0.56, + "grad_norm": 9.510544230636716, + "learning_rate": 9.771414311345702e-06, + "loss": 1.1382, + "step": 9955 + }, + { + "epoch": 0.56, + "grad_norm": 5.4853102905021505, + "learning_rate": 9.770924146061749e-06, + "loss": 1.15, + "step": 9960 + }, + { + "epoch": 0.56, + "grad_norm": 13.086961941226198, + "learning_rate": 9.77043346812094e-06, + "loss": 1.1442, + "step": 9965 + }, + { + "epoch": 0.56, + "grad_norm": 12.514056476883807, + "learning_rate": 9.769942277576001e-06, + "loss": 1.1134, + "step": 9970 + }, + { + "epoch": 0.56, + "grad_norm": 13.974031982611894, + "learning_rate": 9.769450574479714e-06, + "loss": 1.0835, + "step": 9975 + }, + { + "epoch": 0.56, + "grad_norm": 30.281983181732873, + "learning_rate": 9.76895835888491e-06, + "loss": 1.0753, + "step": 9980 + }, + { + "epoch": 0.56, + "grad_norm": 12.572762073580233, + "learning_rate": 9.768465630844487e-06, + "loss": 1.1356, + "step": 9985 + }, + { + "epoch": 0.56, + "grad_norm": 15.469596086217209, + "learning_rate": 9.767972390411386e-06, + "loss": 1.1499, + "step": 9990 + }, + { + "epoch": 0.56, + "grad_norm": 16.324454244157174, + "learning_rate": 9.767478637638609e-06, + "loss": 1.1274, + "step": 9995 + }, + { + "epoch": 0.56, + "grad_norm": 15.751386696752718, + "learning_rate": 9.766984372579212e-06, + "loss": 1.1674, + "step": 10000 + }, + { + "epoch": 0.56, + "grad_norm": 11.757330861218081, + "learning_rate": 9.766489595286306e-06, + "loss": 1.1062, + "step": 10005 + }, + { + "epoch": 0.56, + "grad_norm": 19.577967008442595, + "learning_rate": 9.765994305813056e-06, + "loss": 1.1584, + "step": 10010 + }, + { + "epoch": 0.56, + "grad_norm": 6.715385504908574, + "learning_rate": 9.765498504212685e-06, + "loss": 1.1571, + "step": 10015 + }, + { + "epoch": 0.56, + "grad_norm": 16.188995733588726, + "learning_rate": 9.765002190538467e-06, + "loss": 1.1157, + "step": 10020 + }, + { + "epoch": 0.56, + "grad_norm": 10.937873724368163, + "learning_rate": 9.764505364843734e-06, + "loss": 1.1288, + "step": 10025 + }, + { + "epoch": 0.57, + "grad_norm": 19.581915732852572, + "learning_rate": 9.764008027181872e-06, + "loss": 1.1096, + "step": 10030 + }, + { + "epoch": 0.57, + "grad_norm": 35.12020185934868, + "learning_rate": 9.763510177606324e-06, + "loss": 1.1504, + "step": 10035 + }, + { + "epoch": 0.57, + "grad_norm": 27.79462408084681, + "learning_rate": 9.763011816170583e-06, + "loss": 1.147, + "step": 10040 + }, + { + "epoch": 0.57, + "grad_norm": 10.135063625409902, + "learning_rate": 9.762512942928201e-06, + "loss": 1.1711, + "step": 10045 + }, + { + "epoch": 0.57, + "grad_norm": 7.7427326896889035, + "learning_rate": 9.762013557932787e-06, + "loss": 1.1437, + "step": 10050 + }, + { + "epoch": 0.57, + "grad_norm": 62.042302136295355, + "learning_rate": 9.761513661237998e-06, + "loss": 1.1346, + "step": 10055 + }, + { + "epoch": 0.57, + "grad_norm": 14.482157923920138, + "learning_rate": 9.761013252897554e-06, + "loss": 1.1313, + "step": 10060 + }, + { + "epoch": 0.57, + "grad_norm": 10.77522261793372, + "learning_rate": 9.760512332965221e-06, + "loss": 1.0506, + "step": 10065 + }, + { + "epoch": 0.57, + "grad_norm": 15.82416686579001, + "learning_rate": 9.76001090149483e-06, + "loss": 1.0977, + "step": 10070 + }, + { + "epoch": 0.57, + "grad_norm": 11.98131908318692, + "learning_rate": 9.75950895854026e-06, + "loss": 1.1557, + "step": 10075 + }, + { + "epoch": 0.57, + "grad_norm": 16.822062970124897, + "learning_rate": 9.759006504155446e-06, + "loss": 1.1455, + "step": 10080 + }, + { + "epoch": 0.57, + "grad_norm": 6.864234312537614, + "learning_rate": 9.758503538394382e-06, + "loss": 1.1413, + "step": 10085 + }, + { + "epoch": 0.57, + "grad_norm": 10.079842809205594, + "learning_rate": 9.75800006131111e-06, + "loss": 1.1507, + "step": 10090 + }, + { + "epoch": 0.57, + "grad_norm": 6.107264063664683, + "learning_rate": 9.757496072959734e-06, + "loss": 1.089, + "step": 10095 + }, + { + "epoch": 0.57, + "grad_norm": 6.195801576453392, + "learning_rate": 9.756991573394407e-06, + "loss": 1.1802, + "step": 10100 + }, + { + "epoch": 0.57, + "grad_norm": 5.779377619654823, + "learning_rate": 9.756486562669342e-06, + "loss": 1.1259, + "step": 10105 + }, + { + "epoch": 0.57, + "grad_norm": 6.386319012783584, + "learning_rate": 9.755981040838805e-06, + "loss": 1.0957, + "step": 10110 + }, + { + "epoch": 0.57, + "grad_norm": 8.811523355585202, + "learning_rate": 9.755475007957114e-06, + "loss": 1.0775, + "step": 10115 + }, + { + "epoch": 0.57, + "grad_norm": 13.369045677901603, + "learning_rate": 9.754968464078647e-06, + "loss": 1.0964, + "step": 10120 + }, + { + "epoch": 0.57, + "grad_norm": 6.597199366258872, + "learning_rate": 9.754461409257832e-06, + "loss": 1.1203, + "step": 10125 + }, + { + "epoch": 0.57, + "grad_norm": 12.471153307129219, + "learning_rate": 9.753953843549157e-06, + "loss": 1.0989, + "step": 10130 + }, + { + "epoch": 0.57, + "grad_norm": 5.875140466143439, + "learning_rate": 9.753445767007158e-06, + "loss": 1.1246, + "step": 10135 + }, + { + "epoch": 0.57, + "grad_norm": 20.75088919991107, + "learning_rate": 9.752937179686438e-06, + "loss": 1.1419, + "step": 10140 + }, + { + "epoch": 0.57, + "grad_norm": 17.543916367697925, + "learning_rate": 9.752428081641636e-06, + "loss": 1.1393, + "step": 10145 + }, + { + "epoch": 0.57, + "grad_norm": 9.194304111540609, + "learning_rate": 9.751918472927466e-06, + "loss": 1.1308, + "step": 10150 + }, + { + "epoch": 0.57, + "grad_norm": 5.916008625280372, + "learning_rate": 9.751408353598684e-06, + "loss": 1.0633, + "step": 10155 + }, + { + "epoch": 0.57, + "grad_norm": 6.8423195861966875, + "learning_rate": 9.750897723710104e-06, + "loss": 1.1195, + "step": 10160 + }, + { + "epoch": 0.57, + "grad_norm": 7.902355377355023, + "learning_rate": 9.750386583316595e-06, + "loss": 1.0883, + "step": 10165 + }, + { + "epoch": 0.57, + "grad_norm": 6.081120340072049, + "learning_rate": 9.749874932473085e-06, + "loss": 1.1056, + "step": 10170 + }, + { + "epoch": 0.57, + "grad_norm": 7.1463943403080865, + "learning_rate": 9.74936277123455e-06, + "loss": 1.1277, + "step": 10175 + }, + { + "epoch": 0.57, + "grad_norm": 8.747013479713555, + "learning_rate": 9.748850099656024e-06, + "loss": 1.1424, + "step": 10180 + }, + { + "epoch": 0.57, + "grad_norm": 12.854675542442633, + "learning_rate": 9.748336917792595e-06, + "loss": 1.091, + "step": 10185 + }, + { + "epoch": 0.57, + "grad_norm": 10.580645624406868, + "learning_rate": 9.747823225699411e-06, + "loss": 1.1058, + "step": 10190 + }, + { + "epoch": 0.57, + "grad_norm": 7.97250071910222, + "learning_rate": 9.747309023431668e-06, + "loss": 1.0889, + "step": 10195 + }, + { + "epoch": 0.57, + "grad_norm": 25.806132412860915, + "learning_rate": 9.746794311044616e-06, + "loss": 1.1003, + "step": 10200 + }, + { + "epoch": 0.57, + "grad_norm": 13.945519407440568, + "learning_rate": 9.746279088593569e-06, + "loss": 1.147, + "step": 10205 + }, + { + "epoch": 0.58, + "grad_norm": 11.64393437502863, + "learning_rate": 9.745763356133884e-06, + "loss": 1.0804, + "step": 10210 + }, + { + "epoch": 0.58, + "grad_norm": 6.292462338414721, + "learning_rate": 9.745247113720985e-06, + "loss": 1.1553, + "step": 10215 + }, + { + "epoch": 0.58, + "grad_norm": 6.827026435628405, + "learning_rate": 9.74473036141034e-06, + "loss": 1.1515, + "step": 10220 + }, + { + "epoch": 0.58, + "grad_norm": 9.04797847257774, + "learning_rate": 9.744213099257477e-06, + "loss": 1.1039, + "step": 10225 + }, + { + "epoch": 0.58, + "grad_norm": 21.339149586266636, + "learning_rate": 9.743695327317979e-06, + "loss": 1.1227, + "step": 10230 + }, + { + "epoch": 0.58, + "grad_norm": 6.996841412197717, + "learning_rate": 9.743177045647482e-06, + "loss": 1.1313, + "step": 10235 + }, + { + "epoch": 0.58, + "grad_norm": 16.945502506850865, + "learning_rate": 9.742658254301679e-06, + "loss": 1.1399, + "step": 10240 + }, + { + "epoch": 0.58, + "grad_norm": 16.075472790928725, + "learning_rate": 9.742138953336316e-06, + "loss": 1.1114, + "step": 10245 + }, + { + "epoch": 0.58, + "grad_norm": 24.094149374045454, + "learning_rate": 9.741619142807192e-06, + "loss": 1.1586, + "step": 10250 + }, + { + "epoch": 0.58, + "grad_norm": 6.250546938046734, + "learning_rate": 9.741098822770166e-06, + "loss": 1.0988, + "step": 10255 + }, + { + "epoch": 0.58, + "grad_norm": 7.897431171273826, + "learning_rate": 9.740577993281148e-06, + "loss": 1.0643, + "step": 10260 + }, + { + "epoch": 0.58, + "grad_norm": 6.013004346619281, + "learning_rate": 9.740056654396101e-06, + "loss": 1.1582, + "step": 10265 + }, + { + "epoch": 0.58, + "grad_norm": 8.888002368756538, + "learning_rate": 9.73953480617105e-06, + "loss": 1.1335, + "step": 10270 + }, + { + "epoch": 0.58, + "grad_norm": 6.43670125601399, + "learning_rate": 9.739012448662064e-06, + "loss": 1.1315, + "step": 10275 + }, + { + "epoch": 0.58, + "grad_norm": 11.826848129313802, + "learning_rate": 9.738489581925277e-06, + "loss": 1.1661, + "step": 10280 + }, + { + "epoch": 0.58, + "grad_norm": 24.349128088776393, + "learning_rate": 9.73796620601687e-06, + "loss": 1.1004, + "step": 10285 + }, + { + "epoch": 0.58, + "grad_norm": 7.10533008169947, + "learning_rate": 9.737442320993087e-06, + "loss": 1.1286, + "step": 10290 + }, + { + "epoch": 0.58, + "grad_norm": 6.222686842711923, + "learning_rate": 9.736917926910217e-06, + "loss": 1.1114, + "step": 10295 + }, + { + "epoch": 0.58, + "grad_norm": 10.196280282450498, + "learning_rate": 9.73639302382461e-06, + "loss": 1.0839, + "step": 10300 + }, + { + "epoch": 0.58, + "grad_norm": 13.100321322337622, + "learning_rate": 9.73586761179267e-06, + "loss": 1.1142, + "step": 10305 + }, + { + "epoch": 0.58, + "grad_norm": 9.209475139679098, + "learning_rate": 9.735341690870855e-06, + "loss": 1.0887, + "step": 10310 + }, + { + "epoch": 0.58, + "grad_norm": 8.444327449895846, + "learning_rate": 9.734815261115675e-06, + "loss": 1.1318, + "step": 10315 + }, + { + "epoch": 0.58, + "grad_norm": 8.827873819975245, + "learning_rate": 9.734288322583699e-06, + "loss": 1.1374, + "step": 10320 + }, + { + "epoch": 0.58, + "grad_norm": 14.783411255982069, + "learning_rate": 9.733760875331548e-06, + "loss": 1.1242, + "step": 10325 + }, + { + "epoch": 0.58, + "grad_norm": 6.493691481525475, + "learning_rate": 9.7332329194159e-06, + "loss": 1.1307, + "step": 10330 + }, + { + "epoch": 0.58, + "grad_norm": 7.581919740015162, + "learning_rate": 9.732704454893486e-06, + "loss": 1.1099, + "step": 10335 + }, + { + "epoch": 0.58, + "grad_norm": 17.715058082145536, + "learning_rate": 9.732175481821092e-06, + "loss": 1.1331, + "step": 10340 + }, + { + "epoch": 0.58, + "grad_norm": 13.192343668844734, + "learning_rate": 9.731646000255558e-06, + "loss": 1.1448, + "step": 10345 + }, + { + "epoch": 0.58, + "grad_norm": 8.679577190945963, + "learning_rate": 9.731116010253775e-06, + "loss": 1.1377, + "step": 10350 + }, + { + "epoch": 0.58, + "grad_norm": 13.573944570129123, + "learning_rate": 9.730585511872702e-06, + "loss": 1.1072, + "step": 10355 + }, + { + "epoch": 0.58, + "grad_norm": 40.535612120264155, + "learning_rate": 9.730054505169334e-06, + "loss": 1.1553, + "step": 10360 + }, + { + "epoch": 0.58, + "grad_norm": 28.689454298470146, + "learning_rate": 9.729522990200735e-06, + "loss": 1.1225, + "step": 10365 + }, + { + "epoch": 0.58, + "grad_norm": 17.917519588416386, + "learning_rate": 9.72899096702402e-06, + "loss": 1.1283, + "step": 10370 + }, + { + "epoch": 0.58, + "grad_norm": 24.286237193416884, + "learning_rate": 9.728458435696352e-06, + "loss": 1.164, + "step": 10375 + }, + { + "epoch": 0.58, + "grad_norm": 55.07682352740546, + "learning_rate": 9.727925396274958e-06, + "loss": 1.1373, + "step": 10380 + }, + { + "epoch": 0.59, + "grad_norm": 20.71368053216417, + "learning_rate": 9.727391848817114e-06, + "loss": 1.1441, + "step": 10385 + }, + { + "epoch": 0.59, + "grad_norm": 13.705263989766612, + "learning_rate": 9.726857793380153e-06, + "loss": 1.1164, + "step": 10390 + }, + { + "epoch": 0.59, + "grad_norm": 11.05921596663414, + "learning_rate": 9.72632323002146e-06, + "loss": 1.1428, + "step": 10395 + }, + { + "epoch": 0.59, + "grad_norm": 7.16323932261975, + "learning_rate": 9.725788158798477e-06, + "loss": 1.0978, + "step": 10400 + }, + { + "epoch": 0.59, + "grad_norm": 14.777814903551782, + "learning_rate": 9.725252579768701e-06, + "loss": 1.1299, + "step": 10405 + }, + { + "epoch": 0.59, + "grad_norm": 13.197498292246134, + "learning_rate": 9.724716492989681e-06, + "loss": 1.144, + "step": 10410 + }, + { + "epoch": 0.59, + "grad_norm": 19.40569514490288, + "learning_rate": 9.724179898519022e-06, + "loss": 1.1844, + "step": 10415 + }, + { + "epoch": 0.59, + "grad_norm": 7.509725936530929, + "learning_rate": 9.723642796414383e-06, + "loss": 1.0931, + "step": 10420 + }, + { + "epoch": 0.59, + "grad_norm": 21.606846928753214, + "learning_rate": 9.723105186733479e-06, + "loss": 1.11, + "step": 10425 + }, + { + "epoch": 0.59, + "grad_norm": 6.221554586179171, + "learning_rate": 9.722567069534077e-06, + "loss": 1.1516, + "step": 10430 + }, + { + "epoch": 0.59, + "grad_norm": 17.220405910858158, + "learning_rate": 9.722028444874002e-06, + "loss": 1.0947, + "step": 10435 + }, + { + "epoch": 0.59, + "grad_norm": 13.691082528659585, + "learning_rate": 9.721489312811129e-06, + "loss": 1.1555, + "step": 10440 + }, + { + "epoch": 0.59, + "grad_norm": 6.400540137386916, + "learning_rate": 9.720949673403395e-06, + "loss": 1.0979, + "step": 10445 + }, + { + "epoch": 0.59, + "grad_norm": 6.724417889986143, + "learning_rate": 9.720409526708782e-06, + "loss": 1.0846, + "step": 10450 + }, + { + "epoch": 0.59, + "grad_norm": 8.604818406860277, + "learning_rate": 9.719868872785331e-06, + "loss": 1.1206, + "step": 10455 + }, + { + "epoch": 0.59, + "grad_norm": 12.495729299942727, + "learning_rate": 9.719327711691142e-06, + "loss": 1.0591, + "step": 10460 + }, + { + "epoch": 0.59, + "grad_norm": 7.494537501359319, + "learning_rate": 9.71878604348436e-06, + "loss": 1.0871, + "step": 10465 + }, + { + "epoch": 0.59, + "grad_norm": 8.284017470531094, + "learning_rate": 9.718243868223193e-06, + "loss": 1.1264, + "step": 10470 + }, + { + "epoch": 0.59, + "grad_norm": 6.005211207587475, + "learning_rate": 9.7177011859659e-06, + "loss": 1.0903, + "step": 10475 + }, + { + "epoch": 0.59, + "grad_norm": 9.188547912156826, + "learning_rate": 9.717157996770793e-06, + "loss": 1.1578, + "step": 10480 + }, + { + "epoch": 0.59, + "grad_norm": 24.351181207494125, + "learning_rate": 9.716614300696242e-06, + "loss": 1.1269, + "step": 10485 + }, + { + "epoch": 0.59, + "grad_norm": 13.444515779435607, + "learning_rate": 9.716070097800665e-06, + "loss": 1.1311, + "step": 10490 + }, + { + "epoch": 0.59, + "grad_norm": 25.85354651689487, + "learning_rate": 9.715525388142545e-06, + "loss": 1.109, + "step": 10495 + }, + { + "epoch": 0.59, + "grad_norm": 7.276499819770264, + "learning_rate": 9.71498017178041e-06, + "loss": 1.1172, + "step": 10500 + }, + { + "epoch": 0.59, + "grad_norm": 5.75130989314831, + "learning_rate": 9.714434448772847e-06, + "loss": 1.107, + "step": 10505 + }, + { + "epoch": 0.59, + "grad_norm": 21.5014359906003, + "learning_rate": 9.713888219178497e-06, + "loss": 1.1076, + "step": 10510 + }, + { + "epoch": 0.59, + "grad_norm": 19.842848512386926, + "learning_rate": 9.713341483056054e-06, + "loss": 1.1029, + "step": 10515 + }, + { + "epoch": 0.59, + "grad_norm": 14.135521788293767, + "learning_rate": 9.712794240464265e-06, + "loss": 1.1221, + "step": 10520 + }, + { + "epoch": 0.59, + "grad_norm": 14.428356454329931, + "learning_rate": 9.712246491461937e-06, + "loss": 1.15, + "step": 10525 + }, + { + "epoch": 0.59, + "grad_norm": 18.754908177493192, + "learning_rate": 9.711698236107927e-06, + "loss": 1.1815, + "step": 10530 + }, + { + "epoch": 0.59, + "grad_norm": 30.93622654744006, + "learning_rate": 9.711149474461148e-06, + "loss": 1.151, + "step": 10535 + }, + { + "epoch": 0.59, + "grad_norm": 10.04329072521145, + "learning_rate": 9.710600206580565e-06, + "loss": 1.1517, + "step": 10540 + }, + { + "epoch": 0.59, + "grad_norm": 29.8315475145888, + "learning_rate": 9.7100504325252e-06, + "loss": 1.1409, + "step": 10545 + }, + { + "epoch": 0.59, + "grad_norm": 15.004271179187006, + "learning_rate": 9.709500152354128e-06, + "loss": 1.1159, + "step": 10550 + }, + { + "epoch": 0.59, + "grad_norm": 20.74101271516686, + "learning_rate": 9.708949366126482e-06, + "loss": 1.1402, + "step": 10555 + }, + { + "epoch": 0.59, + "grad_norm": 9.55534773566369, + "learning_rate": 9.708398073901444e-06, + "loss": 1.1007, + "step": 10560 + }, + { + "epoch": 0.6, + "grad_norm": 17.54004758170369, + "learning_rate": 9.707846275738253e-06, + "loss": 1.1298, + "step": 10565 + }, + { + "epoch": 0.6, + "grad_norm": 14.400850159105255, + "learning_rate": 9.707293971696202e-06, + "loss": 1.0853, + "step": 10570 + }, + { + "epoch": 0.6, + "grad_norm": 32.876751163781954, + "learning_rate": 9.70674116183464e-06, + "loss": 1.1536, + "step": 10575 + }, + { + "epoch": 0.6, + "grad_norm": 22.089008325041274, + "learning_rate": 9.706187846212967e-06, + "loss": 1.0851, + "step": 10580 + }, + { + "epoch": 0.6, + "grad_norm": 24.455632146508183, + "learning_rate": 9.70563402489064e-06, + "loss": 1.1009, + "step": 10585 + }, + { + "epoch": 0.6, + "grad_norm": 20.348149765166458, + "learning_rate": 9.705079697927168e-06, + "loss": 1.1786, + "step": 10590 + }, + { + "epoch": 0.6, + "grad_norm": 10.071164665487567, + "learning_rate": 9.70452486538212e-06, + "loss": 1.1392, + "step": 10595 + }, + { + "epoch": 0.6, + "grad_norm": 15.021139837990567, + "learning_rate": 9.70396952731511e-06, + "loss": 1.1434, + "step": 10600 + }, + { + "epoch": 0.6, + "grad_norm": 25.640077434092976, + "learning_rate": 9.703413683785817e-06, + "loss": 1.174, + "step": 10605 + }, + { + "epoch": 0.6, + "grad_norm": 17.417474433355128, + "learning_rate": 9.702857334853963e-06, + "loss": 1.1219, + "step": 10610 + }, + { + "epoch": 0.6, + "grad_norm": 22.32809964297896, + "learning_rate": 9.702300480579334e-06, + "loss": 1.1181, + "step": 10615 + }, + { + "epoch": 0.6, + "grad_norm": 7.940608868765343, + "learning_rate": 9.701743121021767e-06, + "loss": 1.1203, + "step": 10620 + }, + { + "epoch": 0.6, + "grad_norm": 11.463064479304906, + "learning_rate": 9.70118525624115e-06, + "loss": 1.1328, + "step": 10625 + }, + { + "epoch": 0.6, + "grad_norm": 7.441182793738268, + "learning_rate": 9.70062688629743e-06, + "loss": 1.1172, + "step": 10630 + }, + { + "epoch": 0.6, + "grad_norm": 7.042403549492438, + "learning_rate": 9.700068011250605e-06, + "loss": 1.1447, + "step": 10635 + }, + { + "epoch": 0.6, + "grad_norm": 10.664854623155703, + "learning_rate": 9.699508631160728e-06, + "loss": 1.0961, + "step": 10640 + }, + { + "epoch": 0.6, + "grad_norm": 6.71255557018499, + "learning_rate": 9.69894874608791e-06, + "loss": 1.0962, + "step": 10645 + }, + { + "epoch": 0.6, + "grad_norm": 5.352924314417328, + "learning_rate": 9.698388356092311e-06, + "loss": 1.1042, + "step": 10650 + }, + { + "epoch": 0.6, + "grad_norm": 6.5339482523967956, + "learning_rate": 9.697827461234147e-06, + "loss": 1.1122, + "step": 10655 + }, + { + "epoch": 0.6, + "grad_norm": 4.970137140377447, + "learning_rate": 9.69726606157369e-06, + "loss": 1.0622, + "step": 10660 + }, + { + "epoch": 0.6, + "grad_norm": 9.677722904681502, + "learning_rate": 9.696704157171262e-06, + "loss": 1.1195, + "step": 10665 + }, + { + "epoch": 0.6, + "grad_norm": 7.48068164376845, + "learning_rate": 9.696141748087247e-06, + "loss": 1.0789, + "step": 10670 + }, + { + "epoch": 0.6, + "grad_norm": 11.094365418738656, + "learning_rate": 9.695578834382074e-06, + "loss": 1.1123, + "step": 10675 + }, + { + "epoch": 0.6, + "grad_norm": 8.725765410505641, + "learning_rate": 9.695015416116232e-06, + "loss": 1.0964, + "step": 10680 + }, + { + "epoch": 0.6, + "grad_norm": 7.571844782783497, + "learning_rate": 9.694451493350264e-06, + "loss": 1.1276, + "step": 10685 + }, + { + "epoch": 0.6, + "grad_norm": 5.570916415682218, + "learning_rate": 9.693887066144762e-06, + "loss": 1.0941, + "step": 10690 + }, + { + "epoch": 0.6, + "grad_norm": 6.616549306214913, + "learning_rate": 9.693322134560382e-06, + "loss": 1.1304, + "step": 10695 + }, + { + "epoch": 0.6, + "grad_norm": 6.2412219935147695, + "learning_rate": 9.692756698657824e-06, + "loss": 1.1628, + "step": 10700 + }, + { + "epoch": 0.6, + "grad_norm": 9.592995785583279, + "learning_rate": 9.692190758497848e-06, + "loss": 1.1511, + "step": 10705 + }, + { + "epoch": 0.6, + "grad_norm": 6.956380223647882, + "learning_rate": 9.691624314141269e-06, + "loss": 1.1217, + "step": 10710 + }, + { + "epoch": 0.6, + "grad_norm": 6.601176942316527, + "learning_rate": 9.691057365648948e-06, + "loss": 1.0889, + "step": 10715 + }, + { + "epoch": 0.6, + "grad_norm": 6.289383284440312, + "learning_rate": 9.690489913081811e-06, + "loss": 1.0935, + "step": 10720 + }, + { + "epoch": 0.6, + "grad_norm": 5.769763701506643, + "learning_rate": 9.689921956500833e-06, + "loss": 1.141, + "step": 10725 + }, + { + "epoch": 0.6, + "grad_norm": 8.03707990084338, + "learning_rate": 9.689353495967043e-06, + "loss": 1.1382, + "step": 10730 + }, + { + "epoch": 0.6, + "grad_norm": 6.154239808856511, + "learning_rate": 9.688784531541524e-06, + "loss": 1.0971, + "step": 10735 + }, + { + "epoch": 0.61, + "grad_norm": 6.68709279393053, + "learning_rate": 9.688215063285414e-06, + "loss": 1.0991, + "step": 10740 + }, + { + "epoch": 0.61, + "grad_norm": 14.410367886881257, + "learning_rate": 9.687645091259902e-06, + "loss": 1.1552, + "step": 10745 + }, + { + "epoch": 0.61, + "grad_norm": 6.982096164450052, + "learning_rate": 9.68707461552624e-06, + "loss": 1.093, + "step": 10750 + }, + { + "epoch": 0.61, + "grad_norm": 7.726619292438254, + "learning_rate": 9.686503636145724e-06, + "loss": 1.1491, + "step": 10755 + }, + { + "epoch": 0.61, + "grad_norm": 14.841557450606214, + "learning_rate": 9.685932153179709e-06, + "loss": 1.0842, + "step": 10760 + }, + { + "epoch": 0.61, + "grad_norm": 8.599943842601997, + "learning_rate": 9.685360166689604e-06, + "loss": 1.1019, + "step": 10765 + }, + { + "epoch": 0.61, + "grad_norm": 7.1858095951090615, + "learning_rate": 9.68478767673687e-06, + "loss": 1.1489, + "step": 10770 + }, + { + "epoch": 0.61, + "grad_norm": 14.360875052792865, + "learning_rate": 9.684214683383027e-06, + "loss": 1.1493, + "step": 10775 + }, + { + "epoch": 0.61, + "grad_norm": 15.082698170941965, + "learning_rate": 9.683641186689642e-06, + "loss": 1.1402, + "step": 10780 + }, + { + "epoch": 0.61, + "grad_norm": 8.030738278295825, + "learning_rate": 9.68306718671834e-06, + "loss": 1.1297, + "step": 10785 + }, + { + "epoch": 0.61, + "grad_norm": 8.431781035672577, + "learning_rate": 9.682492683530802e-06, + "loss": 1.1744, + "step": 10790 + }, + { + "epoch": 0.61, + "grad_norm": 16.512006836021577, + "learning_rate": 9.681917677188758e-06, + "loss": 1.1445, + "step": 10795 + }, + { + "epoch": 0.61, + "grad_norm": 10.940268972104937, + "learning_rate": 9.681342167753998e-06, + "loss": 1.0945, + "step": 10800 + }, + { + "epoch": 0.61, + "grad_norm": 29.290167157837768, + "learning_rate": 9.680766155288362e-06, + "loss": 1.1571, + "step": 10805 + }, + { + "epoch": 0.61, + "grad_norm": 12.498490605308364, + "learning_rate": 9.680189639853743e-06, + "loss": 1.1203, + "step": 10810 + }, + { + "epoch": 0.61, + "grad_norm": 10.251821035868401, + "learning_rate": 9.679612621512093e-06, + "loss": 1.1003, + "step": 10815 + }, + { + "epoch": 0.61, + "grad_norm": 10.45276028054799, + "learning_rate": 9.679035100325416e-06, + "loss": 1.0907, + "step": 10820 + }, + { + "epoch": 0.61, + "grad_norm": 12.69058800986446, + "learning_rate": 9.678457076355764e-06, + "loss": 1.1457, + "step": 10825 + }, + { + "epoch": 0.61, + "grad_norm": 6.27444012265364, + "learning_rate": 9.677878549665254e-06, + "loss": 1.1632, + "step": 10830 + }, + { + "epoch": 0.61, + "grad_norm": 11.006361526713334, + "learning_rate": 9.677299520316048e-06, + "loss": 1.1473, + "step": 10835 + }, + { + "epoch": 0.61, + "grad_norm": 6.301868180611034, + "learning_rate": 9.676719988370366e-06, + "loss": 1.1146, + "step": 10840 + }, + { + "epoch": 0.61, + "grad_norm": 21.612832280318436, + "learning_rate": 9.676139953890482e-06, + "loss": 1.1083, + "step": 10845 + }, + { + "epoch": 0.61, + "grad_norm": 16.914221632107225, + "learning_rate": 9.675559416938723e-06, + "loss": 1.1085, + "step": 10850 + }, + { + "epoch": 0.61, + "grad_norm": 18.31008654260704, + "learning_rate": 9.674978377577468e-06, + "loss": 1.1384, + "step": 10855 + }, + { + "epoch": 0.61, + "grad_norm": 8.975980953182093, + "learning_rate": 9.674396835869155e-06, + "loss": 1.1205, + "step": 10860 + }, + { + "epoch": 0.61, + "grad_norm": 19.910273772561663, + "learning_rate": 9.673814791876273e-06, + "loss": 1.1497, + "step": 10865 + }, + { + "epoch": 0.61, + "grad_norm": 27.86541853928646, + "learning_rate": 9.673232245661364e-06, + "loss": 1.148, + "step": 10870 + }, + { + "epoch": 0.61, + "grad_norm": 37.35132208073142, + "learning_rate": 9.672649197287025e-06, + "loss": 1.1147, + "step": 10875 + }, + { + "epoch": 0.61, + "grad_norm": 38.39023156929308, + "learning_rate": 9.672065646815906e-06, + "loss": 1.1558, + "step": 10880 + }, + { + "epoch": 0.61, + "grad_norm": 11.555144351333318, + "learning_rate": 9.671481594310718e-06, + "loss": 1.1641, + "step": 10885 + }, + { + "epoch": 0.61, + "grad_norm": 15.056419692430639, + "learning_rate": 9.670897039834212e-06, + "loss": 1.1314, + "step": 10890 + }, + { + "epoch": 0.61, + "grad_norm": 11.318068102424403, + "learning_rate": 9.670311983449207e-06, + "loss": 1.1023, + "step": 10895 + }, + { + "epoch": 0.61, + "grad_norm": 7.5630145979294685, + "learning_rate": 9.669726425218569e-06, + "loss": 1.0887, + "step": 10900 + }, + { + "epoch": 0.61, + "grad_norm": 8.587877654307853, + "learning_rate": 9.669140365205216e-06, + "loss": 1.1014, + "step": 10905 + }, + { + "epoch": 0.61, + "grad_norm": 7.1165568028333945, + "learning_rate": 9.668553803472124e-06, + "loss": 1.1145, + "step": 10910 + }, + { + "epoch": 0.61, + "grad_norm": 7.5629788196879115, + "learning_rate": 9.667966740082322e-06, + "loss": 1.1576, + "step": 10915 + }, + { + "epoch": 0.62, + "grad_norm": 17.812033807852437, + "learning_rate": 9.667379175098892e-06, + "loss": 1.1285, + "step": 10920 + }, + { + "epoch": 0.62, + "grad_norm": 14.718265197899088, + "learning_rate": 9.666791108584973e-06, + "loss": 1.1424, + "step": 10925 + }, + { + "epoch": 0.62, + "grad_norm": 44.69221850490623, + "learning_rate": 9.66620254060375e-06, + "loss": 1.1466, + "step": 10930 + }, + { + "epoch": 0.62, + "grad_norm": 13.271379033231685, + "learning_rate": 9.665613471218474e-06, + "loss": 1.1064, + "step": 10935 + }, + { + "epoch": 0.62, + "grad_norm": 10.796541858818676, + "learning_rate": 9.665023900492437e-06, + "loss": 1.1728, + "step": 10940 + }, + { + "epoch": 0.62, + "grad_norm": 9.316819695961973, + "learning_rate": 9.664433828488994e-06, + "loss": 1.1235, + "step": 10945 + }, + { + "epoch": 0.62, + "grad_norm": 20.668594591122304, + "learning_rate": 9.66384325527155e-06, + "loss": 1.082, + "step": 10950 + }, + { + "epoch": 0.62, + "grad_norm": 16.085360159693657, + "learning_rate": 9.663252180903567e-06, + "loss": 1.1437, + "step": 10955 + }, + { + "epoch": 0.62, + "grad_norm": 7.637507941505123, + "learning_rate": 9.662660605448556e-06, + "loss": 1.168, + "step": 10960 + }, + { + "epoch": 0.62, + "grad_norm": 5.885822912023088, + "learning_rate": 9.662068528970084e-06, + "loss": 1.093, + "step": 10965 + }, + { + "epoch": 0.62, + "grad_norm": 7.247551153290033, + "learning_rate": 9.661475951531775e-06, + "loss": 1.097, + "step": 10970 + }, + { + "epoch": 0.62, + "grad_norm": 7.014622813957114, + "learning_rate": 9.660882873197302e-06, + "loss": 1.1774, + "step": 10975 + }, + { + "epoch": 0.62, + "grad_norm": 5.453537591758799, + "learning_rate": 9.660289294030392e-06, + "loss": 1.1138, + "step": 10980 + }, + { + "epoch": 0.62, + "grad_norm": 9.838324746880735, + "learning_rate": 9.659695214094831e-06, + "loss": 1.134, + "step": 10985 + }, + { + "epoch": 0.62, + "grad_norm": 10.156741371796533, + "learning_rate": 9.659100633454456e-06, + "loss": 1.1393, + "step": 10990 + }, + { + "epoch": 0.62, + "grad_norm": 16.96563855251562, + "learning_rate": 9.658505552173154e-06, + "loss": 1.1404, + "step": 10995 + }, + { + "epoch": 0.62, + "grad_norm": 32.05125425748238, + "learning_rate": 9.657909970314872e-06, + "loss": 1.106, + "step": 11000 + }, + { + "epoch": 0.62, + "grad_norm": 15.088904851396622, + "learning_rate": 9.657313887943607e-06, + "loss": 1.0857, + "step": 11005 + }, + { + "epoch": 0.62, + "grad_norm": 11.298157623219927, + "learning_rate": 9.656717305123409e-06, + "loss": 1.1081, + "step": 11010 + }, + { + "epoch": 0.62, + "grad_norm": 11.974985408266656, + "learning_rate": 9.656120221918385e-06, + "loss": 1.1928, + "step": 11015 + }, + { + "epoch": 0.62, + "grad_norm": 7.08615653618582, + "learning_rate": 9.655522638392696e-06, + "loss": 1.1115, + "step": 11020 + }, + { + "epoch": 0.62, + "grad_norm": 8.919446621579743, + "learning_rate": 9.654924554610552e-06, + "loss": 1.0954, + "step": 11025 + }, + { + "epoch": 0.62, + "grad_norm": 7.256431814444242, + "learning_rate": 9.65432597063622e-06, + "loss": 1.1222, + "step": 11030 + }, + { + "epoch": 0.62, + "grad_norm": 9.140945493593927, + "learning_rate": 9.653726886534022e-06, + "loss": 1.1338, + "step": 11035 + }, + { + "epoch": 0.62, + "grad_norm": 12.731735490306578, + "learning_rate": 9.653127302368332e-06, + "loss": 1.1436, + "step": 11040 + }, + { + "epoch": 0.62, + "grad_norm": 18.113124315147232, + "learning_rate": 9.652527218203578e-06, + "loss": 1.088, + "step": 11045 + }, + { + "epoch": 0.62, + "grad_norm": 11.334135425166144, + "learning_rate": 9.65192663410424e-06, + "loss": 1.1179, + "step": 11050 + }, + { + "epoch": 0.62, + "grad_norm": 7.709769850492832, + "learning_rate": 9.651325550134855e-06, + "loss": 1.128, + "step": 11055 + }, + { + "epoch": 0.62, + "grad_norm": 11.286184891474969, + "learning_rate": 9.650723966360012e-06, + "loss": 1.0974, + "step": 11060 + }, + { + "epoch": 0.62, + "grad_norm": 5.57027027045137, + "learning_rate": 9.650121882844353e-06, + "loss": 1.0875, + "step": 11065 + }, + { + "epoch": 0.62, + "grad_norm": 5.518062256318835, + "learning_rate": 9.649519299652577e-06, + "loss": 1.1408, + "step": 11070 + }, + { + "epoch": 0.62, + "grad_norm": 6.44312304087487, + "learning_rate": 9.64891621684943e-06, + "loss": 1.0707, + "step": 11075 + }, + { + "epoch": 0.62, + "grad_norm": 19.777767208451827, + "learning_rate": 9.64831263449972e-06, + "loss": 1.1336, + "step": 11080 + }, + { + "epoch": 0.62, + "grad_norm": 9.74015049633733, + "learning_rate": 9.647708552668301e-06, + "loss": 1.1165, + "step": 11085 + }, + { + "epoch": 0.62, + "grad_norm": 5.864708070004711, + "learning_rate": 9.647103971420087e-06, + "loss": 1.1266, + "step": 11090 + }, + { + "epoch": 0.63, + "grad_norm": 6.11088254375396, + "learning_rate": 9.646498890820044e-06, + "loss": 1.1073, + "step": 11095 + }, + { + "epoch": 0.63, + "grad_norm": 5.35672591968441, + "learning_rate": 9.645893310933185e-06, + "loss": 1.1284, + "step": 11100 + }, + { + "epoch": 0.63, + "grad_norm": 5.738344826519528, + "learning_rate": 9.64528723182459e-06, + "loss": 1.0926, + "step": 11105 + }, + { + "epoch": 0.63, + "grad_norm": 7.000288653758377, + "learning_rate": 9.644680653559377e-06, + "loss": 1.0761, + "step": 11110 + }, + { + "epoch": 0.63, + "grad_norm": 6.472794927018789, + "learning_rate": 9.64407357620273e-06, + "loss": 1.1454, + "step": 11115 + }, + { + "epoch": 0.63, + "grad_norm": 8.352227962786161, + "learning_rate": 9.643465999819884e-06, + "loss": 1.1623, + "step": 11120 + }, + { + "epoch": 0.63, + "grad_norm": 5.433874733615224, + "learning_rate": 9.64285792447612e-06, + "loss": 1.0855, + "step": 11125 + }, + { + "epoch": 0.63, + "grad_norm": 13.968813243175576, + "learning_rate": 9.642249350236783e-06, + "loss": 1.0918, + "step": 11130 + }, + { + "epoch": 0.63, + "grad_norm": 15.476869741505018, + "learning_rate": 9.641640277167265e-06, + "loss": 1.1205, + "step": 11135 + }, + { + "epoch": 0.63, + "grad_norm": 13.141844055717403, + "learning_rate": 9.641030705333014e-06, + "loss": 1.1208, + "step": 11140 + }, + { + "epoch": 0.63, + "grad_norm": 19.984238053045534, + "learning_rate": 9.640420634799531e-06, + "loss": 1.1526, + "step": 11145 + }, + { + "epoch": 0.63, + "grad_norm": 22.91811477675895, + "learning_rate": 9.639810065632373e-06, + "loss": 1.0788, + "step": 11150 + }, + { + "epoch": 0.63, + "grad_norm": 29.178932705364485, + "learning_rate": 9.639198997897144e-06, + "loss": 1.1161, + "step": 11155 + }, + { + "epoch": 0.63, + "grad_norm": 15.426420048454668, + "learning_rate": 9.63858743165951e-06, + "loss": 1.1295, + "step": 11160 + }, + { + "epoch": 0.63, + "grad_norm": 17.088387589648, + "learning_rate": 9.637975366985185e-06, + "loss": 1.1297, + "step": 11165 + }, + { + "epoch": 0.63, + "grad_norm": 7.9223722868315205, + "learning_rate": 9.637362803939936e-06, + "loss": 1.1254, + "step": 11170 + }, + { + "epoch": 0.63, + "grad_norm": 8.37856404730091, + "learning_rate": 9.636749742589588e-06, + "loss": 1.1098, + "step": 11175 + }, + { + "epoch": 0.63, + "grad_norm": 13.068630448717673, + "learning_rate": 9.636136183000017e-06, + "loss": 1.1188, + "step": 11180 + }, + { + "epoch": 0.63, + "grad_norm": 7.227362861670856, + "learning_rate": 9.635522125237153e-06, + "loss": 1.1407, + "step": 11185 + }, + { + "epoch": 0.63, + "grad_norm": 25.22172729921892, + "learning_rate": 9.634907569366978e-06, + "loss": 1.1145, + "step": 11190 + }, + { + "epoch": 0.63, + "grad_norm": 6.242894970507499, + "learning_rate": 9.63429251545553e-06, + "loss": 1.0853, + "step": 11195 + }, + { + "epoch": 0.63, + "grad_norm": 8.959942429038014, + "learning_rate": 9.633676963568897e-06, + "loss": 1.1038, + "step": 11200 + }, + { + "epoch": 0.63, + "grad_norm": 6.917994257142175, + "learning_rate": 9.633060913773226e-06, + "loss": 1.1111, + "step": 11205 + }, + { + "epoch": 0.63, + "grad_norm": 5.581888544269615, + "learning_rate": 9.632444366134713e-06, + "loss": 1.1097, + "step": 11210 + }, + { + "epoch": 0.63, + "grad_norm": 6.729514803292108, + "learning_rate": 9.631827320719605e-06, + "loss": 1.1022, + "step": 11215 + }, + { + "epoch": 0.63, + "grad_norm": 6.545240669926472, + "learning_rate": 9.631209777594213e-06, + "loss": 1.1128, + "step": 11220 + }, + { + "epoch": 0.63, + "grad_norm": 6.390526908087484, + "learning_rate": 9.630591736824888e-06, + "loss": 1.1747, + "step": 11225 + }, + { + "epoch": 0.63, + "grad_norm": 5.924949816061919, + "learning_rate": 9.629973198478048e-06, + "loss": 1.1005, + "step": 11230 + }, + { + "epoch": 0.63, + "grad_norm": 10.8916345427556, + "learning_rate": 9.629354162620152e-06, + "loss": 1.1376, + "step": 11235 + }, + { + "epoch": 0.63, + "grad_norm": 6.155518159629471, + "learning_rate": 9.62873462931772e-06, + "loss": 1.101, + "step": 11240 + }, + { + "epoch": 0.63, + "grad_norm": 5.6167227681067695, + "learning_rate": 9.628114598637325e-06, + "loss": 1.1598, + "step": 11245 + }, + { + "epoch": 0.63, + "grad_norm": 6.280942433837521, + "learning_rate": 9.627494070645591e-06, + "loss": 1.1055, + "step": 11250 + }, + { + "epoch": 0.63, + "grad_norm": 5.5000890376880385, + "learning_rate": 9.626873045409198e-06, + "loss": 1.144, + "step": 11255 + }, + { + "epoch": 0.63, + "grad_norm": 8.010584592440216, + "learning_rate": 9.626251522994875e-06, + "loss": 1.0922, + "step": 11260 + }, + { + "epoch": 0.63, + "grad_norm": 22.03434250940769, + "learning_rate": 9.625629503469407e-06, + "loss": 1.0946, + "step": 11265 + }, + { + "epoch": 0.64, + "grad_norm": 20.686737338818922, + "learning_rate": 9.625006986899634e-06, + "loss": 1.1348, + "step": 11270 + }, + { + "epoch": 0.64, + "grad_norm": 6.696667859947064, + "learning_rate": 9.624383973352452e-06, + "loss": 1.1247, + "step": 11275 + }, + { + "epoch": 0.64, + "grad_norm": 17.455077034763566, + "learning_rate": 9.6237604628948e-06, + "loss": 1.1122, + "step": 11280 + }, + { + "epoch": 0.64, + "grad_norm": 26.755922699137464, + "learning_rate": 9.623136455593682e-06, + "loss": 1.1146, + "step": 11285 + }, + { + "epoch": 0.64, + "grad_norm": 13.402977119405639, + "learning_rate": 9.622511951516147e-06, + "loss": 1.1356, + "step": 11290 + }, + { + "epoch": 0.64, + "grad_norm": 15.582304160611613, + "learning_rate": 9.621886950729303e-06, + "loss": 1.1585, + "step": 11295 + }, + { + "epoch": 0.64, + "grad_norm": 10.307559358077564, + "learning_rate": 9.621261453300307e-06, + "loss": 1.1062, + "step": 11300 + }, + { + "epoch": 0.64, + "grad_norm": 25.541829751747773, + "learning_rate": 9.620635459296375e-06, + "loss": 1.1223, + "step": 11305 + }, + { + "epoch": 0.64, + "grad_norm": 6.158057177799695, + "learning_rate": 9.620008968784768e-06, + "loss": 1.1469, + "step": 11310 + }, + { + "epoch": 0.64, + "grad_norm": 10.343254958460044, + "learning_rate": 9.619381981832809e-06, + "loss": 1.1296, + "step": 11315 + }, + { + "epoch": 0.64, + "grad_norm": 22.229399202952123, + "learning_rate": 9.618754498507869e-06, + "loss": 1.0891, + "step": 11320 + }, + { + "epoch": 0.64, + "grad_norm": 11.81759293368276, + "learning_rate": 9.618126518877374e-06, + "loss": 1.0831, + "step": 11325 + }, + { + "epoch": 0.64, + "grad_norm": 6.725847092971947, + "learning_rate": 9.617498043008805e-06, + "loss": 1.0687, + "step": 11330 + }, + { + "epoch": 0.64, + "grad_norm": 12.418814423887673, + "learning_rate": 9.616869070969688e-06, + "loss": 1.1319, + "step": 11335 + }, + { + "epoch": 0.64, + "grad_norm": 84.34338677835348, + "learning_rate": 9.616239602827617e-06, + "loss": 1.1773, + "step": 11340 + }, + { + "epoch": 0.64, + "grad_norm": 30.81992728108807, + "learning_rate": 9.615609638650228e-06, + "loss": 1.1321, + "step": 11345 + }, + { + "epoch": 0.64, + "grad_norm": 12.365825982658668, + "learning_rate": 9.614979178505212e-06, + "loss": 1.1478, + "step": 11350 + }, + { + "epoch": 0.64, + "grad_norm": 12.84936705469163, + "learning_rate": 9.614348222460315e-06, + "loss": 1.0954, + "step": 11355 + }, + { + "epoch": 0.64, + "grad_norm": 8.93458593210332, + "learning_rate": 9.613716770583338e-06, + "loss": 1.1145, + "step": 11360 + }, + { + "epoch": 0.64, + "grad_norm": 13.001447461288683, + "learning_rate": 9.613084822942132e-06, + "loss": 1.1395, + "step": 11365 + }, + { + "epoch": 0.64, + "grad_norm": 6.025687520103381, + "learning_rate": 9.612452379604602e-06, + "loss": 1.1146, + "step": 11370 + }, + { + "epoch": 0.64, + "grad_norm": 8.09606954537574, + "learning_rate": 9.611819440638708e-06, + "loss": 1.1094, + "step": 11375 + }, + { + "epoch": 0.64, + "grad_norm": 15.237321502220459, + "learning_rate": 9.611186006112463e-06, + "loss": 1.1907, + "step": 11380 + }, + { + "epoch": 0.64, + "grad_norm": 6.133020630062902, + "learning_rate": 9.61055207609393e-06, + "loss": 1.1237, + "step": 11385 + }, + { + "epoch": 0.64, + "grad_norm": 24.470807860632377, + "learning_rate": 9.609917650651227e-06, + "loss": 1.1231, + "step": 11390 + }, + { + "epoch": 0.64, + "grad_norm": 9.721493486936613, + "learning_rate": 9.609282729852528e-06, + "loss": 1.1749, + "step": 11395 + }, + { + "epoch": 0.64, + "grad_norm": 8.29379204980772, + "learning_rate": 9.608647313766057e-06, + "loss": 1.1255, + "step": 11400 + }, + { + "epoch": 0.64, + "grad_norm": 11.066054564641332, + "learning_rate": 9.608011402460095e-06, + "loss": 1.1613, + "step": 11405 + }, + { + "epoch": 0.64, + "grad_norm": 6.663241784772767, + "learning_rate": 9.607374996002969e-06, + "loss": 1.0887, + "step": 11410 + }, + { + "epoch": 0.64, + "grad_norm": 25.35957482725709, + "learning_rate": 9.606738094463065e-06, + "loss": 1.1547, + "step": 11415 + }, + { + "epoch": 0.64, + "grad_norm": 34.80978633832712, + "learning_rate": 9.606100697908824e-06, + "loss": 1.2203, + "step": 11420 + }, + { + "epoch": 0.64, + "grad_norm": 9.828161132685219, + "learning_rate": 9.605462806408735e-06, + "loss": 1.1553, + "step": 11425 + }, + { + "epoch": 0.64, + "grad_norm": 32.93371904842403, + "learning_rate": 9.60482442003134e-06, + "loss": 1.1089, + "step": 11430 + }, + { + "epoch": 0.64, + "grad_norm": 31.082856630775527, + "learning_rate": 9.604185538845238e-06, + "loss": 1.1546, + "step": 11435 + }, + { + "epoch": 0.64, + "grad_norm": 32.24987579908004, + "learning_rate": 9.603546162919082e-06, + "loss": 1.1026, + "step": 11440 + }, + { + "epoch": 0.64, + "grad_norm": 20.54133506282557, + "learning_rate": 9.602906292321572e-06, + "loss": 1.155, + "step": 11445 + }, + { + "epoch": 0.65, + "grad_norm": 14.975735053920653, + "learning_rate": 9.602265927121468e-06, + "loss": 1.148, + "step": 11450 + }, + { + "epoch": 0.65, + "grad_norm": 15.164774929887983, + "learning_rate": 9.60162506738758e-06, + "loss": 1.1451, + "step": 11455 + }, + { + "epoch": 0.65, + "grad_norm": 7.773471807523148, + "learning_rate": 9.600983713188767e-06, + "loss": 1.0728, + "step": 11460 + }, + { + "epoch": 0.65, + "grad_norm": 38.009791961725284, + "learning_rate": 9.60034186459395e-06, + "loss": 1.1672, + "step": 11465 + }, + { + "epoch": 0.65, + "grad_norm": 33.001981166921155, + "learning_rate": 9.599699521672096e-06, + "loss": 1.1464, + "step": 11470 + }, + { + "epoch": 0.65, + "grad_norm": 21.661184898745827, + "learning_rate": 9.59905668449223e-06, + "loss": 1.1131, + "step": 11475 + }, + { + "epoch": 0.65, + "grad_norm": 13.865634158608945, + "learning_rate": 9.598413353123423e-06, + "loss": 1.1506, + "step": 11480 + }, + { + "epoch": 0.65, + "grad_norm": 14.847809446167107, + "learning_rate": 9.59776952763481e-06, + "loss": 1.1242, + "step": 11485 + }, + { + "epoch": 0.65, + "grad_norm": 10.802102621857443, + "learning_rate": 9.597125208095568e-06, + "loss": 1.1191, + "step": 11490 + }, + { + "epoch": 0.65, + "grad_norm": 8.835095796388343, + "learning_rate": 9.596480394574934e-06, + "loss": 1.1456, + "step": 11495 + }, + { + "epoch": 0.65, + "grad_norm": 10.391157972821688, + "learning_rate": 9.595835087142197e-06, + "loss": 1.125, + "step": 11500 + }, + { + "epoch": 0.65, + "grad_norm": 13.646892000756408, + "learning_rate": 9.595189285866695e-06, + "loss": 1.1115, + "step": 11505 + }, + { + "epoch": 0.65, + "grad_norm": 10.334311064510455, + "learning_rate": 9.594542990817826e-06, + "loss": 1.1285, + "step": 11510 + }, + { + "epoch": 0.65, + "grad_norm": 11.386206355177405, + "learning_rate": 9.593896202065034e-06, + "loss": 1.105, + "step": 11515 + }, + { + "epoch": 0.65, + "grad_norm": 18.89044425074765, + "learning_rate": 9.593248919677821e-06, + "loss": 1.1278, + "step": 11520 + }, + { + "epoch": 0.65, + "grad_norm": 15.383003304725248, + "learning_rate": 9.592601143725741e-06, + "loss": 1.1239, + "step": 11525 + }, + { + "epoch": 0.65, + "grad_norm": 8.073223723033902, + "learning_rate": 9.591952874278398e-06, + "loss": 1.1152, + "step": 11530 + }, + { + "epoch": 0.65, + "grad_norm": 10.953832256371221, + "learning_rate": 9.591304111405453e-06, + "loss": 1.1307, + "step": 11535 + }, + { + "epoch": 0.65, + "grad_norm": 12.423248294487648, + "learning_rate": 9.59065485517662e-06, + "loss": 1.1665, + "step": 11540 + }, + { + "epoch": 0.65, + "grad_norm": 17.34771202769372, + "learning_rate": 9.590005105661661e-06, + "loss": 1.0943, + "step": 11545 + }, + { + "epoch": 0.65, + "grad_norm": 15.829346074591298, + "learning_rate": 9.589354862930396e-06, + "loss": 1.0949, + "step": 11550 + }, + { + "epoch": 0.65, + "grad_norm": 43.08719051302611, + "learning_rate": 9.588704127052697e-06, + "loss": 1.1418, + "step": 11555 + }, + { + "epoch": 0.65, + "grad_norm": 61.34795040610486, + "learning_rate": 9.58805289809849e-06, + "loss": 1.1212, + "step": 11560 + }, + { + "epoch": 0.65, + "grad_norm": 8.887902416872716, + "learning_rate": 9.587401176137747e-06, + "loss": 1.1878, + "step": 11565 + }, + { + "epoch": 0.65, + "grad_norm": 31.760457368731775, + "learning_rate": 9.586748961240504e-06, + "loss": 1.1356, + "step": 11570 + }, + { + "epoch": 0.65, + "grad_norm": 22.836393516635077, + "learning_rate": 9.586096253476843e-06, + "loss": 1.1151, + "step": 11575 + }, + { + "epoch": 0.65, + "grad_norm": 7.1359009032005165, + "learning_rate": 9.585443052916897e-06, + "loss": 1.1344, + "step": 11580 + }, + { + "epoch": 0.65, + "grad_norm": 5.691199733776354, + "learning_rate": 9.58478935963086e-06, + "loss": 1.117, + "step": 11585 + }, + { + "epoch": 0.65, + "grad_norm": 7.466776266088328, + "learning_rate": 9.58413517368897e-06, + "loss": 1.093, + "step": 11590 + }, + { + "epoch": 0.65, + "grad_norm": 11.753417729649698, + "learning_rate": 9.583480495161525e-06, + "loss": 1.1043, + "step": 11595 + }, + { + "epoch": 0.65, + "grad_norm": 11.974458243917208, + "learning_rate": 9.582825324118874e-06, + "loss": 1.0903, + "step": 11600 + }, + { + "epoch": 0.65, + "grad_norm": 16.199533039789745, + "learning_rate": 9.582169660631416e-06, + "loss": 1.113, + "step": 11605 + }, + { + "epoch": 0.65, + "grad_norm": 7.3870764961647035, + "learning_rate": 9.581513504769603e-06, + "loss": 1.0672, + "step": 11610 + }, + { + "epoch": 0.65, + "grad_norm": 5.472341001370613, + "learning_rate": 9.580856856603947e-06, + "loss": 1.1478, + "step": 11615 + }, + { + "epoch": 0.65, + "grad_norm": 8.022746532359818, + "learning_rate": 9.580199716205003e-06, + "loss": 1.1633, + "step": 11620 + }, + { + "epoch": 0.66, + "grad_norm": 11.709660529258535, + "learning_rate": 9.579542083643387e-06, + "loss": 1.1064, + "step": 11625 + }, + { + "epoch": 0.66, + "grad_norm": 11.06231994676766, + "learning_rate": 9.578883958989762e-06, + "loss": 1.093, + "step": 11630 + }, + { + "epoch": 0.66, + "grad_norm": 5.955678856201514, + "learning_rate": 9.578225342314848e-06, + "loss": 1.1366, + "step": 11635 + }, + { + "epoch": 0.66, + "grad_norm": 10.496948620728515, + "learning_rate": 9.577566233689416e-06, + "loss": 1.15, + "step": 11640 + }, + { + "epoch": 0.66, + "grad_norm": 5.55004295467598, + "learning_rate": 9.576906633184289e-06, + "loss": 1.1449, + "step": 11645 + }, + { + "epoch": 0.66, + "grad_norm": 8.379818781430924, + "learning_rate": 9.576246540870345e-06, + "loss": 1.1382, + "step": 11650 + }, + { + "epoch": 0.66, + "grad_norm": 11.247858126577775, + "learning_rate": 9.575585956818513e-06, + "loss": 1.1147, + "step": 11655 + }, + { + "epoch": 0.66, + "grad_norm": 6.38251281562076, + "learning_rate": 9.574924881099778e-06, + "loss": 1.0541, + "step": 11660 + }, + { + "epoch": 0.66, + "grad_norm": 6.237389167544144, + "learning_rate": 9.574263313785173e-06, + "loss": 1.1484, + "step": 11665 + }, + { + "epoch": 0.66, + "grad_norm": 9.060677713515952, + "learning_rate": 9.573601254945787e-06, + "loss": 1.2049, + "step": 11670 + }, + { + "epoch": 0.66, + "grad_norm": 9.757179417318337, + "learning_rate": 9.572938704652762e-06, + "loss": 1.1077, + "step": 11675 + }, + { + "epoch": 0.66, + "grad_norm": 6.052695169299957, + "learning_rate": 9.57227566297729e-06, + "loss": 1.1462, + "step": 11680 + }, + { + "epoch": 0.66, + "grad_norm": 7.8877390017597495, + "learning_rate": 9.571612129990619e-06, + "loss": 1.0599, + "step": 11685 + }, + { + "epoch": 0.66, + "grad_norm": 7.081068887101823, + "learning_rate": 9.57094810576405e-06, + "loss": 1.1275, + "step": 11690 + }, + { + "epoch": 0.66, + "grad_norm": 7.668535300593515, + "learning_rate": 9.570283590368932e-06, + "loss": 1.1313, + "step": 11695 + }, + { + "epoch": 0.66, + "grad_norm": 11.314851210452614, + "learning_rate": 9.569618583876675e-06, + "loss": 1.1157, + "step": 11700 + }, + { + "epoch": 0.66, + "grad_norm": 12.089547607492213, + "learning_rate": 9.568953086358732e-06, + "loss": 1.0894, + "step": 11705 + }, + { + "epoch": 0.66, + "grad_norm": 6.295051141884013, + "learning_rate": 9.568287097886614e-06, + "loss": 1.0726, + "step": 11710 + }, + { + "epoch": 0.66, + "grad_norm": 7.4161503464096405, + "learning_rate": 9.567620618531889e-06, + "loss": 1.1214, + "step": 11715 + }, + { + "epoch": 0.66, + "grad_norm": 11.554322530448706, + "learning_rate": 9.566953648366169e-06, + "loss": 1.107, + "step": 11720 + }, + { + "epoch": 0.66, + "grad_norm": 6.651018714519556, + "learning_rate": 9.566286187461122e-06, + "loss": 1.0729, + "step": 11725 + }, + { + "epoch": 0.66, + "grad_norm": 8.22959428725363, + "learning_rate": 9.565618235888474e-06, + "loss": 1.1256, + "step": 11730 + }, + { + "epoch": 0.66, + "grad_norm": 6.850813398519219, + "learning_rate": 9.564949793719997e-06, + "loss": 1.1022, + "step": 11735 + }, + { + "epoch": 0.66, + "grad_norm": 5.90267362929282, + "learning_rate": 9.564280861027517e-06, + "loss": 1.116, + "step": 11740 + }, + { + "epoch": 0.66, + "grad_norm": 6.369881594861055, + "learning_rate": 9.563611437882914e-06, + "loss": 1.0746, + "step": 11745 + }, + { + "epoch": 0.66, + "grad_norm": 5.54724950131561, + "learning_rate": 9.562941524358123e-06, + "loss": 1.0923, + "step": 11750 + }, + { + "epoch": 0.66, + "grad_norm": 9.606386548664831, + "learning_rate": 9.562271120525127e-06, + "loss": 1.1049, + "step": 11755 + }, + { + "epoch": 0.66, + "grad_norm": 6.328639732082031, + "learning_rate": 9.561600226455964e-06, + "loss": 1.1159, + "step": 11760 + }, + { + "epoch": 0.66, + "grad_norm": 6.54670197631698, + "learning_rate": 9.560928842222724e-06, + "loss": 1.0662, + "step": 11765 + }, + { + "epoch": 0.66, + "grad_norm": 8.447986882313948, + "learning_rate": 9.560256967897553e-06, + "loss": 1.1336, + "step": 11770 + }, + { + "epoch": 0.66, + "grad_norm": 7.7565950850095975, + "learning_rate": 9.559584603552642e-06, + "loss": 1.1301, + "step": 11775 + }, + { + "epoch": 0.66, + "grad_norm": 5.995963174413299, + "learning_rate": 9.558911749260245e-06, + "loss": 1.0952, + "step": 11780 + }, + { + "epoch": 0.66, + "grad_norm": 6.126469703774925, + "learning_rate": 9.558238405092658e-06, + "loss": 1.0694, + "step": 11785 + }, + { + "epoch": 0.66, + "grad_norm": 5.828798065105701, + "learning_rate": 9.55756457112224e-06, + "loss": 1.0996, + "step": 11790 + }, + { + "epoch": 0.66, + "grad_norm": 6.771380834874629, + "learning_rate": 9.556890247421393e-06, + "loss": 1.09, + "step": 11795 + }, + { + "epoch": 0.66, + "grad_norm": 5.759844219172296, + "learning_rate": 9.556215434062579e-06, + "loss": 1.1084, + "step": 11800 + }, + { + "epoch": 0.67, + "grad_norm": 6.8439149752138055, + "learning_rate": 9.555540131118308e-06, + "loss": 1.1204, + "step": 11805 + }, + { + "epoch": 0.67, + "grad_norm": 6.6160793086283505, + "learning_rate": 9.554864338661144e-06, + "loss": 1.133, + "step": 11810 + }, + { + "epoch": 0.67, + "grad_norm": 7.417168157300613, + "learning_rate": 9.554188056763706e-06, + "loss": 1.0639, + "step": 11815 + }, + { + "epoch": 0.67, + "grad_norm": 8.622621161220346, + "learning_rate": 9.553511285498661e-06, + "loss": 1.1193, + "step": 11820 + }, + { + "epoch": 0.67, + "grad_norm": 5.965269087906013, + "learning_rate": 9.552834024938733e-06, + "loss": 1.1219, + "step": 11825 + }, + { + "epoch": 0.67, + "grad_norm": 7.255698255179819, + "learning_rate": 9.552156275156696e-06, + "loss": 1.1086, + "step": 11830 + }, + { + "epoch": 0.67, + "grad_norm": 5.945998298437943, + "learning_rate": 9.551478036225376e-06, + "loss": 1.1316, + "step": 11835 + }, + { + "epoch": 0.67, + "grad_norm": 6.717388145642351, + "learning_rate": 9.550799308217655e-06, + "loss": 1.1139, + "step": 11840 + }, + { + "epoch": 0.67, + "grad_norm": 7.814865182414534, + "learning_rate": 9.550120091206463e-06, + "loss": 1.1251, + "step": 11845 + }, + { + "epoch": 0.67, + "grad_norm": 10.874893174733337, + "learning_rate": 9.549440385264786e-06, + "loss": 1.1257, + "step": 11850 + }, + { + "epoch": 0.67, + "grad_norm": 14.089310520514331, + "learning_rate": 9.548760190465663e-06, + "loss": 1.1265, + "step": 11855 + }, + { + "epoch": 0.67, + "grad_norm": 6.172026895489223, + "learning_rate": 9.54807950688218e-06, + "loss": 1.074, + "step": 11860 + }, + { + "epoch": 0.67, + "grad_norm": 9.840493183637697, + "learning_rate": 9.547398334587483e-06, + "loss": 1.125, + "step": 11865 + }, + { + "epoch": 0.67, + "grad_norm": 9.200894965676413, + "learning_rate": 9.546716673654766e-06, + "loss": 1.1552, + "step": 11870 + }, + { + "epoch": 0.67, + "grad_norm": 12.473078826019139, + "learning_rate": 9.546034524157275e-06, + "loss": 1.1332, + "step": 11875 + }, + { + "epoch": 0.67, + "grad_norm": 8.216539156849562, + "learning_rate": 9.545351886168313e-06, + "loss": 1.1079, + "step": 11880 + }, + { + "epoch": 0.67, + "grad_norm": 9.450721010310948, + "learning_rate": 9.544668759761228e-06, + "loss": 1.1098, + "step": 11885 + }, + { + "epoch": 0.67, + "grad_norm": 7.544296233518932, + "learning_rate": 9.543985145009431e-06, + "loss": 1.1314, + "step": 11890 + }, + { + "epoch": 0.67, + "grad_norm": 19.429425340593898, + "learning_rate": 9.543301041986375e-06, + "loss": 1.1266, + "step": 11895 + }, + { + "epoch": 0.67, + "grad_norm": 18.186894712391446, + "learning_rate": 9.542616450765569e-06, + "loss": 1.1712, + "step": 11900 + }, + { + "epoch": 0.67, + "grad_norm": 11.400204099652655, + "learning_rate": 9.541931371420579e-06, + "loss": 1.0878, + "step": 11905 + }, + { + "epoch": 0.67, + "grad_norm": 20.827900543882034, + "learning_rate": 9.541245804025018e-06, + "loss": 1.1044, + "step": 11910 + }, + { + "epoch": 0.67, + "grad_norm": 7.128200251301965, + "learning_rate": 9.540559748652553e-06, + "loss": 1.122, + "step": 11915 + }, + { + "epoch": 0.67, + "grad_norm": 17.20857238719293, + "learning_rate": 9.539873205376903e-06, + "loss": 1.122, + "step": 11920 + }, + { + "epoch": 0.67, + "grad_norm": 27.086352044189386, + "learning_rate": 9.539186174271843e-06, + "loss": 1.1286, + "step": 11925 + }, + { + "epoch": 0.67, + "grad_norm": 8.218026239759777, + "learning_rate": 9.538498655411194e-06, + "loss": 1.1099, + "step": 11930 + }, + { + "epoch": 0.67, + "grad_norm": 9.664747237291802, + "learning_rate": 9.537810648868835e-06, + "loss": 1.1122, + "step": 11935 + }, + { + "epoch": 0.67, + "grad_norm": 8.146816761770653, + "learning_rate": 9.537122154718697e-06, + "loss": 1.0863, + "step": 11940 + }, + { + "epoch": 0.67, + "grad_norm": 6.066789291300878, + "learning_rate": 9.536433173034756e-06, + "loss": 1.1175, + "step": 11945 + }, + { + "epoch": 0.67, + "grad_norm": 10.20282720723064, + "learning_rate": 9.535743703891052e-06, + "loss": 1.0828, + "step": 11950 + }, + { + "epoch": 0.67, + "grad_norm": 14.36317320547394, + "learning_rate": 9.535053747361668e-06, + "loss": 1.2184, + "step": 11955 + }, + { + "epoch": 0.67, + "grad_norm": 5.969134634017165, + "learning_rate": 9.534363303520747e-06, + "loss": 1.1383, + "step": 11960 + }, + { + "epoch": 0.67, + "grad_norm": 5.74126901801494, + "learning_rate": 9.533672372442472e-06, + "loss": 1.0959, + "step": 11965 + }, + { + "epoch": 0.67, + "grad_norm": 8.145065327359825, + "learning_rate": 9.532980954201096e-06, + "loss": 1.1254, + "step": 11970 + }, + { + "epoch": 0.67, + "grad_norm": 10.073135816460807, + "learning_rate": 9.532289048870911e-06, + "loss": 1.1273, + "step": 11975 + }, + { + "epoch": 0.68, + "grad_norm": 8.701958034273098, + "learning_rate": 9.531596656526263e-06, + "loss": 1.165, + "step": 11980 + }, + { + "epoch": 0.68, + "grad_norm": 11.511324696376247, + "learning_rate": 9.530903777241558e-06, + "loss": 1.0995, + "step": 11985 + }, + { + "epoch": 0.68, + "grad_norm": 12.987164352630714, + "learning_rate": 9.530210411091244e-06, + "loss": 1.1514, + "step": 11990 + }, + { + "epoch": 0.68, + "grad_norm": 25.501563990598655, + "learning_rate": 9.529516558149828e-06, + "loss": 1.1004, + "step": 11995 + }, + { + "epoch": 0.68, + "grad_norm": 6.545471466725064, + "learning_rate": 9.528822218491867e-06, + "loss": 1.109, + "step": 12000 + }, + { + "epoch": 0.68, + "grad_norm": 8.807814285801163, + "learning_rate": 9.52812739219197e-06, + "loss": 1.1089, + "step": 12005 + }, + { + "epoch": 0.68, + "grad_norm": 6.0473320061145674, + "learning_rate": 9.527432079324804e-06, + "loss": 1.0957, + "step": 12010 + }, + { + "epoch": 0.68, + "grad_norm": 9.348008458634869, + "learning_rate": 9.526736279965077e-06, + "loss": 1.0909, + "step": 12015 + }, + { + "epoch": 0.68, + "grad_norm": 8.184962795913268, + "learning_rate": 9.526039994187561e-06, + "loss": 1.1278, + "step": 12020 + }, + { + "epoch": 0.68, + "grad_norm": 15.793071498515623, + "learning_rate": 9.525343222067071e-06, + "loss": 1.1198, + "step": 12025 + }, + { + "epoch": 0.68, + "grad_norm": 26.328754883464295, + "learning_rate": 9.524645963678481e-06, + "loss": 1.1322, + "step": 12030 + }, + { + "epoch": 0.68, + "grad_norm": 16.585776506978878, + "learning_rate": 9.523948219096712e-06, + "loss": 1.1266, + "step": 12035 + }, + { + "epoch": 0.68, + "grad_norm": 7.381052915621866, + "learning_rate": 9.523249988396744e-06, + "loss": 1.1359, + "step": 12040 + }, + { + "epoch": 0.68, + "grad_norm": 28.246476797003368, + "learning_rate": 9.5225512716536e-06, + "loss": 1.1124, + "step": 12045 + }, + { + "epoch": 0.68, + "grad_norm": 17.77813837924769, + "learning_rate": 9.521852068942363e-06, + "loss": 1.0634, + "step": 12050 + }, + { + "epoch": 0.68, + "grad_norm": 17.407532549369275, + "learning_rate": 9.521152380338163e-06, + "loss": 1.1502, + "step": 12055 + }, + { + "epoch": 0.68, + "grad_norm": 10.363183334664006, + "learning_rate": 9.520452205916188e-06, + "loss": 1.1825, + "step": 12060 + }, + { + "epoch": 0.68, + "grad_norm": 8.088172289393352, + "learning_rate": 9.519751545751674e-06, + "loss": 1.1215, + "step": 12065 + }, + { + "epoch": 0.68, + "grad_norm": 15.068443371883795, + "learning_rate": 9.519050399919908e-06, + "loss": 1.1404, + "step": 12070 + }, + { + "epoch": 0.68, + "grad_norm": 15.890841705730148, + "learning_rate": 9.518348768496233e-06, + "loss": 1.0997, + "step": 12075 + }, + { + "epoch": 0.68, + "grad_norm": 34.471194434444435, + "learning_rate": 9.51764665155604e-06, + "loss": 1.1898, + "step": 12080 + }, + { + "epoch": 0.68, + "grad_norm": 5.6024386929285575, + "learning_rate": 9.51694404917478e-06, + "loss": 1.1087, + "step": 12085 + }, + { + "epoch": 0.68, + "grad_norm": 19.07403481880703, + "learning_rate": 9.516240961427943e-06, + "loss": 1.0759, + "step": 12090 + }, + { + "epoch": 0.68, + "grad_norm": 20.854328530035872, + "learning_rate": 9.515537388391085e-06, + "loss": 1.1387, + "step": 12095 + }, + { + "epoch": 0.68, + "grad_norm": 10.475056391091014, + "learning_rate": 9.514833330139805e-06, + "loss": 1.1455, + "step": 12100 + }, + { + "epoch": 0.68, + "grad_norm": 11.198787155357344, + "learning_rate": 9.51412878674976e-06, + "loss": 1.1423, + "step": 12105 + }, + { + "epoch": 0.68, + "grad_norm": 13.039969911802466, + "learning_rate": 9.513423758296653e-06, + "loss": 1.1566, + "step": 12110 + }, + { + "epoch": 0.68, + "grad_norm": 7.913201267676782, + "learning_rate": 9.512718244856245e-06, + "loss": 1.0734, + "step": 12115 + }, + { + "epoch": 0.68, + "grad_norm": 7.850794747649243, + "learning_rate": 9.512012246504346e-06, + "loss": 1.1062, + "step": 12120 + }, + { + "epoch": 0.68, + "grad_norm": 9.115240551426224, + "learning_rate": 9.511305763316817e-06, + "loss": 1.1188, + "step": 12125 + }, + { + "epoch": 0.68, + "grad_norm": 9.138770327836259, + "learning_rate": 9.510598795369575e-06, + "loss": 1.0632, + "step": 12130 + }, + { + "epoch": 0.68, + "grad_norm": 16.139595677373027, + "learning_rate": 9.509891342738583e-06, + "loss": 1.1162, + "step": 12135 + }, + { + "epoch": 0.68, + "grad_norm": 10.661063797525696, + "learning_rate": 9.509183405499865e-06, + "loss": 1.0781, + "step": 12140 + }, + { + "epoch": 0.68, + "grad_norm": 24.926770999531197, + "learning_rate": 9.508474983729487e-06, + "loss": 1.1565, + "step": 12145 + }, + { + "epoch": 0.68, + "grad_norm": 12.598977988434706, + "learning_rate": 9.507766077503577e-06, + "loss": 1.1564, + "step": 12150 + }, + { + "epoch": 0.68, + "grad_norm": 8.651586997734471, + "learning_rate": 9.507056686898308e-06, + "loss": 1.0918, + "step": 12155 + }, + { + "epoch": 0.69, + "grad_norm": 22.631146266550914, + "learning_rate": 9.506346811989905e-06, + "loss": 1.134, + "step": 12160 + }, + { + "epoch": 0.69, + "grad_norm": 14.340879746240121, + "learning_rate": 9.50563645285465e-06, + "loss": 1.1409, + "step": 12165 + }, + { + "epoch": 0.69, + "grad_norm": 7.252479325858253, + "learning_rate": 9.504925609568874e-06, + "loss": 1.1211, + "step": 12170 + }, + { + "epoch": 0.69, + "grad_norm": 6.2896935670529395, + "learning_rate": 9.504214282208957e-06, + "loss": 1.147, + "step": 12175 + }, + { + "epoch": 0.69, + "grad_norm": 5.6237948896331735, + "learning_rate": 9.503502470851338e-06, + "loss": 1.2065, + "step": 12180 + }, + { + "epoch": 0.69, + "grad_norm": 8.299142755373992, + "learning_rate": 9.502790175572506e-06, + "loss": 1.1081, + "step": 12185 + }, + { + "epoch": 0.69, + "grad_norm": 9.382570891103345, + "learning_rate": 9.502077396448992e-06, + "loss": 1.0987, + "step": 12190 + }, + { + "epoch": 0.69, + "grad_norm": 10.755366928410425, + "learning_rate": 9.501364133557396e-06, + "loss": 1.0734, + "step": 12195 + }, + { + "epoch": 0.69, + "grad_norm": 7.063385333619541, + "learning_rate": 9.500650386974358e-06, + "loss": 1.1213, + "step": 12200 + }, + { + "epoch": 0.69, + "grad_norm": 8.343002531403815, + "learning_rate": 9.499936156776571e-06, + "loss": 1.1006, + "step": 12205 + }, + { + "epoch": 0.69, + "grad_norm": 6.078932580486301, + "learning_rate": 9.499221443040784e-06, + "loss": 1.0581, + "step": 12210 + }, + { + "epoch": 0.69, + "grad_norm": 5.873417008498132, + "learning_rate": 9.4985062458438e-06, + "loss": 1.1499, + "step": 12215 + }, + { + "epoch": 0.69, + "grad_norm": 5.043719682987097, + "learning_rate": 9.497790565262464e-06, + "loss": 1.0777, + "step": 12220 + }, + { + "epoch": 0.69, + "grad_norm": 10.991103333781751, + "learning_rate": 9.49707440137368e-06, + "loss": 1.1144, + "step": 12225 + }, + { + "epoch": 0.69, + "grad_norm": 5.454431628315685, + "learning_rate": 9.496357754254406e-06, + "loss": 1.1191, + "step": 12230 + }, + { + "epoch": 0.69, + "grad_norm": 10.117265450369398, + "learning_rate": 9.495640623981648e-06, + "loss": 1.1407, + "step": 12235 + }, + { + "epoch": 0.69, + "grad_norm": 7.431191236012026, + "learning_rate": 9.494923010632465e-06, + "loss": 1.1575, + "step": 12240 + }, + { + "epoch": 0.69, + "grad_norm": 23.020638829126447, + "learning_rate": 9.494204914283964e-06, + "loss": 1.1306, + "step": 12245 + }, + { + "epoch": 0.69, + "grad_norm": 7.48881759716046, + "learning_rate": 9.493486335013313e-06, + "loss": 1.1143, + "step": 12250 + }, + { + "epoch": 0.69, + "grad_norm": 9.75435397065997, + "learning_rate": 9.492767272897723e-06, + "loss": 1.1055, + "step": 12255 + }, + { + "epoch": 0.69, + "grad_norm": 16.7700584546842, + "learning_rate": 9.492047728014463e-06, + "loss": 1.1206, + "step": 12260 + }, + { + "epoch": 0.69, + "grad_norm": 8.537033957318727, + "learning_rate": 9.491327700440847e-06, + "loss": 1.1481, + "step": 12265 + }, + { + "epoch": 0.69, + "grad_norm": 17.04442380933225, + "learning_rate": 9.490607190254248e-06, + "loss": 1.1422, + "step": 12270 + }, + { + "epoch": 0.69, + "grad_norm": 13.898531664093339, + "learning_rate": 9.48988619753209e-06, + "loss": 1.108, + "step": 12275 + }, + { + "epoch": 0.69, + "grad_norm": 8.662052853577302, + "learning_rate": 9.489164722351843e-06, + "loss": 1.1449, + "step": 12280 + }, + { + "epoch": 0.69, + "grad_norm": 6.7918430639108, + "learning_rate": 9.488442764791034e-06, + "loss": 1.1292, + "step": 12285 + }, + { + "epoch": 0.69, + "grad_norm": 6.608927898417105, + "learning_rate": 9.487720324927243e-06, + "loss": 1.0711, + "step": 12290 + }, + { + "epoch": 0.69, + "grad_norm": 8.709125647527724, + "learning_rate": 9.486997402838096e-06, + "loss": 1.1668, + "step": 12295 + }, + { + "epoch": 0.69, + "grad_norm": 7.814286830550152, + "learning_rate": 9.486273998601276e-06, + "loss": 1.1231, + "step": 12300 + }, + { + "epoch": 0.69, + "grad_norm": 6.057419899841028, + "learning_rate": 9.485550112294515e-06, + "loss": 1.1414, + "step": 12305 + }, + { + "epoch": 0.69, + "grad_norm": 5.894706702379888, + "learning_rate": 9.484825743995598e-06, + "loss": 1.0623, + "step": 12310 + }, + { + "epoch": 0.69, + "grad_norm": 5.476779439178674, + "learning_rate": 9.484100893782362e-06, + "loss": 1.1581, + "step": 12315 + }, + { + "epoch": 0.69, + "grad_norm": 10.564348801849615, + "learning_rate": 9.483375561732696e-06, + "loss": 1.1211, + "step": 12320 + }, + { + "epoch": 0.69, + "grad_norm": 17.079720238698098, + "learning_rate": 9.482649747924538e-06, + "loss": 1.1678, + "step": 12325 + }, + { + "epoch": 0.69, + "grad_norm": 11.497766593119248, + "learning_rate": 9.481923452435882e-06, + "loss": 1.1616, + "step": 12330 + }, + { + "epoch": 0.7, + "grad_norm": 14.077638479398647, + "learning_rate": 9.481196675344769e-06, + "loss": 1.1073, + "step": 12335 + }, + { + "epoch": 0.7, + "grad_norm": 9.133263841220293, + "learning_rate": 9.480469416729297e-06, + "loss": 1.1109, + "step": 12340 + }, + { + "epoch": 0.7, + "grad_norm": 10.066355834477365, + "learning_rate": 9.479741676667614e-06, + "loss": 1.1155, + "step": 12345 + }, + { + "epoch": 0.7, + "grad_norm": 34.65322650311844, + "learning_rate": 9.479013455237916e-06, + "loss": 1.1326, + "step": 12350 + }, + { + "epoch": 0.7, + "grad_norm": 25.697613214100894, + "learning_rate": 9.478284752518455e-06, + "loss": 1.1244, + "step": 12355 + }, + { + "epoch": 0.7, + "grad_norm": 13.598786341599224, + "learning_rate": 9.477555568587533e-06, + "loss": 1.1553, + "step": 12360 + }, + { + "epoch": 0.7, + "grad_norm": 7.598780582515386, + "learning_rate": 9.476825903523503e-06, + "loss": 1.1745, + "step": 12365 + }, + { + "epoch": 0.7, + "grad_norm": 30.328799584656238, + "learning_rate": 9.476095757404775e-06, + "loss": 1.1028, + "step": 12370 + }, + { + "epoch": 0.7, + "grad_norm": 27.474643890991686, + "learning_rate": 9.4753651303098e-06, + "loss": 1.1761, + "step": 12375 + }, + { + "epoch": 0.7, + "grad_norm": 15.396515626595093, + "learning_rate": 9.474634022317093e-06, + "loss": 1.121, + "step": 12380 + }, + { + "epoch": 0.7, + "grad_norm": 12.16064964195702, + "learning_rate": 9.473902433505211e-06, + "loss": 1.1631, + "step": 12385 + }, + { + "epoch": 0.7, + "grad_norm": 21.314931852071062, + "learning_rate": 9.473170363952768e-06, + "loss": 1.1405, + "step": 12390 + }, + { + "epoch": 0.7, + "grad_norm": 9.304196512597075, + "learning_rate": 9.472437813738427e-06, + "loss": 1.1715, + "step": 12395 + }, + { + "epoch": 0.7, + "grad_norm": 7.657950395685992, + "learning_rate": 9.471704782940905e-06, + "loss": 1.1413, + "step": 12400 + }, + { + "epoch": 0.7, + "grad_norm": 7.089581989436484, + "learning_rate": 9.47097127163897e-06, + "loss": 1.1055, + "step": 12405 + }, + { + "epoch": 0.7, + "grad_norm": 15.120573640959808, + "learning_rate": 9.470237279911439e-06, + "loss": 1.1003, + "step": 12410 + }, + { + "epoch": 0.7, + "grad_norm": 5.316663572348183, + "learning_rate": 9.469502807837184e-06, + "loss": 1.115, + "step": 12415 + }, + { + "epoch": 0.7, + "grad_norm": 8.674143620666513, + "learning_rate": 9.468767855495126e-06, + "loss": 1.0986, + "step": 12420 + }, + { + "epoch": 0.7, + "grad_norm": 11.409574716827853, + "learning_rate": 9.468032422964241e-06, + "loss": 1.0582, + "step": 12425 + }, + { + "epoch": 0.7, + "grad_norm": 12.799152871815183, + "learning_rate": 9.467296510323553e-06, + "loss": 1.1067, + "step": 12430 + }, + { + "epoch": 0.7, + "grad_norm": 38.93868324170623, + "learning_rate": 9.466560117652138e-06, + "loss": 1.156, + "step": 12435 + }, + { + "epoch": 0.7, + "grad_norm": 13.158157085498642, + "learning_rate": 9.465823245029128e-06, + "loss": 1.1023, + "step": 12440 + }, + { + "epoch": 0.7, + "grad_norm": 15.20520513141992, + "learning_rate": 9.465085892533698e-06, + "loss": 1.1292, + "step": 12445 + }, + { + "epoch": 0.7, + "grad_norm": 8.32160737032039, + "learning_rate": 9.464348060245086e-06, + "loss": 1.0815, + "step": 12450 + }, + { + "epoch": 0.7, + "grad_norm": 7.380708385902926, + "learning_rate": 9.463609748242571e-06, + "loss": 1.1177, + "step": 12455 + }, + { + "epoch": 0.7, + "grad_norm": 18.72739984623385, + "learning_rate": 9.46287095660549e-06, + "loss": 1.1198, + "step": 12460 + }, + { + "epoch": 0.7, + "grad_norm": 9.125505495264395, + "learning_rate": 9.462131685413227e-06, + "loss": 1.116, + "step": 12465 + }, + { + "epoch": 0.7, + "grad_norm": 5.8834113702680035, + "learning_rate": 9.461391934745222e-06, + "loss": 1.1438, + "step": 12470 + }, + { + "epoch": 0.7, + "grad_norm": 9.843331232735709, + "learning_rate": 9.460651704680965e-06, + "loss": 1.1385, + "step": 12475 + }, + { + "epoch": 0.7, + "grad_norm": 5.1626993983384155, + "learning_rate": 9.459910995299995e-06, + "loss": 1.1198, + "step": 12480 + }, + { + "epoch": 0.7, + "grad_norm": 29.643221699883394, + "learning_rate": 9.459169806681906e-06, + "loss": 1.1186, + "step": 12485 + }, + { + "epoch": 0.7, + "grad_norm": 21.42517040378149, + "learning_rate": 9.458428138906342e-06, + "loss": 1.1036, + "step": 12490 + }, + { + "epoch": 0.7, + "grad_norm": 9.556379813323755, + "learning_rate": 9.457685992052996e-06, + "loss": 1.1198, + "step": 12495 + }, + { + "epoch": 0.7, + "grad_norm": 8.551313425469406, + "learning_rate": 9.456943366201618e-06, + "loss": 1.095, + "step": 12500 + }, + { + "epoch": 0.7, + "grad_norm": 7.183185951237004, + "learning_rate": 9.456200261432006e-06, + "loss": 1.1424, + "step": 12505 + }, + { + "epoch": 0.7, + "grad_norm": 13.03201152976163, + "learning_rate": 9.455456677824009e-06, + "loss": 1.1254, + "step": 12510 + }, + { + "epoch": 0.71, + "grad_norm": 8.551929579653649, + "learning_rate": 9.454712615457528e-06, + "loss": 1.1135, + "step": 12515 + }, + { + "epoch": 0.71, + "grad_norm": 12.795386521634065, + "learning_rate": 9.453968074412516e-06, + "loss": 1.0912, + "step": 12520 + }, + { + "epoch": 0.71, + "grad_norm": 13.142409810564299, + "learning_rate": 9.453223054768978e-06, + "loss": 1.1362, + "step": 12525 + }, + { + "epoch": 0.71, + "grad_norm": 14.122040193018915, + "learning_rate": 9.45247755660697e-06, + "loss": 1.1672, + "step": 12530 + }, + { + "epoch": 0.71, + "grad_norm": 18.03597120205186, + "learning_rate": 9.451731580006596e-06, + "loss": 1.1714, + "step": 12535 + }, + { + "epoch": 0.71, + "grad_norm": 17.957972181590634, + "learning_rate": 9.450985125048018e-06, + "loss": 1.1231, + "step": 12540 + }, + { + "epoch": 0.71, + "grad_norm": 9.068861456533899, + "learning_rate": 9.450238191811445e-06, + "loss": 1.1487, + "step": 12545 + }, + { + "epoch": 0.71, + "grad_norm": 7.735441167765313, + "learning_rate": 9.449490780377139e-06, + "loss": 1.1021, + "step": 12550 + }, + { + "epoch": 0.71, + "grad_norm": 10.475548072152762, + "learning_rate": 9.44874289082541e-06, + "loss": 1.1374, + "step": 12555 + }, + { + "epoch": 0.71, + "grad_norm": 6.076666324686386, + "learning_rate": 9.447994523236623e-06, + "loss": 1.1247, + "step": 12560 + }, + { + "epoch": 0.71, + "grad_norm": 4.846464886781207, + "learning_rate": 9.447245677691194e-06, + "loss": 1.1001, + "step": 12565 + }, + { + "epoch": 0.71, + "grad_norm": 12.457530808912342, + "learning_rate": 9.446496354269591e-06, + "loss": 1.1332, + "step": 12570 + }, + { + "epoch": 0.71, + "grad_norm": 13.20611072601647, + "learning_rate": 9.44574655305233e-06, + "loss": 1.1267, + "step": 12575 + }, + { + "epoch": 0.71, + "grad_norm": 5.860558295108078, + "learning_rate": 9.444996274119981e-06, + "loss": 1.1038, + "step": 12580 + }, + { + "epoch": 0.71, + "grad_norm": 5.990814196429046, + "learning_rate": 9.444245517553168e-06, + "loss": 1.115, + "step": 12585 + }, + { + "epoch": 0.71, + "grad_norm": 6.581261097929349, + "learning_rate": 9.443494283432558e-06, + "loss": 1.1787, + "step": 12590 + }, + { + "epoch": 0.71, + "grad_norm": 6.616809454658205, + "learning_rate": 9.442742571838877e-06, + "loss": 1.0425, + "step": 12595 + }, + { + "epoch": 0.71, + "grad_norm": 16.261423757744502, + "learning_rate": 9.441990382852898e-06, + "loss": 1.1253, + "step": 12600 + }, + { + "epoch": 0.71, + "grad_norm": 10.263933207187424, + "learning_rate": 9.44123771655545e-06, + "loss": 1.1094, + "step": 12605 + }, + { + "epoch": 0.71, + "grad_norm": 9.228412573657085, + "learning_rate": 9.440484573027406e-06, + "loss": 1.1011, + "step": 12610 + }, + { + "epoch": 0.71, + "grad_norm": 11.28916239685707, + "learning_rate": 9.4397309523497e-06, + "loss": 1.0897, + "step": 12615 + }, + { + "epoch": 0.71, + "grad_norm": 31.018750337265722, + "learning_rate": 9.438976854603308e-06, + "loss": 1.1461, + "step": 12620 + }, + { + "epoch": 0.71, + "grad_norm": 26.742957868697705, + "learning_rate": 9.438222279869263e-06, + "loss": 1.1322, + "step": 12625 + }, + { + "epoch": 0.71, + "grad_norm": 35.31543257261958, + "learning_rate": 9.437467228228647e-06, + "loss": 1.1515, + "step": 12630 + }, + { + "epoch": 0.71, + "grad_norm": 12.414277899110981, + "learning_rate": 9.436711699762592e-06, + "loss": 1.1076, + "step": 12635 + }, + { + "epoch": 0.71, + "grad_norm": 13.928930974362663, + "learning_rate": 9.435955694552284e-06, + "loss": 1.1389, + "step": 12640 + }, + { + "epoch": 0.71, + "grad_norm": 4.980139924511439, + "learning_rate": 9.435199212678961e-06, + "loss": 1.0712, + "step": 12645 + }, + { + "epoch": 0.71, + "grad_norm": 7.085234525025728, + "learning_rate": 9.434442254223907e-06, + "loss": 1.0766, + "step": 12650 + }, + { + "epoch": 0.71, + "grad_norm": 7.039348080058898, + "learning_rate": 9.433684819268463e-06, + "loss": 1.1415, + "step": 12655 + }, + { + "epoch": 0.71, + "grad_norm": 12.361505124190733, + "learning_rate": 9.432926907894018e-06, + "loss": 1.1213, + "step": 12660 + }, + { + "epoch": 0.71, + "grad_norm": 10.373513634400794, + "learning_rate": 9.432168520182012e-06, + "loss": 1.1114, + "step": 12665 + }, + { + "epoch": 0.71, + "grad_norm": 9.208057103046562, + "learning_rate": 9.431409656213938e-06, + "loss": 1.1266, + "step": 12670 + }, + { + "epoch": 0.71, + "grad_norm": 11.63737716382715, + "learning_rate": 9.43065031607134e-06, + "loss": 1.1667, + "step": 12675 + }, + { + "epoch": 0.71, + "grad_norm": 6.35561817323881, + "learning_rate": 9.42989049983581e-06, + "loss": 1.1265, + "step": 12680 + }, + { + "epoch": 0.71, + "grad_norm": 5.766013721858562, + "learning_rate": 9.429130207588996e-06, + "loss": 1.1129, + "step": 12685 + }, + { + "epoch": 0.72, + "grad_norm": 17.69039586276047, + "learning_rate": 9.428369439412593e-06, + "loss": 1.1336, + "step": 12690 + }, + { + "epoch": 0.72, + "grad_norm": 5.336678591045394, + "learning_rate": 9.42760819538835e-06, + "loss": 1.1616, + "step": 12695 + }, + { + "epoch": 0.72, + "grad_norm": 10.158976172458836, + "learning_rate": 9.426846475598068e-06, + "loss": 1.0746, + "step": 12700 + }, + { + "epoch": 0.72, + "grad_norm": 15.24143874107018, + "learning_rate": 9.426084280123593e-06, + "loss": 1.1158, + "step": 12705 + }, + { + "epoch": 0.72, + "grad_norm": 7.022196927358343, + "learning_rate": 9.425321609046828e-06, + "loss": 1.125, + "step": 12710 + }, + { + "epoch": 0.72, + "grad_norm": 16.518529232070218, + "learning_rate": 9.42455846244973e-06, + "loss": 1.1057, + "step": 12715 + }, + { + "epoch": 0.72, + "grad_norm": 17.04460834141142, + "learning_rate": 9.423794840414293e-06, + "loss": 1.1004, + "step": 12720 + }, + { + "epoch": 0.72, + "grad_norm": 7.614250069444788, + "learning_rate": 9.423030743022578e-06, + "loss": 1.1326, + "step": 12725 + }, + { + "epoch": 0.72, + "grad_norm": 8.289685085298933, + "learning_rate": 9.422266170356691e-06, + "loss": 1.1302, + "step": 12730 + }, + { + "epoch": 0.72, + "grad_norm": 7.474578407116928, + "learning_rate": 9.421501122498786e-06, + "loss": 1.1302, + "step": 12735 + }, + { + "epoch": 0.72, + "grad_norm": 8.38616676280056, + "learning_rate": 9.420735599531072e-06, + "loss": 1.0935, + "step": 12740 + }, + { + "epoch": 0.72, + "grad_norm": 18.511450693752472, + "learning_rate": 9.419969601535807e-06, + "loss": 1.182, + "step": 12745 + }, + { + "epoch": 0.72, + "grad_norm": 19.66843139983669, + "learning_rate": 9.419203128595303e-06, + "loss": 1.1721, + "step": 12750 + }, + { + "epoch": 0.72, + "grad_norm": 12.951035396013895, + "learning_rate": 9.418436180791916e-06, + "loss": 1.0998, + "step": 12755 + }, + { + "epoch": 0.72, + "grad_norm": 42.28858037071631, + "learning_rate": 9.417668758208065e-06, + "loss": 1.1737, + "step": 12760 + }, + { + "epoch": 0.72, + "grad_norm": 8.417100947954943, + "learning_rate": 9.416900860926208e-06, + "loss": 1.0882, + "step": 12765 + }, + { + "epoch": 0.72, + "grad_norm": 6.57613525666104, + "learning_rate": 9.41613248902886e-06, + "loss": 1.0914, + "step": 12770 + }, + { + "epoch": 0.72, + "grad_norm": 8.205950045129088, + "learning_rate": 9.415363642598584e-06, + "loss": 1.1423, + "step": 12775 + }, + { + "epoch": 0.72, + "grad_norm": 41.362035494699974, + "learning_rate": 9.414594321718002e-06, + "loss": 1.0901, + "step": 12780 + }, + { + "epoch": 0.72, + "grad_norm": 39.278881813521956, + "learning_rate": 9.413824526469776e-06, + "loss": 1.1721, + "step": 12785 + }, + { + "epoch": 0.72, + "grad_norm": 25.838193458336175, + "learning_rate": 9.413054256936625e-06, + "loss": 1.1703, + "step": 12790 + }, + { + "epoch": 0.72, + "grad_norm": 9.044351025135112, + "learning_rate": 9.412283513201316e-06, + "loss": 1.1498, + "step": 12795 + }, + { + "epoch": 0.72, + "grad_norm": 30.754042227083023, + "learning_rate": 9.411512295346673e-06, + "loss": 1.1617, + "step": 12800 + }, + { + "epoch": 0.72, + "grad_norm": 33.277278107276686, + "learning_rate": 9.410740603455561e-06, + "loss": 1.1287, + "step": 12805 + }, + { + "epoch": 0.72, + "grad_norm": 15.448915195955019, + "learning_rate": 9.409968437610908e-06, + "loss": 1.106, + "step": 12810 + }, + { + "epoch": 0.72, + "grad_norm": 8.749745902002564, + "learning_rate": 9.409195797895683e-06, + "loss": 1.1242, + "step": 12815 + }, + { + "epoch": 0.72, + "grad_norm": 6.928985641085489, + "learning_rate": 9.40842268439291e-06, + "loss": 1.1539, + "step": 12820 + }, + { + "epoch": 0.72, + "grad_norm": 16.05039753422052, + "learning_rate": 9.407649097185665e-06, + "loss": 1.1024, + "step": 12825 + }, + { + "epoch": 0.72, + "grad_norm": 6.689759133065871, + "learning_rate": 9.406875036357071e-06, + "loss": 1.1873, + "step": 12830 + }, + { + "epoch": 0.72, + "grad_norm": 5.944921197427781, + "learning_rate": 9.406100501990305e-06, + "loss": 1.1143, + "step": 12835 + }, + { + "epoch": 0.72, + "grad_norm": 7.128427185251159, + "learning_rate": 9.405325494168594e-06, + "loss": 1.1226, + "step": 12840 + }, + { + "epoch": 0.72, + "grad_norm": 16.106836168194338, + "learning_rate": 9.404550012975218e-06, + "loss": 1.1855, + "step": 12845 + }, + { + "epoch": 0.72, + "grad_norm": 13.895518718252985, + "learning_rate": 9.403774058493502e-06, + "loss": 1.0902, + "step": 12850 + }, + { + "epoch": 0.72, + "grad_norm": 6.105264985860612, + "learning_rate": 9.40299763080683e-06, + "loss": 1.1333, + "step": 12855 + }, + { + "epoch": 0.72, + "grad_norm": 7.406074166906579, + "learning_rate": 9.402220729998631e-06, + "loss": 1.1542, + "step": 12860 + }, + { + "epoch": 0.72, + "grad_norm": 9.42796449588289, + "learning_rate": 9.401443356152384e-06, + "loss": 1.0913, + "step": 12865 + }, + { + "epoch": 0.73, + "grad_norm": 29.87094411953661, + "learning_rate": 9.400665509351627e-06, + "loss": 1.1323, + "step": 12870 + }, + { + "epoch": 0.73, + "grad_norm": 16.188931817073026, + "learning_rate": 9.399887189679936e-06, + "loss": 1.1057, + "step": 12875 + }, + { + "epoch": 0.73, + "grad_norm": 12.538725457071587, + "learning_rate": 9.399108397220949e-06, + "loss": 1.0832, + "step": 12880 + }, + { + "epoch": 0.73, + "grad_norm": 17.01460622218668, + "learning_rate": 9.398329132058351e-06, + "loss": 1.1324, + "step": 12885 + }, + { + "epoch": 0.73, + "grad_norm": 7.268106924225547, + "learning_rate": 9.397549394275875e-06, + "loss": 1.1151, + "step": 12890 + }, + { + "epoch": 0.73, + "grad_norm": 10.245982486740813, + "learning_rate": 9.39676918395731e-06, + "loss": 1.1472, + "step": 12895 + }, + { + "epoch": 0.73, + "grad_norm": 11.940761226310704, + "learning_rate": 9.395988501186491e-06, + "loss": 1.1105, + "step": 12900 + }, + { + "epoch": 0.73, + "grad_norm": 5.575413387910367, + "learning_rate": 9.395207346047308e-06, + "loss": 1.1148, + "step": 12905 + }, + { + "epoch": 0.73, + "grad_norm": 5.495914888733515, + "learning_rate": 9.394425718623697e-06, + "loss": 1.1721, + "step": 12910 + }, + { + "epoch": 0.73, + "grad_norm": 5.274415686603875, + "learning_rate": 9.393643618999649e-06, + "loss": 1.0729, + "step": 12915 + }, + { + "epoch": 0.73, + "grad_norm": 10.731290248345159, + "learning_rate": 9.392861047259202e-06, + "loss": 1.1506, + "step": 12920 + }, + { + "epoch": 0.73, + "grad_norm": 18.641337298477747, + "learning_rate": 9.39207800348645e-06, + "loss": 1.1063, + "step": 12925 + }, + { + "epoch": 0.73, + "grad_norm": 9.427365041800352, + "learning_rate": 9.391294487765532e-06, + "loss": 1.1098, + "step": 12930 + }, + { + "epoch": 0.73, + "grad_norm": 7.378995705528802, + "learning_rate": 9.390510500180641e-06, + "loss": 1.0928, + "step": 12935 + }, + { + "epoch": 0.73, + "grad_norm": 8.170561431325469, + "learning_rate": 9.389726040816018e-06, + "loss": 1.1684, + "step": 12940 + }, + { + "epoch": 0.73, + "grad_norm": 12.561798633145985, + "learning_rate": 9.388941109755962e-06, + "loss": 1.1274, + "step": 12945 + }, + { + "epoch": 0.73, + "grad_norm": 8.0767443314504, + "learning_rate": 9.388155707084811e-06, + "loss": 1.1512, + "step": 12950 + }, + { + "epoch": 0.73, + "grad_norm": 7.904459162577236, + "learning_rate": 9.387369832886966e-06, + "loss": 1.128, + "step": 12955 + }, + { + "epoch": 0.73, + "grad_norm": 6.6162427506300805, + "learning_rate": 9.386583487246868e-06, + "loss": 1.1184, + "step": 12960 + }, + { + "epoch": 0.73, + "grad_norm": 8.286757855866375, + "learning_rate": 9.385796670249014e-06, + "loss": 1.1162, + "step": 12965 + }, + { + "epoch": 0.73, + "grad_norm": 9.1933303579712, + "learning_rate": 9.385009381977953e-06, + "loss": 1.1486, + "step": 12970 + }, + { + "epoch": 0.73, + "grad_norm": 18.081005075327386, + "learning_rate": 9.38422162251828e-06, + "loss": 1.1298, + "step": 12975 + }, + { + "epoch": 0.73, + "grad_norm": 7.3677400467950065, + "learning_rate": 9.383433391954646e-06, + "loss": 1.0608, + "step": 12980 + }, + { + "epoch": 0.73, + "grad_norm": 7.733173453317965, + "learning_rate": 9.382644690371748e-06, + "loss": 1.1192, + "step": 12985 + }, + { + "epoch": 0.73, + "grad_norm": 6.855145146612309, + "learning_rate": 9.381855517854337e-06, + "loss": 1.1356, + "step": 12990 + }, + { + "epoch": 0.73, + "grad_norm": 13.39183657773669, + "learning_rate": 9.38106587448721e-06, + "loss": 1.113, + "step": 12995 + }, + { + "epoch": 0.73, + "grad_norm": 11.329719563088435, + "learning_rate": 9.380275760355221e-06, + "loss": 1.1151, + "step": 13000 + }, + { + "epoch": 0.73, + "grad_norm": 6.092401493419849, + "learning_rate": 9.37948517554327e-06, + "loss": 1.0717, + "step": 13005 + }, + { + "epoch": 0.73, + "grad_norm": 14.658258926016858, + "learning_rate": 9.378694120136307e-06, + "loss": 1.1207, + "step": 13010 + }, + { + "epoch": 0.73, + "grad_norm": 33.77583955578754, + "learning_rate": 9.377902594219338e-06, + "loss": 1.1206, + "step": 13015 + }, + { + "epoch": 0.73, + "grad_norm": 10.825310853171759, + "learning_rate": 9.377110597877412e-06, + "loss": 1.153, + "step": 13020 + }, + { + "epoch": 0.73, + "grad_norm": 24.86660956764498, + "learning_rate": 9.376318131195637e-06, + "loss": 1.1212, + "step": 13025 + }, + { + "epoch": 0.73, + "grad_norm": 13.584119360644792, + "learning_rate": 9.375525194259163e-06, + "loss": 1.098, + "step": 13030 + }, + { + "epoch": 0.73, + "grad_norm": 36.207602275449005, + "learning_rate": 9.374731787153197e-06, + "loss": 1.119, + "step": 13035 + }, + { + "epoch": 0.73, + "grad_norm": 26.33804591732181, + "learning_rate": 9.373937909962994e-06, + "loss": 1.1115, + "step": 13040 + }, + { + "epoch": 0.74, + "grad_norm": 11.184503587946828, + "learning_rate": 9.373143562773858e-06, + "loss": 1.1246, + "step": 13045 + }, + { + "epoch": 0.74, + "grad_norm": 9.241634386912347, + "learning_rate": 9.372348745671147e-06, + "loss": 1.1627, + "step": 13050 + }, + { + "epoch": 0.74, + "grad_norm": 8.767982749391692, + "learning_rate": 9.371553458740266e-06, + "loss": 1.1461, + "step": 13055 + }, + { + "epoch": 0.74, + "grad_norm": 19.61474824336374, + "learning_rate": 9.370757702066671e-06, + "loss": 1.1405, + "step": 13060 + }, + { + "epoch": 0.74, + "grad_norm": 22.444813504062605, + "learning_rate": 9.369961475735874e-06, + "loss": 1.0994, + "step": 13065 + }, + { + "epoch": 0.74, + "grad_norm": 25.129565243638375, + "learning_rate": 9.369164779833429e-06, + "loss": 1.1301, + "step": 13070 + }, + { + "epoch": 0.74, + "grad_norm": 22.436779647983265, + "learning_rate": 9.368367614444947e-06, + "loss": 1.155, + "step": 13075 + }, + { + "epoch": 0.74, + "grad_norm": 5.435391066135868, + "learning_rate": 9.367569979656084e-06, + "loss": 1.0868, + "step": 13080 + }, + { + "epoch": 0.74, + "grad_norm": 9.260614251676454, + "learning_rate": 9.366771875552552e-06, + "loss": 1.1021, + "step": 13085 + }, + { + "epoch": 0.74, + "grad_norm": 10.837475874790362, + "learning_rate": 9.365973302220108e-06, + "loss": 1.1082, + "step": 13090 + }, + { + "epoch": 0.74, + "grad_norm": 8.314431233857963, + "learning_rate": 9.365174259744568e-06, + "loss": 1.0692, + "step": 13095 + }, + { + "epoch": 0.74, + "grad_norm": 6.575194003628091, + "learning_rate": 9.364374748211787e-06, + "loss": 1.1054, + "step": 13100 + }, + { + "epoch": 0.74, + "grad_norm": 8.383191716857047, + "learning_rate": 9.363574767707677e-06, + "loss": 1.1237, + "step": 13105 + }, + { + "epoch": 0.74, + "grad_norm": 5.693730546301416, + "learning_rate": 9.3627743183182e-06, + "loss": 1.1613, + "step": 13110 + }, + { + "epoch": 0.74, + "grad_norm": 5.90265164994131, + "learning_rate": 9.36197340012937e-06, + "loss": 1.1803, + "step": 13115 + }, + { + "epoch": 0.74, + "grad_norm": 5.667282459197571, + "learning_rate": 9.361172013227246e-06, + "loss": 1.1383, + "step": 13120 + }, + { + "epoch": 0.74, + "grad_norm": 5.654847477922495, + "learning_rate": 9.360370157697942e-06, + "loss": 1.1111, + "step": 13125 + }, + { + "epoch": 0.74, + "grad_norm": 5.672817964407618, + "learning_rate": 9.359567833627622e-06, + "loss": 1.2001, + "step": 13130 + }, + { + "epoch": 0.74, + "grad_norm": 8.34903538280646, + "learning_rate": 9.358765041102496e-06, + "loss": 1.0858, + "step": 13135 + }, + { + "epoch": 0.74, + "grad_norm": 11.155886734362207, + "learning_rate": 9.35796178020883e-06, + "loss": 1.1416, + "step": 13140 + }, + { + "epoch": 0.74, + "grad_norm": 8.623987602243092, + "learning_rate": 9.357158051032938e-06, + "loss": 1.1261, + "step": 13145 + }, + { + "epoch": 0.74, + "grad_norm": 5.447376695700821, + "learning_rate": 9.356353853661184e-06, + "loss": 1.0647, + "step": 13150 + }, + { + "epoch": 0.74, + "grad_norm": 5.756070117139443, + "learning_rate": 9.355549188179982e-06, + "loss": 1.1354, + "step": 13155 + }, + { + "epoch": 0.74, + "grad_norm": 9.57267547815171, + "learning_rate": 9.354744054675798e-06, + "loss": 1.1301, + "step": 13160 + }, + { + "epoch": 0.74, + "grad_norm": 9.313112073726026, + "learning_rate": 9.353938453235146e-06, + "loss": 1.074, + "step": 13165 + }, + { + "epoch": 0.74, + "grad_norm": 8.403610122509042, + "learning_rate": 9.353132383944591e-06, + "loss": 1.0868, + "step": 13170 + }, + { + "epoch": 0.74, + "grad_norm": 8.199064665877822, + "learning_rate": 9.352325846890751e-06, + "loss": 1.1155, + "step": 13175 + }, + { + "epoch": 0.74, + "grad_norm": 9.66609846236514, + "learning_rate": 9.35151884216029e-06, + "loss": 1.093, + "step": 13180 + }, + { + "epoch": 0.74, + "grad_norm": 7.704564594035563, + "learning_rate": 9.350711369839925e-06, + "loss": 1.1022, + "step": 13185 + }, + { + "epoch": 0.74, + "grad_norm": 15.32147343235014, + "learning_rate": 9.34990343001642e-06, + "loss": 1.091, + "step": 13190 + }, + { + "epoch": 0.74, + "grad_norm": 6.5295983347581, + "learning_rate": 9.349095022776595e-06, + "loss": 1.1109, + "step": 13195 + }, + { + "epoch": 0.74, + "grad_norm": 7.339973559241184, + "learning_rate": 9.348286148207316e-06, + "loss": 1.0998, + "step": 13200 + }, + { + "epoch": 0.74, + "grad_norm": 9.329558581848236, + "learning_rate": 9.3474768063955e-06, + "loss": 1.0831, + "step": 13205 + }, + { + "epoch": 0.74, + "grad_norm": 11.166574014716085, + "learning_rate": 9.346666997428116e-06, + "loss": 1.138, + "step": 13210 + }, + { + "epoch": 0.74, + "grad_norm": 12.372868758085875, + "learning_rate": 9.345856721392179e-06, + "loss": 1.1125, + "step": 13215 + }, + { + "epoch": 0.74, + "grad_norm": 8.684416253674428, + "learning_rate": 9.345045978374756e-06, + "loss": 1.1334, + "step": 13220 + }, + { + "epoch": 0.75, + "grad_norm": 32.350750811462724, + "learning_rate": 9.344234768462968e-06, + "loss": 1.0502, + "step": 13225 + }, + { + "epoch": 0.75, + "grad_norm": 43.16607866911975, + "learning_rate": 9.343423091743979e-06, + "loss": 1.1193, + "step": 13230 + }, + { + "epoch": 0.75, + "grad_norm": 17.913472380848074, + "learning_rate": 9.342610948305013e-06, + "loss": 1.1577, + "step": 13235 + }, + { + "epoch": 0.75, + "grad_norm": 11.6992172035777, + "learning_rate": 9.341798338233333e-06, + "loss": 1.0966, + "step": 13240 + }, + { + "epoch": 0.75, + "grad_norm": 11.881311632675496, + "learning_rate": 9.34098526161626e-06, + "loss": 1.0964, + "step": 13245 + }, + { + "epoch": 0.75, + "grad_norm": 7.650245010063143, + "learning_rate": 9.340171718541163e-06, + "loss": 1.1569, + "step": 13250 + }, + { + "epoch": 0.75, + "grad_norm": 25.103789602155135, + "learning_rate": 9.33935770909546e-06, + "loss": 1.1086, + "step": 13255 + }, + { + "epoch": 0.75, + "grad_norm": 28.328221175906137, + "learning_rate": 9.338543233366621e-06, + "loss": 1.1614, + "step": 13260 + }, + { + "epoch": 0.75, + "grad_norm": 6.4071256350902, + "learning_rate": 9.337728291442164e-06, + "loss": 1.0926, + "step": 13265 + }, + { + "epoch": 0.75, + "grad_norm": 26.494290703300035, + "learning_rate": 9.336912883409656e-06, + "loss": 1.1193, + "step": 13270 + }, + { + "epoch": 0.75, + "grad_norm": 47.0797278162397, + "learning_rate": 9.336097009356721e-06, + "loss": 1.1742, + "step": 13275 + }, + { + "epoch": 0.75, + "grad_norm": 30.657108277570504, + "learning_rate": 9.335280669371024e-06, + "loss": 1.1142, + "step": 13280 + }, + { + "epoch": 0.75, + "grad_norm": 29.358195727306434, + "learning_rate": 9.334463863540288e-06, + "loss": 1.1423, + "step": 13285 + }, + { + "epoch": 0.75, + "grad_norm": 18.693673650046073, + "learning_rate": 9.33364659195228e-06, + "loss": 1.1196, + "step": 13290 + }, + { + "epoch": 0.75, + "grad_norm": 20.287072127314556, + "learning_rate": 9.332828854694817e-06, + "loss": 1.1281, + "step": 13295 + }, + { + "epoch": 0.75, + "grad_norm": 40.28895374694088, + "learning_rate": 9.332010651855774e-06, + "loss": 1.103, + "step": 13300 + }, + { + "epoch": 0.75, + "grad_norm": 15.866896750845198, + "learning_rate": 9.331191983523067e-06, + "loss": 1.1423, + "step": 13305 + }, + { + "epoch": 0.75, + "grad_norm": 13.218007796622238, + "learning_rate": 9.330372849784667e-06, + "loss": 1.1288, + "step": 13310 + }, + { + "epoch": 0.75, + "grad_norm": 30.68812414437333, + "learning_rate": 9.329553250728592e-06, + "loss": 1.1598, + "step": 13315 + }, + { + "epoch": 0.75, + "grad_norm": 23.147217760840743, + "learning_rate": 9.328733186442914e-06, + "loss": 1.1207, + "step": 13320 + }, + { + "epoch": 0.75, + "grad_norm": 13.447397617180021, + "learning_rate": 9.32791265701575e-06, + "loss": 1.1665, + "step": 13325 + }, + { + "epoch": 0.75, + "grad_norm": 21.35296338325948, + "learning_rate": 9.32709166253527e-06, + "loss": 1.1476, + "step": 13330 + }, + { + "epoch": 0.75, + "grad_norm": 5.52903095366896, + "learning_rate": 9.326270203089694e-06, + "loss": 1.118, + "step": 13335 + }, + { + "epoch": 0.75, + "grad_norm": 16.953362206083295, + "learning_rate": 9.325448278767291e-06, + "loss": 1.1535, + "step": 13340 + }, + { + "epoch": 0.75, + "grad_norm": 7.077985539100943, + "learning_rate": 9.324625889656383e-06, + "loss": 1.1524, + "step": 13345 + }, + { + "epoch": 0.75, + "grad_norm": 21.737339439347682, + "learning_rate": 9.323803035845333e-06, + "loss": 1.0792, + "step": 13350 + }, + { + "epoch": 0.75, + "grad_norm": 12.055998988920857, + "learning_rate": 9.322979717422569e-06, + "loss": 1.1441, + "step": 13355 + }, + { + "epoch": 0.75, + "grad_norm": 5.448433146731257, + "learning_rate": 9.322155934476553e-06, + "loss": 1.1197, + "step": 13360 + }, + { + "epoch": 0.75, + "grad_norm": 6.665088564842439, + "learning_rate": 9.321331687095806e-06, + "loss": 1.066, + "step": 13365 + }, + { + "epoch": 0.75, + "grad_norm": 14.559809008757659, + "learning_rate": 9.320506975368901e-06, + "loss": 1.1375, + "step": 13370 + }, + { + "epoch": 0.75, + "grad_norm": 23.296918360592887, + "learning_rate": 9.31968179938445e-06, + "loss": 1.1388, + "step": 13375 + }, + { + "epoch": 0.75, + "grad_norm": 8.334157795281683, + "learning_rate": 9.318856159231128e-06, + "loss": 1.155, + "step": 13380 + }, + { + "epoch": 0.75, + "grad_norm": 17.041357722066866, + "learning_rate": 9.31803005499765e-06, + "loss": 1.1328, + "step": 13385 + }, + { + "epoch": 0.75, + "grad_norm": 16.144806421591245, + "learning_rate": 9.317203486772785e-06, + "loss": 1.0926, + "step": 13390 + }, + { + "epoch": 0.75, + "grad_norm": 11.373692989715362, + "learning_rate": 9.316376454645355e-06, + "loss": 1.1577, + "step": 13395 + }, + { + "epoch": 0.76, + "grad_norm": 14.656030314059016, + "learning_rate": 9.315548958704223e-06, + "loss": 1.0983, + "step": 13400 + }, + { + "epoch": 0.76, + "grad_norm": 14.320948089972093, + "learning_rate": 9.31472099903831e-06, + "loss": 1.1007, + "step": 13405 + }, + { + "epoch": 0.76, + "grad_norm": 11.862662990973707, + "learning_rate": 9.313892575736582e-06, + "loss": 1.1155, + "step": 13410 + }, + { + "epoch": 0.76, + "grad_norm": 13.643863414437558, + "learning_rate": 9.31306368888806e-06, + "loss": 1.1028, + "step": 13415 + }, + { + "epoch": 0.76, + "grad_norm": 13.911997035071135, + "learning_rate": 9.312234338581809e-06, + "loss": 1.0794, + "step": 13420 + }, + { + "epoch": 0.76, + "grad_norm": 5.640759175515936, + "learning_rate": 9.311404524906946e-06, + "loss": 1.1046, + "step": 13425 + }, + { + "epoch": 0.76, + "grad_norm": 17.267081615979173, + "learning_rate": 9.31057424795264e-06, + "loss": 1.1123, + "step": 13430 + }, + { + "epoch": 0.76, + "grad_norm": 9.893566893941383, + "learning_rate": 9.309743507808106e-06, + "loss": 1.1426, + "step": 13435 + }, + { + "epoch": 0.76, + "grad_norm": 6.313233252311439, + "learning_rate": 9.308912304562613e-06, + "loss": 1.1425, + "step": 13440 + }, + { + "epoch": 0.76, + "grad_norm": 10.09712988402112, + "learning_rate": 9.308080638305478e-06, + "loss": 1.1283, + "step": 13445 + }, + { + "epoch": 0.76, + "grad_norm": 5.443032462770006, + "learning_rate": 9.30724850912606e-06, + "loss": 1.136, + "step": 13450 + }, + { + "epoch": 0.76, + "grad_norm": 5.565472317043675, + "learning_rate": 9.306415917113784e-06, + "loss": 1.1052, + "step": 13455 + }, + { + "epoch": 0.76, + "grad_norm": 13.141922557312448, + "learning_rate": 9.305582862358113e-06, + "loss": 1.1289, + "step": 13460 + }, + { + "epoch": 0.76, + "grad_norm": 5.73748140898653, + "learning_rate": 9.304749344948559e-06, + "loss": 1.1245, + "step": 13465 + }, + { + "epoch": 0.76, + "grad_norm": 6.13130494238641, + "learning_rate": 9.303915364974691e-06, + "loss": 1.1061, + "step": 13470 + }, + { + "epoch": 0.76, + "grad_norm": 5.860927237913024, + "learning_rate": 9.303080922526122e-06, + "loss": 1.127, + "step": 13475 + }, + { + "epoch": 0.76, + "grad_norm": 7.545780182973548, + "learning_rate": 9.302246017692517e-06, + "loss": 1.0708, + "step": 13480 + }, + { + "epoch": 0.76, + "grad_norm": 8.800562981262573, + "learning_rate": 9.30141065056359e-06, + "loss": 1.1194, + "step": 13485 + }, + { + "epoch": 0.76, + "grad_norm": 5.6279791804018515, + "learning_rate": 9.300574821229106e-06, + "loss": 1.124, + "step": 13490 + }, + { + "epoch": 0.76, + "grad_norm": 8.023758701531525, + "learning_rate": 9.299738529778877e-06, + "loss": 1.1083, + "step": 13495 + }, + { + "epoch": 0.76, + "grad_norm": 7.248043305272939, + "learning_rate": 9.298901776302765e-06, + "loss": 1.1152, + "step": 13500 + }, + { + "epoch": 0.76, + "grad_norm": 7.22039750648539, + "learning_rate": 9.298064560890687e-06, + "loss": 1.1321, + "step": 13505 + }, + { + "epoch": 0.76, + "grad_norm": 7.509992613535459, + "learning_rate": 9.297226883632601e-06, + "loss": 1.1747, + "step": 13510 + }, + { + "epoch": 0.76, + "grad_norm": 7.280557450220159, + "learning_rate": 9.296388744618524e-06, + "loss": 1.1179, + "step": 13515 + }, + { + "epoch": 0.76, + "grad_norm": 12.72380571235983, + "learning_rate": 9.295550143938514e-06, + "loss": 1.1317, + "step": 13520 + }, + { + "epoch": 0.76, + "grad_norm": 6.9820958892304, + "learning_rate": 9.294711081682683e-06, + "loss": 1.1744, + "step": 13525 + }, + { + "epoch": 0.76, + "grad_norm": 6.3255339936694694, + "learning_rate": 9.293871557941192e-06, + "loss": 1.1813, + "step": 13530 + }, + { + "epoch": 0.76, + "grad_norm": 10.08886677511008, + "learning_rate": 9.293031572804252e-06, + "loss": 1.0773, + "step": 13535 + }, + { + "epoch": 0.76, + "grad_norm": 19.925705538683506, + "learning_rate": 9.292191126362123e-06, + "loss": 1.1472, + "step": 13540 + }, + { + "epoch": 0.76, + "grad_norm": 7.745251977035863, + "learning_rate": 9.291350218705114e-06, + "loss": 1.1422, + "step": 13545 + }, + { + "epoch": 0.76, + "grad_norm": 34.30816442876599, + "learning_rate": 9.290508849923585e-06, + "loss": 1.1174, + "step": 13550 + }, + { + "epoch": 0.76, + "grad_norm": 7.887642679729329, + "learning_rate": 9.289667020107944e-06, + "loss": 1.1387, + "step": 13555 + }, + { + "epoch": 0.76, + "grad_norm": 11.320860936183017, + "learning_rate": 9.288824729348652e-06, + "loss": 1.1281, + "step": 13560 + }, + { + "epoch": 0.76, + "grad_norm": 11.581776526817508, + "learning_rate": 9.287981977736214e-06, + "loss": 1.1098, + "step": 13565 + }, + { + "epoch": 0.76, + "grad_norm": 7.520306085948311, + "learning_rate": 9.287138765361187e-06, + "loss": 1.1389, + "step": 13570 + }, + { + "epoch": 0.76, + "grad_norm": 8.437870663057927, + "learning_rate": 9.28629509231418e-06, + "loss": 1.1511, + "step": 13575 + }, + { + "epoch": 0.77, + "grad_norm": 8.003099252983906, + "learning_rate": 9.285450958685848e-06, + "loss": 1.1098, + "step": 13580 + }, + { + "epoch": 0.77, + "grad_norm": 10.05737331972076, + "learning_rate": 9.284606364566898e-06, + "loss": 1.1591, + "step": 13585 + }, + { + "epoch": 0.77, + "grad_norm": 13.608738856247651, + "learning_rate": 9.283761310048083e-06, + "loss": 1.124, + "step": 13590 + }, + { + "epoch": 0.77, + "grad_norm": 12.875720873844369, + "learning_rate": 9.282915795220213e-06, + "loss": 1.146, + "step": 13595 + }, + { + "epoch": 0.77, + "grad_norm": 7.6993046757266335, + "learning_rate": 9.282069820174136e-06, + "loss": 1.1302, + "step": 13600 + }, + { + "epoch": 0.77, + "grad_norm": 10.337634709434207, + "learning_rate": 9.281223385000759e-06, + "loss": 1.1196, + "step": 13605 + }, + { + "epoch": 0.77, + "grad_norm": 6.875538727178148, + "learning_rate": 9.280376489791035e-06, + "loss": 1.1057, + "step": 13610 + }, + { + "epoch": 0.77, + "grad_norm": 12.116674798519531, + "learning_rate": 9.279529134635967e-06, + "loss": 1.1533, + "step": 13615 + }, + { + "epoch": 0.77, + "grad_norm": 9.46769866569297, + "learning_rate": 9.278681319626606e-06, + "loss": 1.1566, + "step": 13620 + }, + { + "epoch": 0.77, + "grad_norm": 6.477793272182669, + "learning_rate": 9.277833044854054e-06, + "loss": 1.1103, + "step": 13625 + }, + { + "epoch": 0.77, + "grad_norm": 6.816367511849197, + "learning_rate": 9.276984310409463e-06, + "loss": 1.094, + "step": 13630 + }, + { + "epoch": 0.77, + "grad_norm": 7.254042338449794, + "learning_rate": 9.276135116384031e-06, + "loss": 1.1292, + "step": 13635 + }, + { + "epoch": 0.77, + "grad_norm": 13.687436225032867, + "learning_rate": 9.275285462869007e-06, + "loss": 1.1342, + "step": 13640 + }, + { + "epoch": 0.77, + "grad_norm": 10.561032557100821, + "learning_rate": 9.274435349955694e-06, + "loss": 1.1221, + "step": 13645 + }, + { + "epoch": 0.77, + "grad_norm": 12.062241926944752, + "learning_rate": 9.273584777735439e-06, + "loss": 1.1429, + "step": 13650 + }, + { + "epoch": 0.77, + "grad_norm": 6.078878765385249, + "learning_rate": 9.272733746299636e-06, + "loss": 1.1171, + "step": 13655 + }, + { + "epoch": 0.77, + "grad_norm": 5.379962126254871, + "learning_rate": 9.271882255739737e-06, + "loss": 1.1258, + "step": 13660 + }, + { + "epoch": 0.77, + "grad_norm": 8.731563499094715, + "learning_rate": 9.271030306147235e-06, + "loss": 1.1159, + "step": 13665 + }, + { + "epoch": 0.77, + "grad_norm": 7.3783218277511065, + "learning_rate": 9.270177897613677e-06, + "loss": 1.0826, + "step": 13670 + }, + { + "epoch": 0.77, + "grad_norm": 5.502104811157233, + "learning_rate": 9.26932503023066e-06, + "loss": 1.1116, + "step": 13675 + }, + { + "epoch": 0.77, + "grad_norm": 23.797859875034057, + "learning_rate": 9.268471704089825e-06, + "loss": 1.1085, + "step": 13680 + }, + { + "epoch": 0.77, + "grad_norm": 15.413733104854044, + "learning_rate": 9.267617919282868e-06, + "loss": 1.1752, + "step": 13685 + }, + { + "epoch": 0.77, + "grad_norm": 27.47484187655042, + "learning_rate": 9.26676367590153e-06, + "loss": 1.1258, + "step": 13690 + }, + { + "epoch": 0.77, + "grad_norm": 20.007395662097164, + "learning_rate": 9.265908974037607e-06, + "loss": 1.1228, + "step": 13695 + }, + { + "epoch": 0.77, + "grad_norm": 6.950419800963811, + "learning_rate": 9.265053813782937e-06, + "loss": 1.1377, + "step": 13700 + }, + { + "epoch": 0.77, + "grad_norm": 6.659025159131205, + "learning_rate": 9.264198195229408e-06, + "loss": 1.0952, + "step": 13705 + }, + { + "epoch": 0.77, + "grad_norm": 15.445367188085125, + "learning_rate": 9.263342118468967e-06, + "loss": 1.1694, + "step": 13710 + }, + { + "epoch": 0.77, + "grad_norm": 10.967010541997837, + "learning_rate": 9.262485583593599e-06, + "loss": 1.1389, + "step": 13715 + }, + { + "epoch": 0.77, + "grad_norm": 19.276295985618987, + "learning_rate": 9.261628590695344e-06, + "loss": 1.1292, + "step": 13720 + }, + { + "epoch": 0.77, + "grad_norm": 8.718851162447113, + "learning_rate": 9.260771139866287e-06, + "loss": 1.1319, + "step": 13725 + }, + { + "epoch": 0.77, + "grad_norm": 13.207826272962391, + "learning_rate": 9.25991323119857e-06, + "loss": 1.0926, + "step": 13730 + }, + { + "epoch": 0.77, + "grad_norm": 10.459454509945742, + "learning_rate": 9.259054864784373e-06, + "loss": 1.128, + "step": 13735 + }, + { + "epoch": 0.77, + "grad_norm": 18.71762436735676, + "learning_rate": 9.258196040715935e-06, + "loss": 1.1374, + "step": 13740 + }, + { + "epoch": 0.77, + "grad_norm": 12.436272299996114, + "learning_rate": 9.25733675908554e-06, + "loss": 1.1349, + "step": 13745 + }, + { + "epoch": 0.77, + "grad_norm": 8.991719482585694, + "learning_rate": 9.256477019985521e-06, + "loss": 1.093, + "step": 13750 + }, + { + "epoch": 0.78, + "grad_norm": 10.862701413478378, + "learning_rate": 9.25561682350826e-06, + "loss": 1.0909, + "step": 13755 + }, + { + "epoch": 0.78, + "grad_norm": 9.538549779012483, + "learning_rate": 9.25475616974619e-06, + "loss": 1.0974, + "step": 13760 + }, + { + "epoch": 0.78, + "grad_norm": 22.799611630950533, + "learning_rate": 9.253895058791794e-06, + "loss": 1.1136, + "step": 13765 + }, + { + "epoch": 0.78, + "grad_norm": 17.952476271317316, + "learning_rate": 9.253033490737597e-06, + "loss": 1.1979, + "step": 13770 + }, + { + "epoch": 0.78, + "grad_norm": 11.261136565668522, + "learning_rate": 9.252171465676183e-06, + "loss": 1.1116, + "step": 13775 + }, + { + "epoch": 0.78, + "grad_norm": 5.966480746423545, + "learning_rate": 9.25130898370018e-06, + "loss": 1.1217, + "step": 13780 + }, + { + "epoch": 0.78, + "grad_norm": 8.65468608724836, + "learning_rate": 9.250446044902261e-06, + "loss": 1.0864, + "step": 13785 + }, + { + "epoch": 0.78, + "grad_norm": 5.9709406379466605, + "learning_rate": 9.249582649375159e-06, + "loss": 1.1271, + "step": 13790 + }, + { + "epoch": 0.78, + "grad_norm": 7.500872689576212, + "learning_rate": 9.248718797211644e-06, + "loss": 1.143, + "step": 13795 + }, + { + "epoch": 0.78, + "grad_norm": 6.169700697059949, + "learning_rate": 9.247854488504545e-06, + "loss": 1.1306, + "step": 13800 + }, + { + "epoch": 0.78, + "grad_norm": 8.698725865695183, + "learning_rate": 9.246989723346733e-06, + "loss": 1.0959, + "step": 13805 + }, + { + "epoch": 0.78, + "grad_norm": 8.749293720715379, + "learning_rate": 9.246124501831133e-06, + "loss": 1.1566, + "step": 13810 + }, + { + "epoch": 0.78, + "grad_norm": 6.177725593555664, + "learning_rate": 9.245258824050715e-06, + "loss": 1.1323, + "step": 13815 + }, + { + "epoch": 0.78, + "grad_norm": 10.441274968038554, + "learning_rate": 9.2443926900985e-06, + "loss": 1.1197, + "step": 13820 + }, + { + "epoch": 0.78, + "grad_norm": 6.58083618966104, + "learning_rate": 9.24352610006756e-06, + "loss": 1.15, + "step": 13825 + }, + { + "epoch": 0.78, + "grad_norm": 8.292663141819418, + "learning_rate": 9.24265905405101e-06, + "loss": 1.1437, + "step": 13830 + }, + { + "epoch": 0.78, + "grad_norm": 12.595715610980198, + "learning_rate": 9.241791552142024e-06, + "loss": 1.1708, + "step": 13835 + }, + { + "epoch": 0.78, + "grad_norm": 4.905953138287774, + "learning_rate": 9.240923594433812e-06, + "loss": 1.097, + "step": 13840 + }, + { + "epoch": 0.78, + "grad_norm": 7.481455008815603, + "learning_rate": 9.240055181019645e-06, + "loss": 1.1193, + "step": 13845 + }, + { + "epoch": 0.78, + "grad_norm": 6.429698145079362, + "learning_rate": 9.239186311992834e-06, + "loss": 1.1881, + "step": 13850 + }, + { + "epoch": 0.78, + "grad_norm": 12.162421162324343, + "learning_rate": 9.238316987446747e-06, + "loss": 1.1377, + "step": 13855 + }, + { + "epoch": 0.78, + "grad_norm": 6.89095074995895, + "learning_rate": 9.237447207474794e-06, + "loss": 1.1155, + "step": 13860 + }, + { + "epoch": 0.78, + "grad_norm": 14.822738067200005, + "learning_rate": 9.236576972170437e-06, + "loss": 1.145, + "step": 13865 + }, + { + "epoch": 0.78, + "grad_norm": 14.081231536891007, + "learning_rate": 9.235706281627186e-06, + "loss": 1.1025, + "step": 13870 + }, + { + "epoch": 0.78, + "grad_norm": 5.619280728725981, + "learning_rate": 9.234835135938602e-06, + "loss": 1.157, + "step": 13875 + }, + { + "epoch": 0.78, + "grad_norm": 20.579302852048286, + "learning_rate": 9.233963535198293e-06, + "loss": 1.1212, + "step": 13880 + }, + { + "epoch": 0.78, + "grad_norm": 16.05429958005681, + "learning_rate": 9.233091479499917e-06, + "loss": 1.1027, + "step": 13885 + }, + { + "epoch": 0.78, + "grad_norm": 15.796214002590123, + "learning_rate": 9.232218968937179e-06, + "loss": 1.0815, + "step": 13890 + }, + { + "epoch": 0.78, + "grad_norm": 7.165377012286247, + "learning_rate": 9.231346003603834e-06, + "loss": 1.0771, + "step": 13895 + }, + { + "epoch": 0.78, + "grad_norm": 7.4699465575682815, + "learning_rate": 9.230472583593689e-06, + "loss": 1.1094, + "step": 13900 + }, + { + "epoch": 0.78, + "grad_norm": 8.452581346147237, + "learning_rate": 9.22959870900059e-06, + "loss": 1.1625, + "step": 13905 + }, + { + "epoch": 0.78, + "grad_norm": 9.055197124904744, + "learning_rate": 9.228724379918445e-06, + "loss": 1.1085, + "step": 13910 + }, + { + "epoch": 0.78, + "grad_norm": 5.76551117845389, + "learning_rate": 9.227849596441205e-06, + "loss": 1.1528, + "step": 13915 + }, + { + "epoch": 0.78, + "grad_norm": 15.930873175564468, + "learning_rate": 9.226974358662865e-06, + "loss": 1.1737, + "step": 13920 + }, + { + "epoch": 0.78, + "grad_norm": 6.320446860384759, + "learning_rate": 9.226098666677474e-06, + "loss": 1.1629, + "step": 13925 + }, + { + "epoch": 0.78, + "grad_norm": 6.781621538328085, + "learning_rate": 9.22522252057913e-06, + "loss": 1.1138, + "step": 13930 + }, + { + "epoch": 0.79, + "grad_norm": 19.338828347921968, + "learning_rate": 9.224345920461982e-06, + "loss": 1.1157, + "step": 13935 + }, + { + "epoch": 0.79, + "grad_norm": 13.016020945129645, + "learning_rate": 9.22346886642022e-06, + "loss": 1.1395, + "step": 13940 + }, + { + "epoch": 0.79, + "grad_norm": 20.16111275895161, + "learning_rate": 9.222591358548087e-06, + "loss": 1.167, + "step": 13945 + }, + { + "epoch": 0.79, + "grad_norm": 14.864772680736904, + "learning_rate": 9.221713396939879e-06, + "loss": 1.1201, + "step": 13950 + }, + { + "epoch": 0.79, + "grad_norm": 20.342063790210815, + "learning_rate": 9.220834981689932e-06, + "loss": 1.1418, + "step": 13955 + }, + { + "epoch": 0.79, + "grad_norm": 36.95048013404268, + "learning_rate": 9.21995611289264e-06, + "loss": 1.1484, + "step": 13960 + }, + { + "epoch": 0.79, + "grad_norm": 26.491214139685393, + "learning_rate": 9.219076790642441e-06, + "loss": 1.2125, + "step": 13965 + }, + { + "epoch": 0.79, + "grad_norm": 6.563687511853851, + "learning_rate": 9.218197015033818e-06, + "loss": 1.1196, + "step": 13970 + }, + { + "epoch": 0.79, + "grad_norm": 17.26285170276772, + "learning_rate": 9.217316786161312e-06, + "loss": 1.1208, + "step": 13975 + }, + { + "epoch": 0.79, + "grad_norm": 8.454411330595025, + "learning_rate": 9.216436104119501e-06, + "loss": 1.1488, + "step": 13980 + }, + { + "epoch": 0.79, + "grad_norm": 8.161895232722665, + "learning_rate": 9.215554969003026e-06, + "loss": 1.1275, + "step": 13985 + }, + { + "epoch": 0.79, + "grad_norm": 10.091020849494228, + "learning_rate": 9.214673380906562e-06, + "loss": 1.1399, + "step": 13990 + }, + { + "epoch": 0.79, + "grad_norm": 10.11861779810707, + "learning_rate": 9.213791339924845e-06, + "loss": 1.085, + "step": 13995 + }, + { + "epoch": 0.79, + "grad_norm": 12.965318133568058, + "learning_rate": 9.212908846152651e-06, + "loss": 1.0959, + "step": 14000 + }, + { + "epoch": 0.79, + "grad_norm": 11.660287254545757, + "learning_rate": 9.212025899684806e-06, + "loss": 1.1838, + "step": 14005 + }, + { + "epoch": 0.79, + "grad_norm": 11.122441198003585, + "learning_rate": 9.211142500616193e-06, + "loss": 1.1873, + "step": 14010 + }, + { + "epoch": 0.79, + "grad_norm": 13.188803797134169, + "learning_rate": 9.21025864904173e-06, + "loss": 1.2069, + "step": 14015 + }, + { + "epoch": 0.79, + "grad_norm": 12.980304627612265, + "learning_rate": 9.209374345056395e-06, + "loss": 1.1159, + "step": 14020 + }, + { + "epoch": 0.79, + "grad_norm": 35.89189235415076, + "learning_rate": 9.20848958875521e-06, + "loss": 1.166, + "step": 14025 + }, + { + "epoch": 0.79, + "grad_norm": 17.5039723213255, + "learning_rate": 9.207604380233243e-06, + "loss": 1.1617, + "step": 14030 + }, + { + "epoch": 0.79, + "grad_norm": 21.597566500556304, + "learning_rate": 9.206718719585618e-06, + "loss": 1.1739, + "step": 14035 + }, + { + "epoch": 0.79, + "grad_norm": 11.964009069751182, + "learning_rate": 9.205832606907501e-06, + "loss": 1.1549, + "step": 14040 + }, + { + "epoch": 0.79, + "grad_norm": 6.820664262938266, + "learning_rate": 9.204946042294106e-06, + "loss": 1.1297, + "step": 14045 + }, + { + "epoch": 0.79, + "grad_norm": 15.643955951170904, + "learning_rate": 9.204059025840704e-06, + "loss": 1.1171, + "step": 14050 + }, + { + "epoch": 0.79, + "grad_norm": 9.945312211858417, + "learning_rate": 9.203171557642604e-06, + "loss": 1.1169, + "step": 14055 + }, + { + "epoch": 0.79, + "grad_norm": 8.930336859532545, + "learning_rate": 9.20228363779517e-06, + "loss": 1.1336, + "step": 14060 + }, + { + "epoch": 0.79, + "grad_norm": 9.606237171809015, + "learning_rate": 9.201395266393813e-06, + "loss": 1.1167, + "step": 14065 + }, + { + "epoch": 0.79, + "grad_norm": 4.94701859813007, + "learning_rate": 9.200506443533994e-06, + "loss": 1.1178, + "step": 14070 + }, + { + "epoch": 0.79, + "grad_norm": 9.831832557644484, + "learning_rate": 9.199617169311218e-06, + "loss": 1.093, + "step": 14075 + }, + { + "epoch": 0.79, + "grad_norm": 5.041097715615692, + "learning_rate": 9.198727443821042e-06, + "loss": 1.1481, + "step": 14080 + }, + { + "epoch": 0.79, + "grad_norm": 13.335247721699924, + "learning_rate": 9.197837267159074e-06, + "loss": 1.0983, + "step": 14085 + }, + { + "epoch": 0.79, + "grad_norm": 27.95839710913172, + "learning_rate": 9.196946639420965e-06, + "loss": 1.1165, + "step": 14090 + }, + { + "epoch": 0.79, + "grad_norm": 22.61718909579245, + "learning_rate": 9.196055560702414e-06, + "loss": 1.107, + "step": 14095 + }, + { + "epoch": 0.79, + "grad_norm": 21.087475547267324, + "learning_rate": 9.195164031099177e-06, + "loss": 1.1095, + "step": 14100 + }, + { + "epoch": 0.79, + "grad_norm": 5.919048826713127, + "learning_rate": 9.19427205070705e-06, + "loss": 1.0918, + "step": 14105 + }, + { + "epoch": 0.8, + "grad_norm": 38.24959780739236, + "learning_rate": 9.19337961962188e-06, + "loss": 1.1554, + "step": 14110 + }, + { + "epoch": 0.8, + "grad_norm": 30.465789912767008, + "learning_rate": 9.192486737939565e-06, + "loss": 1.144, + "step": 14115 + }, + { + "epoch": 0.8, + "grad_norm": 29.561028925518176, + "learning_rate": 9.191593405756045e-06, + "loss": 1.1474, + "step": 14120 + }, + { + "epoch": 0.8, + "grad_norm": 18.82457678039254, + "learning_rate": 9.190699623167315e-06, + "loss": 1.1571, + "step": 14125 + }, + { + "epoch": 0.8, + "grad_norm": 9.485736071645771, + "learning_rate": 9.189805390269416e-06, + "loss": 1.1314, + "step": 14130 + }, + { + "epoch": 0.8, + "grad_norm": 9.880076748288326, + "learning_rate": 9.188910707158436e-06, + "loss": 1.1214, + "step": 14135 + }, + { + "epoch": 0.8, + "grad_norm": 7.747732571231177, + "learning_rate": 9.188015573930513e-06, + "loss": 1.1376, + "step": 14140 + }, + { + "epoch": 0.8, + "grad_norm": 9.10280136658978, + "learning_rate": 9.187119990681835e-06, + "loss": 1.1462, + "step": 14145 + }, + { + "epoch": 0.8, + "grad_norm": 5.331362335970542, + "learning_rate": 9.186223957508634e-06, + "loss": 1.106, + "step": 14150 + }, + { + "epoch": 0.8, + "grad_norm": 13.191655740486233, + "learning_rate": 9.185327474507193e-06, + "loss": 1.0988, + "step": 14155 + }, + { + "epoch": 0.8, + "grad_norm": 14.897305841846903, + "learning_rate": 9.184430541773845e-06, + "loss": 1.1619, + "step": 14160 + }, + { + "epoch": 0.8, + "grad_norm": 11.539959522238624, + "learning_rate": 9.183533159404966e-06, + "loss": 1.0901, + "step": 14165 + }, + { + "epoch": 0.8, + "grad_norm": 6.96237218812107, + "learning_rate": 9.182635327496988e-06, + "loss": 1.1275, + "step": 14170 + }, + { + "epoch": 0.8, + "grad_norm": 23.810293716473804, + "learning_rate": 9.181737046146384e-06, + "loss": 1.1698, + "step": 14175 + }, + { + "epoch": 0.8, + "grad_norm": 14.373524810798113, + "learning_rate": 9.180838315449678e-06, + "loss": 1.1174, + "step": 14180 + }, + { + "epoch": 0.8, + "grad_norm": 7.079215045233922, + "learning_rate": 9.179939135503444e-06, + "loss": 1.1378, + "step": 14185 + }, + { + "epoch": 0.8, + "grad_norm": 8.353180060990551, + "learning_rate": 9.179039506404304e-06, + "loss": 1.1267, + "step": 14190 + }, + { + "epoch": 0.8, + "grad_norm": 10.091622773051276, + "learning_rate": 9.178139428248924e-06, + "loss": 1.1655, + "step": 14195 + }, + { + "epoch": 0.8, + "grad_norm": 18.32336553810114, + "learning_rate": 9.177238901134022e-06, + "loss": 1.0768, + "step": 14200 + }, + { + "epoch": 0.8, + "grad_norm": 9.730446886045902, + "learning_rate": 9.176337925156365e-06, + "loss": 1.0966, + "step": 14205 + }, + { + "epoch": 0.8, + "grad_norm": 8.616189975073862, + "learning_rate": 9.175436500412768e-06, + "loss": 1.1357, + "step": 14210 + }, + { + "epoch": 0.8, + "grad_norm": 8.225365663349692, + "learning_rate": 9.17453462700009e-06, + "loss": 1.1409, + "step": 14215 + }, + { + "epoch": 0.8, + "grad_norm": 8.40414752329751, + "learning_rate": 9.173632305015244e-06, + "loss": 1.0925, + "step": 14220 + }, + { + "epoch": 0.8, + "grad_norm": 8.10721750390549, + "learning_rate": 9.172729534555185e-06, + "loss": 1.0974, + "step": 14225 + }, + { + "epoch": 0.8, + "grad_norm": 5.690636915096559, + "learning_rate": 9.171826315716923e-06, + "loss": 1.1386, + "step": 14230 + }, + { + "epoch": 0.8, + "grad_norm": 10.255810219626824, + "learning_rate": 9.170922648597511e-06, + "loss": 1.1288, + "step": 14235 + }, + { + "epoch": 0.8, + "grad_norm": 10.806463877990772, + "learning_rate": 9.170018533294054e-06, + "loss": 1.0899, + "step": 14240 + }, + { + "epoch": 0.8, + "grad_norm": 5.6608942379184795, + "learning_rate": 9.169113969903701e-06, + "loss": 1.1235, + "step": 14245 + }, + { + "epoch": 0.8, + "grad_norm": 7.4079556215100535, + "learning_rate": 9.168208958523652e-06, + "loss": 1.1528, + "step": 14250 + }, + { + "epoch": 0.8, + "grad_norm": 5.863205442316616, + "learning_rate": 9.167303499251158e-06, + "loss": 1.1222, + "step": 14255 + }, + { + "epoch": 0.8, + "grad_norm": 7.518350535844348, + "learning_rate": 9.166397592183506e-06, + "loss": 1.1012, + "step": 14260 + }, + { + "epoch": 0.8, + "grad_norm": 6.880158890445731, + "learning_rate": 9.165491237418049e-06, + "loss": 1.1033, + "step": 14265 + }, + { + "epoch": 0.8, + "grad_norm": 8.999510079626809, + "learning_rate": 9.164584435052173e-06, + "loss": 1.1345, + "step": 14270 + }, + { + "epoch": 0.8, + "grad_norm": 16.9060481237708, + "learning_rate": 9.163677185183319e-06, + "loss": 1.0951, + "step": 14275 + }, + { + "epoch": 0.8, + "grad_norm": 12.5116064946818, + "learning_rate": 9.162769487908979e-06, + "loss": 1.1321, + "step": 14280 + }, + { + "epoch": 0.8, + "grad_norm": 5.889670907161606, + "learning_rate": 9.161861343326682e-06, + "loss": 1.1357, + "step": 14285 + }, + { + "epoch": 0.81, + "grad_norm": 13.560013535585883, + "learning_rate": 9.160952751534017e-06, + "loss": 1.1131, + "step": 14290 + }, + { + "epoch": 0.81, + "grad_norm": 19.136247411587014, + "learning_rate": 9.160043712628617e-06, + "loss": 1.1223, + "step": 14295 + }, + { + "epoch": 0.81, + "grad_norm": 18.823130733668364, + "learning_rate": 9.15913422670816e-06, + "loss": 1.096, + "step": 14300 + }, + { + "epoch": 0.81, + "grad_norm": 10.246879066563004, + "learning_rate": 9.158224293870372e-06, + "loss": 1.1279, + "step": 14305 + }, + { + "epoch": 0.81, + "grad_norm": 10.241964967977525, + "learning_rate": 9.157313914213035e-06, + "loss": 1.0837, + "step": 14310 + }, + { + "epoch": 0.81, + "grad_norm": 6.153056223550367, + "learning_rate": 9.15640308783397e-06, + "loss": 1.1464, + "step": 14315 + }, + { + "epoch": 0.81, + "grad_norm": 9.180470132255973, + "learning_rate": 9.155491814831048e-06, + "loss": 1.0901, + "step": 14320 + }, + { + "epoch": 0.81, + "grad_norm": 8.214125432403987, + "learning_rate": 9.154580095302192e-06, + "loss": 1.1685, + "step": 14325 + }, + { + "epoch": 0.81, + "grad_norm": 10.747395491772629, + "learning_rate": 9.15366792934537e-06, + "loss": 1.1498, + "step": 14330 + }, + { + "epoch": 0.81, + "grad_norm": 12.060230650050356, + "learning_rate": 9.152755317058595e-06, + "loss": 1.0823, + "step": 14335 + }, + { + "epoch": 0.81, + "grad_norm": 10.522138693026545, + "learning_rate": 9.151842258539935e-06, + "loss": 1.1511, + "step": 14340 + }, + { + "epoch": 0.81, + "grad_norm": 8.049128238439922, + "learning_rate": 9.1509287538875e-06, + "loss": 1.1553, + "step": 14345 + }, + { + "epoch": 0.81, + "grad_norm": 16.059989275654193, + "learning_rate": 9.150014803199452e-06, + "loss": 1.1317, + "step": 14350 + }, + { + "epoch": 0.81, + "grad_norm": 14.197991414325541, + "learning_rate": 9.149100406573999e-06, + "loss": 1.1415, + "step": 14355 + }, + { + "epoch": 0.81, + "grad_norm": 14.55678018802807, + "learning_rate": 9.148185564109395e-06, + "loss": 1.1033, + "step": 14360 + }, + { + "epoch": 0.81, + "grad_norm": 13.394148323407906, + "learning_rate": 9.147270275903943e-06, + "loss": 1.126, + "step": 14365 + }, + { + "epoch": 0.81, + "grad_norm": 8.931639924562274, + "learning_rate": 9.146354542055999e-06, + "loss": 1.1332, + "step": 14370 + }, + { + "epoch": 0.81, + "grad_norm": 6.946614747006146, + "learning_rate": 9.14543836266396e-06, + "loss": 1.1761, + "step": 14375 + }, + { + "epoch": 0.81, + "grad_norm": 6.851780316902592, + "learning_rate": 9.144521737826273e-06, + "loss": 1.0815, + "step": 14380 + }, + { + "epoch": 0.81, + "grad_norm": 9.36034929892392, + "learning_rate": 9.143604667641434e-06, + "loss": 1.1592, + "step": 14385 + }, + { + "epoch": 0.81, + "grad_norm": 14.50869854422606, + "learning_rate": 9.142687152207984e-06, + "loss": 1.1145, + "step": 14390 + }, + { + "epoch": 0.81, + "grad_norm": 25.04190733959511, + "learning_rate": 9.14176919162452e-06, + "loss": 1.1217, + "step": 14395 + }, + { + "epoch": 0.81, + "grad_norm": 5.962800932883541, + "learning_rate": 9.140850785989677e-06, + "loss": 1.1872, + "step": 14400 + }, + { + "epoch": 0.81, + "grad_norm": 12.229314658841174, + "learning_rate": 9.139931935402141e-06, + "loss": 1.1138, + "step": 14405 + }, + { + "epoch": 0.81, + "grad_norm": 9.744552444135527, + "learning_rate": 9.139012639960649e-06, + "loss": 1.1389, + "step": 14410 + }, + { + "epoch": 0.81, + "grad_norm": 7.328030037489963, + "learning_rate": 9.13809289976398e-06, + "loss": 1.1035, + "step": 14415 + }, + { + "epoch": 0.81, + "grad_norm": 5.766340861757486, + "learning_rate": 9.137172714910967e-06, + "loss": 1.1192, + "step": 14420 + }, + { + "epoch": 0.81, + "grad_norm": 6.20875420425887, + "learning_rate": 9.136252085500488e-06, + "loss": 1.1332, + "step": 14425 + }, + { + "epoch": 0.81, + "grad_norm": 6.330742111571248, + "learning_rate": 9.135331011631465e-06, + "loss": 1.1166, + "step": 14430 + }, + { + "epoch": 0.81, + "grad_norm": 7.432026828331053, + "learning_rate": 9.134409493402877e-06, + "loss": 1.0995, + "step": 14435 + }, + { + "epoch": 0.81, + "grad_norm": 6.2613884077564546, + "learning_rate": 9.13348753091374e-06, + "loss": 1.1127, + "step": 14440 + }, + { + "epoch": 0.81, + "grad_norm": 11.055163315915612, + "learning_rate": 9.132565124263129e-06, + "loss": 1.1122, + "step": 14445 + }, + { + "epoch": 0.81, + "grad_norm": 16.96554909998541, + "learning_rate": 9.131642273550155e-06, + "loss": 1.1098, + "step": 14450 + }, + { + "epoch": 0.81, + "grad_norm": 17.056357541722985, + "learning_rate": 9.130718978873983e-06, + "loss": 1.096, + "step": 14455 + }, + { + "epoch": 0.81, + "grad_norm": 10.244625753913043, + "learning_rate": 9.129795240333828e-06, + "loss": 1.07, + "step": 14460 + }, + { + "epoch": 0.82, + "grad_norm": 5.237572939386469, + "learning_rate": 9.128871058028947e-06, + "loss": 1.0604, + "step": 14465 + }, + { + "epoch": 0.82, + "grad_norm": 22.58110063273889, + "learning_rate": 9.12794643205865e-06, + "loss": 1.1535, + "step": 14470 + }, + { + "epoch": 0.82, + "grad_norm": 26.196714509958802, + "learning_rate": 9.12702136252229e-06, + "loss": 1.1287, + "step": 14475 + }, + { + "epoch": 0.82, + "grad_norm": 8.609119541234472, + "learning_rate": 9.126095849519272e-06, + "loss": 1.163, + "step": 14480 + }, + { + "epoch": 0.82, + "grad_norm": 8.672169768287137, + "learning_rate": 9.125169893149043e-06, + "loss": 1.0958, + "step": 14485 + }, + { + "epoch": 0.82, + "grad_norm": 9.348789296891935, + "learning_rate": 9.124243493511105e-06, + "loss": 1.062, + "step": 14490 + }, + { + "epoch": 0.82, + "grad_norm": 11.278743464736229, + "learning_rate": 9.123316650705001e-06, + "loss": 1.1346, + "step": 14495 + }, + { + "epoch": 0.82, + "grad_norm": 6.196639817099412, + "learning_rate": 9.122389364830327e-06, + "loss": 1.0966, + "step": 14500 + }, + { + "epoch": 0.82, + "grad_norm": 6.767105478740581, + "learning_rate": 9.121461635986719e-06, + "loss": 1.0693, + "step": 14505 + }, + { + "epoch": 0.82, + "grad_norm": 16.26256420492267, + "learning_rate": 9.120533464273872e-06, + "loss": 1.117, + "step": 14510 + }, + { + "epoch": 0.82, + "grad_norm": 13.309338429855874, + "learning_rate": 9.119604849791518e-06, + "loss": 1.1151, + "step": 14515 + }, + { + "epoch": 0.82, + "grad_norm": 7.115462869171166, + "learning_rate": 9.11867579263944e-06, + "loss": 1.1576, + "step": 14520 + }, + { + "epoch": 0.82, + "grad_norm": 13.463568967606593, + "learning_rate": 9.117746292917473e-06, + "loss": 1.0641, + "step": 14525 + }, + { + "epoch": 0.82, + "grad_norm": 13.1024608003889, + "learning_rate": 9.116816350725492e-06, + "loss": 1.0755, + "step": 14530 + }, + { + "epoch": 0.82, + "grad_norm": 10.427344289970103, + "learning_rate": 9.115885966163426e-06, + "loss": 1.1344, + "step": 14535 + }, + { + "epoch": 0.82, + "grad_norm": 5.83017031327818, + "learning_rate": 9.11495513933125e-06, + "loss": 1.1203, + "step": 14540 + }, + { + "epoch": 0.82, + "grad_norm": 19.718512738635773, + "learning_rate": 9.114023870328982e-06, + "loss": 1.1189, + "step": 14545 + }, + { + "epoch": 0.82, + "grad_norm": 7.840400431357369, + "learning_rate": 9.113092159256694e-06, + "loss": 1.1485, + "step": 14550 + }, + { + "epoch": 0.82, + "grad_norm": 5.694466414197182, + "learning_rate": 9.112160006214499e-06, + "loss": 1.1119, + "step": 14555 + }, + { + "epoch": 0.82, + "grad_norm": 12.36702082978927, + "learning_rate": 9.111227411302564e-06, + "loss": 1.1151, + "step": 14560 + }, + { + "epoch": 0.82, + "grad_norm": 8.742012490451101, + "learning_rate": 9.110294374621099e-06, + "loss": 1.1402, + "step": 14565 + }, + { + "epoch": 0.82, + "grad_norm": 5.793883486862604, + "learning_rate": 9.109360896270363e-06, + "loss": 1.1191, + "step": 14570 + }, + { + "epoch": 0.82, + "grad_norm": 9.332386305768537, + "learning_rate": 9.108426976350664e-06, + "loss": 1.1635, + "step": 14575 + }, + { + "epoch": 0.82, + "grad_norm": 5.582900024472447, + "learning_rate": 9.107492614962352e-06, + "loss": 1.0885, + "step": 14580 + }, + { + "epoch": 0.82, + "grad_norm": 10.90860873158703, + "learning_rate": 9.106557812205833e-06, + "loss": 1.1498, + "step": 14585 + }, + { + "epoch": 0.82, + "grad_norm": 6.122243577764641, + "learning_rate": 9.105622568181554e-06, + "loss": 1.1213, + "step": 14590 + }, + { + "epoch": 0.82, + "grad_norm": 7.163872220241011, + "learning_rate": 9.104686882990009e-06, + "loss": 1.0777, + "step": 14595 + }, + { + "epoch": 0.82, + "grad_norm": 8.198994160118266, + "learning_rate": 9.103750756731743e-06, + "loss": 1.0947, + "step": 14600 + }, + { + "epoch": 0.82, + "grad_norm": 5.267489683648388, + "learning_rate": 9.102814189507347e-06, + "loss": 1.1456, + "step": 14605 + }, + { + "epoch": 0.82, + "grad_norm": 11.229897114218584, + "learning_rate": 9.10187718141746e-06, + "loss": 1.0682, + "step": 14610 + }, + { + "epoch": 0.82, + "grad_norm": 13.801560161867796, + "learning_rate": 9.100939732562766e-06, + "loss": 1.1285, + "step": 14615 + }, + { + "epoch": 0.82, + "grad_norm": 10.02997697852096, + "learning_rate": 9.100001843044e-06, + "loss": 1.1209, + "step": 14620 + }, + { + "epoch": 0.82, + "grad_norm": 6.329984777855054, + "learning_rate": 9.099063512961941e-06, + "loss": 1.1269, + "step": 14625 + }, + { + "epoch": 0.82, + "grad_norm": 8.538026072721141, + "learning_rate": 9.098124742417417e-06, + "loss": 1.1004, + "step": 14630 + }, + { + "epoch": 0.82, + "grad_norm": 10.725961038889572, + "learning_rate": 9.097185531511303e-06, + "loss": 1.1349, + "step": 14635 + }, + { + "epoch": 0.82, + "grad_norm": 8.842410136833655, + "learning_rate": 9.096245880344521e-06, + "loss": 1.11, + "step": 14640 + }, + { + "epoch": 0.83, + "grad_norm": 5.904407469255224, + "learning_rate": 9.09530578901804e-06, + "loss": 1.1297, + "step": 14645 + }, + { + "epoch": 0.83, + "grad_norm": 8.895722514745561, + "learning_rate": 9.09436525763288e-06, + "loss": 1.1291, + "step": 14650 + }, + { + "epoch": 0.83, + "grad_norm": 6.609008054231528, + "learning_rate": 9.093424286290103e-06, + "loss": 1.0738, + "step": 14655 + }, + { + "epoch": 0.83, + "grad_norm": 28.32829868657577, + "learning_rate": 9.09248287509082e-06, + "loss": 1.1297, + "step": 14660 + }, + { + "epoch": 0.83, + "grad_norm": 11.456125814199051, + "learning_rate": 9.091541024136192e-06, + "loss": 1.1334, + "step": 14665 + }, + { + "epoch": 0.83, + "grad_norm": 8.342499478758848, + "learning_rate": 9.090598733527422e-06, + "loss": 1.0919, + "step": 14670 + }, + { + "epoch": 0.83, + "grad_norm": 8.794926518998487, + "learning_rate": 9.089656003365765e-06, + "loss": 1.1409, + "step": 14675 + }, + { + "epoch": 0.83, + "grad_norm": 13.01187948942198, + "learning_rate": 9.088712833752522e-06, + "loss": 1.1168, + "step": 14680 + }, + { + "epoch": 0.83, + "grad_norm": 8.358538361911016, + "learning_rate": 9.08776922478904e-06, + "loss": 1.0703, + "step": 14685 + }, + { + "epoch": 0.83, + "grad_norm": 13.303843672961618, + "learning_rate": 9.086825176576714e-06, + "loss": 1.1206, + "step": 14690 + }, + { + "epoch": 0.83, + "grad_norm": 10.090847186154365, + "learning_rate": 9.085880689216986e-06, + "loss": 1.0842, + "step": 14695 + }, + { + "epoch": 0.83, + "grad_norm": 5.845417577175287, + "learning_rate": 9.084935762811346e-06, + "loss": 1.0998, + "step": 14700 + }, + { + "epoch": 0.83, + "grad_norm": 15.935343897262458, + "learning_rate": 9.08399039746133e-06, + "loss": 1.1412, + "step": 14705 + }, + { + "epoch": 0.83, + "grad_norm": 27.903758243616746, + "learning_rate": 9.083044593268522e-06, + "loss": 1.1039, + "step": 14710 + }, + { + "epoch": 0.83, + "grad_norm": 9.379569474444917, + "learning_rate": 9.082098350334552e-06, + "loss": 1.1555, + "step": 14715 + }, + { + "epoch": 0.83, + "grad_norm": 13.374143094036183, + "learning_rate": 9.081151668761097e-06, + "loss": 1.1063, + "step": 14720 + }, + { + "epoch": 0.83, + "grad_norm": 26.567546670372046, + "learning_rate": 9.080204548649887e-06, + "loss": 1.1441, + "step": 14725 + }, + { + "epoch": 0.83, + "grad_norm": 17.5783194755378, + "learning_rate": 9.07925699010269e-06, + "loss": 1.1372, + "step": 14730 + }, + { + "epoch": 0.83, + "grad_norm": 19.64054869897644, + "learning_rate": 9.078308993221325e-06, + "loss": 1.0967, + "step": 14735 + }, + { + "epoch": 0.83, + "grad_norm": 36.904593891318484, + "learning_rate": 9.07736055810766e-06, + "loss": 1.1641, + "step": 14740 + }, + { + "epoch": 0.83, + "grad_norm": 60.90044606885157, + "learning_rate": 9.076411684863609e-06, + "loss": 1.2104, + "step": 14745 + }, + { + "epoch": 0.83, + "grad_norm": 31.515403806869745, + "learning_rate": 9.075462373591131e-06, + "loss": 1.1401, + "step": 14750 + }, + { + "epoch": 0.83, + "grad_norm": 13.527393229252933, + "learning_rate": 9.074512624392237e-06, + "loss": 1.1814, + "step": 14755 + }, + { + "epoch": 0.83, + "grad_norm": 25.559324892237377, + "learning_rate": 9.073562437368978e-06, + "loss": 1.2215, + "step": 14760 + }, + { + "epoch": 0.83, + "grad_norm": 9.17703256392607, + "learning_rate": 9.072611812623455e-06, + "loss": 1.114, + "step": 14765 + }, + { + "epoch": 0.83, + "grad_norm": 16.62022547087072, + "learning_rate": 9.07166075025782e-06, + "loss": 1.1284, + "step": 14770 + }, + { + "epoch": 0.83, + "grad_norm": 47.64907967284982, + "learning_rate": 9.07070925037427e-06, + "loss": 1.0944, + "step": 14775 + }, + { + "epoch": 0.83, + "grad_norm": 11.732313223783803, + "learning_rate": 9.069757313075042e-06, + "loss": 1.1701, + "step": 14780 + }, + { + "epoch": 0.83, + "grad_norm": 41.91910561016749, + "learning_rate": 9.068804938462432e-06, + "loss": 1.166, + "step": 14785 + }, + { + "epoch": 0.83, + "grad_norm": 28.458670265029813, + "learning_rate": 9.067852126638775e-06, + "loss": 1.0961, + "step": 14790 + }, + { + "epoch": 0.83, + "grad_norm": 43.422055609801596, + "learning_rate": 9.066898877706452e-06, + "loss": 1.1488, + "step": 14795 + }, + { + "epoch": 0.83, + "grad_norm": 45.83463846175388, + "learning_rate": 9.065945191767897e-06, + "loss": 1.143, + "step": 14800 + }, + { + "epoch": 0.83, + "grad_norm": 13.366062323915122, + "learning_rate": 9.064991068925587e-06, + "loss": 1.1366, + "step": 14805 + }, + { + "epoch": 0.83, + "grad_norm": 48.11725412910709, + "learning_rate": 9.064036509282046e-06, + "loss": 1.1632, + "step": 14810 + }, + { + "epoch": 0.83, + "grad_norm": 12.427492522436365, + "learning_rate": 9.063081512939847e-06, + "loss": 1.1133, + "step": 14815 + }, + { + "epoch": 0.84, + "grad_norm": 29.720651040287922, + "learning_rate": 9.062126080001608e-06, + "loss": 1.0939, + "step": 14820 + }, + { + "epoch": 0.84, + "grad_norm": 23.438743515526202, + "learning_rate": 9.061170210569992e-06, + "loss": 1.159, + "step": 14825 + }, + { + "epoch": 0.84, + "grad_norm": 24.303463946418535, + "learning_rate": 9.060213904747715e-06, + "loss": 1.1413, + "step": 14830 + }, + { + "epoch": 0.84, + "grad_norm": 39.2810075255219, + "learning_rate": 9.059257162637535e-06, + "loss": 1.1823, + "step": 14835 + }, + { + "epoch": 0.84, + "grad_norm": 14.22613215485907, + "learning_rate": 9.058299984342258e-06, + "loss": 1.1167, + "step": 14840 + }, + { + "epoch": 0.84, + "grad_norm": 7.6565528713030995, + "learning_rate": 9.057342369964737e-06, + "loss": 1.122, + "step": 14845 + }, + { + "epoch": 0.84, + "grad_norm": 12.097227707630147, + "learning_rate": 9.05638431960787e-06, + "loss": 1.1826, + "step": 14850 + }, + { + "epoch": 0.84, + "grad_norm": 12.724964176778249, + "learning_rate": 9.055425833374609e-06, + "loss": 1.122, + "step": 14855 + }, + { + "epoch": 0.84, + "grad_norm": 6.054356700617171, + "learning_rate": 9.05446691136794e-06, + "loss": 1.1403, + "step": 14860 + }, + { + "epoch": 0.84, + "grad_norm": 9.170345265610834, + "learning_rate": 9.053507553690912e-06, + "loss": 1.1732, + "step": 14865 + }, + { + "epoch": 0.84, + "grad_norm": 19.123171957572076, + "learning_rate": 9.052547760446606e-06, + "loss": 1.1322, + "step": 14870 + }, + { + "epoch": 0.84, + "grad_norm": 5.312985123014363, + "learning_rate": 9.051587531738158e-06, + "loss": 1.1324, + "step": 14875 + }, + { + "epoch": 0.84, + "grad_norm": 8.153623725885799, + "learning_rate": 9.050626867668747e-06, + "loss": 1.0854, + "step": 14880 + }, + { + "epoch": 0.84, + "grad_norm": 12.212587392556209, + "learning_rate": 9.049665768341604e-06, + "loss": 1.1518, + "step": 14885 + }, + { + "epoch": 0.84, + "grad_norm": 8.601145005621945, + "learning_rate": 9.048704233860003e-06, + "loss": 1.1126, + "step": 14890 + }, + { + "epoch": 0.84, + "grad_norm": 17.737501036088073, + "learning_rate": 9.047742264327261e-06, + "loss": 1.1573, + "step": 14895 + }, + { + "epoch": 0.84, + "grad_norm": 17.44119029656582, + "learning_rate": 9.04677985984675e-06, + "loss": 1.0828, + "step": 14900 + }, + { + "epoch": 0.84, + "grad_norm": 13.019495679200885, + "learning_rate": 9.045817020521884e-06, + "loss": 1.0922, + "step": 14905 + }, + { + "epoch": 0.84, + "grad_norm": 8.396485879866313, + "learning_rate": 9.044853746456123e-06, + "loss": 1.1129, + "step": 14910 + }, + { + "epoch": 0.84, + "grad_norm": 7.985975196367451, + "learning_rate": 9.043890037752976e-06, + "loss": 1.1021, + "step": 14915 + }, + { + "epoch": 0.84, + "grad_norm": 11.738204524180604, + "learning_rate": 9.042925894515998e-06, + "loss": 1.1154, + "step": 14920 + }, + { + "epoch": 0.84, + "grad_norm": 16.748101190111107, + "learning_rate": 9.041961316848787e-06, + "loss": 1.089, + "step": 14925 + }, + { + "epoch": 0.84, + "grad_norm": 17.73538533372155, + "learning_rate": 9.040996304854997e-06, + "loss": 1.1355, + "step": 14930 + }, + { + "epoch": 0.84, + "grad_norm": 11.113794638447425, + "learning_rate": 9.040030858638319e-06, + "loss": 1.1045, + "step": 14935 + }, + { + "epoch": 0.84, + "grad_norm": 6.7434952780550805, + "learning_rate": 9.039064978302493e-06, + "loss": 1.1086, + "step": 14940 + }, + { + "epoch": 0.84, + "grad_norm": 15.603301122450585, + "learning_rate": 9.038098663951311e-06, + "loss": 1.1274, + "step": 14945 + }, + { + "epoch": 0.84, + "grad_norm": 11.757240851222182, + "learning_rate": 9.037131915688607e-06, + "loss": 1.1293, + "step": 14950 + }, + { + "epoch": 0.84, + "grad_norm": 31.923350278535334, + "learning_rate": 9.03616473361826e-06, + "loss": 1.1706, + "step": 14955 + }, + { + "epoch": 0.84, + "grad_norm": 5.82388837537516, + "learning_rate": 9.035197117844199e-06, + "loss": 1.1331, + "step": 14960 + }, + { + "epoch": 0.84, + "grad_norm": 7.319920335950254, + "learning_rate": 9.0342290684704e-06, + "loss": 1.1461, + "step": 14965 + }, + { + "epoch": 0.84, + "grad_norm": 54.85723431684405, + "learning_rate": 9.033260585600883e-06, + "loss": 1.1265, + "step": 14970 + }, + { + "epoch": 0.84, + "grad_norm": 28.824414345469417, + "learning_rate": 9.032291669339716e-06, + "loss": 1.104, + "step": 14975 + }, + { + "epoch": 0.84, + "grad_norm": 13.40871623293331, + "learning_rate": 9.031322319791014e-06, + "loss": 1.1264, + "step": 14980 + }, + { + "epoch": 0.84, + "grad_norm": 8.612526593600604, + "learning_rate": 9.030352537058936e-06, + "loss": 1.1385, + "step": 14985 + }, + { + "epoch": 0.84, + "grad_norm": 18.914087751351797, + "learning_rate": 9.02938232124769e-06, + "loss": 1.1049, + "step": 14990 + }, + { + "epoch": 0.84, + "grad_norm": 12.266647447125317, + "learning_rate": 9.028411672461532e-06, + "loss": 1.1158, + "step": 14995 + }, + { + "epoch": 0.85, + "grad_norm": 7.04429399058661, + "learning_rate": 9.027440590804761e-06, + "loss": 1.1337, + "step": 15000 + }, + { + "epoch": 0.85, + "grad_norm": 13.948380414885566, + "learning_rate": 9.026469076381724e-06, + "loss": 1.1182, + "step": 15005 + }, + { + "epoch": 0.85, + "grad_norm": 8.574202684687753, + "learning_rate": 9.025497129296813e-06, + "loss": 1.1119, + "step": 15010 + }, + { + "epoch": 0.85, + "grad_norm": 13.087659409092865, + "learning_rate": 9.024524749654472e-06, + "loss": 1.1082, + "step": 15015 + }, + { + "epoch": 0.85, + "grad_norm": 17.427771613553315, + "learning_rate": 9.023551937559186e-06, + "loss": 1.1108, + "step": 15020 + }, + { + "epoch": 0.85, + "grad_norm": 7.490019223859416, + "learning_rate": 9.022578693115484e-06, + "loss": 1.1397, + "step": 15025 + }, + { + "epoch": 0.85, + "grad_norm": 15.256419026868922, + "learning_rate": 9.021605016427949e-06, + "loss": 1.127, + "step": 15030 + }, + { + "epoch": 0.85, + "grad_norm": 13.452780712753931, + "learning_rate": 9.020630907601207e-06, + "loss": 1.0729, + "step": 15035 + }, + { + "epoch": 0.85, + "grad_norm": 7.4594919433949745, + "learning_rate": 9.019656366739929e-06, + "loss": 1.0662, + "step": 15040 + }, + { + "epoch": 0.85, + "grad_norm": 19.75808217651537, + "learning_rate": 9.018681393948836e-06, + "loss": 1.008, + "step": 15045 + }, + { + "epoch": 0.85, + "grad_norm": 5.744640468159116, + "learning_rate": 9.017705989332687e-06, + "loss": 1.0693, + "step": 15050 + }, + { + "epoch": 0.85, + "grad_norm": 6.8766407741970905, + "learning_rate": 9.016730152996302e-06, + "loss": 1.1441, + "step": 15055 + }, + { + "epoch": 0.85, + "grad_norm": 13.488179358832632, + "learning_rate": 9.015753885044534e-06, + "loss": 1.1546, + "step": 15060 + }, + { + "epoch": 0.85, + "grad_norm": 5.268097923372006, + "learning_rate": 9.014777185582289e-06, + "loss": 1.0512, + "step": 15065 + }, + { + "epoch": 0.85, + "grad_norm": 10.656687178551373, + "learning_rate": 9.013800054714515e-06, + "loss": 1.1458, + "step": 15070 + }, + { + "epoch": 0.85, + "grad_norm": 7.970250634168265, + "learning_rate": 9.012822492546211e-06, + "loss": 1.1421, + "step": 15075 + }, + { + "epoch": 0.85, + "grad_norm": 7.541114823315593, + "learning_rate": 9.011844499182423e-06, + "loss": 1.1049, + "step": 15080 + }, + { + "epoch": 0.85, + "grad_norm": 21.162148932902298, + "learning_rate": 9.010866074728236e-06, + "loss": 1.1292, + "step": 15085 + }, + { + "epoch": 0.85, + "grad_norm": 13.487928112419654, + "learning_rate": 9.009887219288786e-06, + "loss": 1.1189, + "step": 15090 + }, + { + "epoch": 0.85, + "grad_norm": 10.42430035428869, + "learning_rate": 9.008907932969261e-06, + "loss": 1.1098, + "step": 15095 + }, + { + "epoch": 0.85, + "grad_norm": 10.338598817675365, + "learning_rate": 9.007928215874884e-06, + "loss": 1.1538, + "step": 15100 + }, + { + "epoch": 0.85, + "grad_norm": 17.13543336542322, + "learning_rate": 9.00694806811093e-06, + "loss": 1.1179, + "step": 15105 + }, + { + "epoch": 0.85, + "grad_norm": 17.009629898302627, + "learning_rate": 9.005967489782725e-06, + "loss": 1.0913, + "step": 15110 + }, + { + "epoch": 0.85, + "grad_norm": 16.163157815657705, + "learning_rate": 9.004986480995634e-06, + "loss": 1.0746, + "step": 15115 + }, + { + "epoch": 0.85, + "grad_norm": 8.398435618814801, + "learning_rate": 9.004005041855068e-06, + "loss": 1.1442, + "step": 15120 + }, + { + "epoch": 0.85, + "grad_norm": 8.309657495107698, + "learning_rate": 9.003023172466489e-06, + "loss": 1.1605, + "step": 15125 + }, + { + "epoch": 0.85, + "grad_norm": 9.444696379623364, + "learning_rate": 9.002040872935405e-06, + "loss": 1.1448, + "step": 15130 + }, + { + "epoch": 0.85, + "grad_norm": 11.951662721228207, + "learning_rate": 9.001058143367368e-06, + "loss": 1.1309, + "step": 15135 + }, + { + "epoch": 0.85, + "grad_norm": 12.934353463404143, + "learning_rate": 9.000074983867973e-06, + "loss": 1.095, + "step": 15140 + }, + { + "epoch": 0.85, + "grad_norm": 7.996296854973321, + "learning_rate": 8.999091394542868e-06, + "loss": 1.1453, + "step": 15145 + }, + { + "epoch": 0.85, + "grad_norm": 6.555201313261598, + "learning_rate": 8.998107375497744e-06, + "loss": 1.1219, + "step": 15150 + }, + { + "epoch": 0.85, + "grad_norm": 5.4527144214470615, + "learning_rate": 8.997122926838337e-06, + "loss": 1.1429, + "step": 15155 + }, + { + "epoch": 0.85, + "grad_norm": 6.053857604890467, + "learning_rate": 8.996138048670431e-06, + "loss": 1.1013, + "step": 15160 + }, + { + "epoch": 0.85, + "grad_norm": 6.584512239337207, + "learning_rate": 8.995152741099854e-06, + "loss": 1.0783, + "step": 15165 + }, + { + "epoch": 0.85, + "grad_norm": 5.359040859756247, + "learning_rate": 8.994167004232486e-06, + "loss": 1.158, + "step": 15170 + }, + { + "epoch": 0.86, + "grad_norm": 17.558674170066805, + "learning_rate": 8.993180838174242e-06, + "loss": 1.1502, + "step": 15175 + }, + { + "epoch": 0.86, + "grad_norm": 9.148872725818038, + "learning_rate": 8.992194243031097e-06, + "loss": 1.147, + "step": 15180 + }, + { + "epoch": 0.86, + "grad_norm": 15.65830754136846, + "learning_rate": 8.99120721890906e-06, + "loss": 1.1536, + "step": 15185 + }, + { + "epoch": 0.86, + "grad_norm": 14.295536196796188, + "learning_rate": 8.990219765914194e-06, + "loss": 1.1444, + "step": 15190 + }, + { + "epoch": 0.86, + "grad_norm": 19.573950215677538, + "learning_rate": 8.989231884152603e-06, + "loss": 1.0894, + "step": 15195 + }, + { + "epoch": 0.86, + "grad_norm": 12.424215213013229, + "learning_rate": 8.988243573730439e-06, + "loss": 1.133, + "step": 15200 + }, + { + "epoch": 0.86, + "grad_norm": 40.37914304337508, + "learning_rate": 8.987254834753904e-06, + "loss": 1.1227, + "step": 15205 + }, + { + "epoch": 0.86, + "grad_norm": 27.373384526939393, + "learning_rate": 8.986265667329239e-06, + "loss": 1.1093, + "step": 15210 + }, + { + "epoch": 0.86, + "grad_norm": 10.569928707365623, + "learning_rate": 8.985276071562737e-06, + "loss": 1.1085, + "step": 15215 + }, + { + "epoch": 0.86, + "grad_norm": 12.893884643105887, + "learning_rate": 8.984286047560729e-06, + "loss": 1.0573, + "step": 15220 + }, + { + "epoch": 0.86, + "grad_norm": 14.084049740281538, + "learning_rate": 8.983295595429604e-06, + "loss": 1.1941, + "step": 15225 + }, + { + "epoch": 0.86, + "grad_norm": 9.697120379518738, + "learning_rate": 8.982304715275788e-06, + "loss": 1.1134, + "step": 15230 + }, + { + "epoch": 0.86, + "grad_norm": 10.153596659350104, + "learning_rate": 8.981313407205752e-06, + "loss": 1.1232, + "step": 15235 + }, + { + "epoch": 0.86, + "grad_norm": 9.339287903534414, + "learning_rate": 8.980321671326022e-06, + "loss": 1.1423, + "step": 15240 + }, + { + "epoch": 0.86, + "grad_norm": 7.220683732804061, + "learning_rate": 8.979329507743161e-06, + "loss": 1.0984, + "step": 15245 + }, + { + "epoch": 0.86, + "grad_norm": 5.758846506839011, + "learning_rate": 8.978336916563783e-06, + "loss": 1.1382, + "step": 15250 + }, + { + "epoch": 0.86, + "grad_norm": 15.10878136536204, + "learning_rate": 8.977343897894545e-06, + "loss": 1.1074, + "step": 15255 + }, + { + "epoch": 0.86, + "grad_norm": 9.904917585744883, + "learning_rate": 8.976350451842152e-06, + "loss": 1.1039, + "step": 15260 + }, + { + "epoch": 0.86, + "grad_norm": 8.953932824290469, + "learning_rate": 8.975356578513354e-06, + "loss": 1.1082, + "step": 15265 + }, + { + "epoch": 0.86, + "grad_norm": 6.41885249619772, + "learning_rate": 8.974362278014946e-06, + "loss": 1.1211, + "step": 15270 + }, + { + "epoch": 0.86, + "grad_norm": 6.8002738221505314, + "learning_rate": 8.973367550453774e-06, + "loss": 1.1127, + "step": 15275 + }, + { + "epoch": 0.86, + "grad_norm": 7.206878051734687, + "learning_rate": 8.972372395936719e-06, + "loss": 1.1134, + "step": 15280 + }, + { + "epoch": 0.86, + "grad_norm": 9.087959244254456, + "learning_rate": 8.97137681457072e-06, + "loss": 1.0968, + "step": 15285 + }, + { + "epoch": 0.86, + "grad_norm": 6.415553792258701, + "learning_rate": 8.970380806462758e-06, + "loss": 1.1231, + "step": 15290 + }, + { + "epoch": 0.86, + "grad_norm": 8.118829201084383, + "learning_rate": 8.969384371719852e-06, + "loss": 1.1112, + "step": 15295 + }, + { + "epoch": 0.86, + "grad_norm": 5.928779573226809, + "learning_rate": 8.96838751044908e-06, + "loss": 1.0993, + "step": 15300 + }, + { + "epoch": 0.86, + "grad_norm": 21.279270627251556, + "learning_rate": 8.967390222757553e-06, + "loss": 1.1204, + "step": 15305 + }, + { + "epoch": 0.86, + "grad_norm": 18.733553955905094, + "learning_rate": 8.96639250875244e-06, + "loss": 1.1068, + "step": 15310 + }, + { + "epoch": 0.86, + "grad_norm": 23.863312827531992, + "learning_rate": 8.965394368540945e-06, + "loss": 1.2045, + "step": 15315 + }, + { + "epoch": 0.86, + "grad_norm": 13.54916337636012, + "learning_rate": 8.964395802230325e-06, + "loss": 1.1303, + "step": 15320 + }, + { + "epoch": 0.86, + "grad_norm": 5.200584650921166, + "learning_rate": 8.96339680992788e-06, + "loss": 1.0871, + "step": 15325 + }, + { + "epoch": 0.86, + "grad_norm": 8.0581056774615, + "learning_rate": 8.962397391740957e-06, + "loss": 1.1269, + "step": 15330 + }, + { + "epoch": 0.86, + "grad_norm": 19.354208687587455, + "learning_rate": 8.961397547776946e-06, + "loss": 1.0688, + "step": 15335 + }, + { + "epoch": 0.86, + "grad_norm": 10.372140860900933, + "learning_rate": 8.960397278143285e-06, + "loss": 1.0811, + "step": 15340 + }, + { + "epoch": 0.86, + "grad_norm": 5.8651829016830215, + "learning_rate": 8.959396582947458e-06, + "loss": 1.1043, + "step": 15345 + }, + { + "epoch": 0.86, + "grad_norm": 30.30903429132215, + "learning_rate": 8.958395462296995e-06, + "loss": 1.1203, + "step": 15350 + }, + { + "epoch": 0.87, + "grad_norm": 23.306133502411026, + "learning_rate": 8.957393916299469e-06, + "loss": 1.0805, + "step": 15355 + }, + { + "epoch": 0.87, + "grad_norm": 11.661451079673935, + "learning_rate": 8.956391945062501e-06, + "loss": 1.1396, + "step": 15360 + }, + { + "epoch": 0.87, + "grad_norm": 19.86938027595896, + "learning_rate": 8.955389548693758e-06, + "loss": 1.1358, + "step": 15365 + }, + { + "epoch": 0.87, + "grad_norm": 28.80689046818295, + "learning_rate": 8.954386727300953e-06, + "loss": 1.1024, + "step": 15370 + }, + { + "epoch": 0.87, + "grad_norm": 29.535898479193374, + "learning_rate": 8.95338348099184e-06, + "loss": 1.17, + "step": 15375 + }, + { + "epoch": 0.87, + "grad_norm": 16.511670803697672, + "learning_rate": 8.952379809874225e-06, + "loss": 1.1174, + "step": 15380 + }, + { + "epoch": 0.87, + "grad_norm": 13.03831406520542, + "learning_rate": 8.951375714055956e-06, + "loss": 1.1425, + "step": 15385 + }, + { + "epoch": 0.87, + "grad_norm": 11.170125157229293, + "learning_rate": 8.950371193644929e-06, + "loss": 1.0766, + "step": 15390 + }, + { + "epoch": 0.87, + "grad_norm": 11.275570143929595, + "learning_rate": 8.949366248749084e-06, + "loss": 1.1446, + "step": 15395 + }, + { + "epoch": 0.87, + "grad_norm": 9.146976157540005, + "learning_rate": 8.9483608794764e-06, + "loss": 1.1326, + "step": 15400 + }, + { + "epoch": 0.87, + "grad_norm": 13.842079211086604, + "learning_rate": 8.947355085934918e-06, + "loss": 1.1216, + "step": 15405 + }, + { + "epoch": 0.87, + "grad_norm": 18.185294680167168, + "learning_rate": 8.946348868232712e-06, + "loss": 1.1124, + "step": 15410 + }, + { + "epoch": 0.87, + "grad_norm": 5.770946193843225, + "learning_rate": 8.945342226477903e-06, + "loss": 1.0472, + "step": 15415 + }, + { + "epoch": 0.87, + "grad_norm": 7.094968685801851, + "learning_rate": 8.944335160778658e-06, + "loss": 1.1096, + "step": 15420 + }, + { + "epoch": 0.87, + "grad_norm": 36.008993846535525, + "learning_rate": 8.943327671243194e-06, + "loss": 1.1069, + "step": 15425 + }, + { + "epoch": 0.87, + "grad_norm": 8.102908445444992, + "learning_rate": 8.942319757979768e-06, + "loss": 1.0827, + "step": 15430 + }, + { + "epoch": 0.87, + "grad_norm": 22.180074968865192, + "learning_rate": 8.941311421096686e-06, + "loss": 1.1042, + "step": 15435 + }, + { + "epoch": 0.87, + "grad_norm": 25.342355504685877, + "learning_rate": 8.940302660702296e-06, + "loss": 1.1036, + "step": 15440 + }, + { + "epoch": 0.87, + "grad_norm": 12.207078580665375, + "learning_rate": 8.939293476904996e-06, + "loss": 1.1118, + "step": 15445 + }, + { + "epoch": 0.87, + "grad_norm": 8.961212029304685, + "learning_rate": 8.938283869813226e-06, + "loss": 1.0724, + "step": 15450 + }, + { + "epoch": 0.87, + "grad_norm": 22.213912907198274, + "learning_rate": 8.937273839535474e-06, + "loss": 1.1287, + "step": 15455 + }, + { + "epoch": 0.87, + "grad_norm": 18.205395591758585, + "learning_rate": 8.936263386180271e-06, + "loss": 1.105, + "step": 15460 + }, + { + "epoch": 0.87, + "grad_norm": 6.466101834459902, + "learning_rate": 8.935252509856193e-06, + "loss": 1.1461, + "step": 15465 + }, + { + "epoch": 0.87, + "grad_norm": 6.685012039390134, + "learning_rate": 8.934241210671868e-06, + "loss": 1.1156, + "step": 15470 + }, + { + "epoch": 0.87, + "grad_norm": 11.445347741715205, + "learning_rate": 8.933229488735961e-06, + "loss": 1.1005, + "step": 15475 + }, + { + "epoch": 0.87, + "grad_norm": 7.377434775353599, + "learning_rate": 8.932217344157188e-06, + "loss": 1.1346, + "step": 15480 + }, + { + "epoch": 0.87, + "grad_norm": 13.217632836077673, + "learning_rate": 8.931204777044305e-06, + "loss": 1.1278, + "step": 15485 + }, + { + "epoch": 0.87, + "grad_norm": 13.896865401653006, + "learning_rate": 8.93019178750612e-06, + "loss": 1.1265, + "step": 15490 + }, + { + "epoch": 0.87, + "grad_norm": 10.429060194190058, + "learning_rate": 8.929178375651481e-06, + "loss": 1.084, + "step": 15495 + }, + { + "epoch": 0.87, + "grad_norm": 14.56996667332036, + "learning_rate": 8.928164541589287e-06, + "loss": 1.1089, + "step": 15500 + }, + { + "epoch": 0.87, + "grad_norm": 9.494176421990351, + "learning_rate": 8.927150285428475e-06, + "loss": 1.1051, + "step": 15505 + }, + { + "epoch": 0.87, + "grad_norm": 6.920264190839941, + "learning_rate": 8.926135607278035e-06, + "loss": 1.1011, + "step": 15510 + }, + { + "epoch": 0.87, + "grad_norm": 10.810185229857796, + "learning_rate": 8.925120507246993e-06, + "loss": 1.1335, + "step": 15515 + }, + { + "epoch": 0.87, + "grad_norm": 10.242383280864297, + "learning_rate": 8.924104985444434e-06, + "loss": 1.0933, + "step": 15520 + }, + { + "epoch": 0.87, + "grad_norm": 5.09670297381289, + "learning_rate": 8.923089041979474e-06, + "loss": 1.09, + "step": 15525 + }, + { + "epoch": 0.88, + "grad_norm": 6.141979374923959, + "learning_rate": 8.922072676961282e-06, + "loss": 1.1145, + "step": 15530 + }, + { + "epoch": 0.88, + "grad_norm": 6.114749823235573, + "learning_rate": 8.92105589049907e-06, + "loss": 1.1569, + "step": 15535 + }, + { + "epoch": 0.88, + "grad_norm": 5.534260385387185, + "learning_rate": 8.9200386827021e-06, + "loss": 1.1126, + "step": 15540 + }, + { + "epoch": 0.88, + "grad_norm": 12.362250458323114, + "learning_rate": 8.919021053679672e-06, + "loss": 1.1224, + "step": 15545 + }, + { + "epoch": 0.88, + "grad_norm": 11.309395112552444, + "learning_rate": 8.918003003541136e-06, + "loss": 1.0922, + "step": 15550 + }, + { + "epoch": 0.88, + "grad_norm": 22.45995518545166, + "learning_rate": 8.916984532395887e-06, + "loss": 1.1151, + "step": 15555 + }, + { + "epoch": 0.88, + "grad_norm": 5.799846626012507, + "learning_rate": 8.91596564035336e-06, + "loss": 1.1321, + "step": 15560 + }, + { + "epoch": 0.88, + "grad_norm": 17.819301489285724, + "learning_rate": 8.914946327523047e-06, + "loss": 1.1421, + "step": 15565 + }, + { + "epoch": 0.88, + "grad_norm": 10.37561174852505, + "learning_rate": 8.91392659401447e-06, + "loss": 1.1423, + "step": 15570 + }, + { + "epoch": 0.88, + "grad_norm": 7.8702545885999955, + "learning_rate": 8.912906439937207e-06, + "loss": 1.1309, + "step": 15575 + }, + { + "epoch": 0.88, + "grad_norm": 7.042355847210189, + "learning_rate": 8.91188586540088e-06, + "loss": 1.0924, + "step": 15580 + }, + { + "epoch": 0.88, + "grad_norm": 10.682587939526758, + "learning_rate": 8.91086487051515e-06, + "loss": 1.1059, + "step": 15585 + }, + { + "epoch": 0.88, + "grad_norm": 5.871931530926953, + "learning_rate": 8.90984345538973e-06, + "loss": 1.1232, + "step": 15590 + }, + { + "epoch": 0.88, + "grad_norm": 8.112442975979869, + "learning_rate": 8.908821620134377e-06, + "loss": 1.0661, + "step": 15595 + }, + { + "epoch": 0.88, + "grad_norm": 8.912873380119569, + "learning_rate": 8.907799364858888e-06, + "loss": 1.1066, + "step": 15600 + }, + { + "epoch": 0.88, + "grad_norm": 9.72976963135992, + "learning_rate": 8.906776689673113e-06, + "loss": 1.0747, + "step": 15605 + }, + { + "epoch": 0.88, + "grad_norm": 6.297488178624966, + "learning_rate": 8.905753594686938e-06, + "loss": 1.0851, + "step": 15610 + }, + { + "epoch": 0.88, + "grad_norm": 11.769513972727212, + "learning_rate": 8.904730080010304e-06, + "loss": 1.155, + "step": 15615 + }, + { + "epoch": 0.88, + "grad_norm": 8.209314719264187, + "learning_rate": 8.90370614575319e-06, + "loss": 1.1069, + "step": 15620 + }, + { + "epoch": 0.88, + "grad_norm": 5.869716104495888, + "learning_rate": 8.902681792025621e-06, + "loss": 1.0908, + "step": 15625 + }, + { + "epoch": 0.88, + "grad_norm": 17.494067016564845, + "learning_rate": 8.90165701893767e-06, + "loss": 1.1306, + "step": 15630 + }, + { + "epoch": 0.88, + "grad_norm": 21.616906490157792, + "learning_rate": 8.900631826599454e-06, + "loss": 1.1103, + "step": 15635 + }, + { + "epoch": 0.88, + "grad_norm": 7.4077577363818, + "learning_rate": 8.899606215121132e-06, + "loss": 1.1245, + "step": 15640 + }, + { + "epoch": 0.88, + "grad_norm": 13.634824470674596, + "learning_rate": 8.898580184612913e-06, + "loss": 1.1558, + "step": 15645 + }, + { + "epoch": 0.88, + "grad_norm": 12.699519493918952, + "learning_rate": 8.897553735185048e-06, + "loss": 1.1039, + "step": 15650 + }, + { + "epoch": 0.88, + "grad_norm": 10.862963576619954, + "learning_rate": 8.89652686694783e-06, + "loss": 1.0783, + "step": 15655 + }, + { + "epoch": 0.88, + "grad_norm": 22.144900784416958, + "learning_rate": 8.895499580011606e-06, + "loss": 1.1025, + "step": 15660 + }, + { + "epoch": 0.88, + "grad_norm": 23.832881740551286, + "learning_rate": 8.894471874486759e-06, + "loss": 1.0998, + "step": 15665 + }, + { + "epoch": 0.88, + "grad_norm": 8.5828596885912, + "learning_rate": 8.893443750483722e-06, + "loss": 1.1212, + "step": 15670 + }, + { + "epoch": 0.88, + "grad_norm": 38.823298467421445, + "learning_rate": 8.89241520811297e-06, + "loss": 1.1026, + "step": 15675 + }, + { + "epoch": 0.88, + "grad_norm": 43.279108304643266, + "learning_rate": 8.891386247485027e-06, + "loss": 1.1717, + "step": 15680 + }, + { + "epoch": 0.88, + "grad_norm": 23.204916311755625, + "learning_rate": 8.890356868710457e-06, + "loss": 1.16, + "step": 15685 + }, + { + "epoch": 0.88, + "grad_norm": 8.237581085215133, + "learning_rate": 8.889327071899872e-06, + "loss": 1.1444, + "step": 15690 + }, + { + "epoch": 0.88, + "grad_norm": 16.03368700824802, + "learning_rate": 8.888296857163927e-06, + "loss": 1.1257, + "step": 15695 + }, + { + "epoch": 0.88, + "grad_norm": 48.306439237789114, + "learning_rate": 8.887266224613326e-06, + "loss": 1.11, + "step": 15700 + }, + { + "epoch": 0.88, + "grad_norm": 17.132291292277202, + "learning_rate": 8.886235174358812e-06, + "loss": 1.1581, + "step": 15705 + }, + { + "epoch": 0.89, + "grad_norm": 57.65949417291116, + "learning_rate": 8.885203706511178e-06, + "loss": 1.134, + "step": 15710 + }, + { + "epoch": 0.89, + "grad_norm": 19.569696607607145, + "learning_rate": 8.88417182118126e-06, + "loss": 1.0934, + "step": 15715 + }, + { + "epoch": 0.89, + "grad_norm": 27.037448824784263, + "learning_rate": 8.883139518479936e-06, + "loss": 1.1227, + "step": 15720 + }, + { + "epoch": 0.89, + "grad_norm": 35.81651807981647, + "learning_rate": 8.882106798518135e-06, + "loss": 1.1356, + "step": 15725 + }, + { + "epoch": 0.89, + "grad_norm": 15.657523340271581, + "learning_rate": 8.881073661406822e-06, + "loss": 1.1112, + "step": 15730 + }, + { + "epoch": 0.89, + "grad_norm": 16.151989068657166, + "learning_rate": 8.88004010725702e-06, + "loss": 1.1329, + "step": 15735 + }, + { + "epoch": 0.89, + "grad_norm": 9.62975572551237, + "learning_rate": 8.879006136179781e-06, + "loss": 1.1044, + "step": 15740 + }, + { + "epoch": 0.89, + "grad_norm": 6.057841844382403, + "learning_rate": 8.877971748286215e-06, + "loss": 1.0976, + "step": 15745 + }, + { + "epoch": 0.89, + "grad_norm": 10.280863510880357, + "learning_rate": 8.87693694368747e-06, + "loss": 1.1132, + "step": 15750 + }, + { + "epoch": 0.89, + "grad_norm": 12.365025414091182, + "learning_rate": 8.87590172249474e-06, + "loss": 1.1198, + "step": 15755 + }, + { + "epoch": 0.89, + "grad_norm": 35.9105215347335, + "learning_rate": 8.874866084819264e-06, + "loss": 1.1248, + "step": 15760 + }, + { + "epoch": 0.89, + "grad_norm": 13.372504021182381, + "learning_rate": 8.873830030772326e-06, + "loss": 1.2148, + "step": 15765 + }, + { + "epoch": 0.89, + "grad_norm": 21.25397187021847, + "learning_rate": 8.872793560465254e-06, + "loss": 1.0871, + "step": 15770 + }, + { + "epoch": 0.89, + "grad_norm": 17.514453140096677, + "learning_rate": 8.871756674009424e-06, + "loss": 1.0848, + "step": 15775 + }, + { + "epoch": 0.89, + "grad_norm": 23.66195328791996, + "learning_rate": 8.87071937151625e-06, + "loss": 1.1194, + "step": 15780 + }, + { + "epoch": 0.89, + "grad_norm": 26.19155099835197, + "learning_rate": 8.869681653097197e-06, + "loss": 1.1232, + "step": 15785 + }, + { + "epoch": 0.89, + "grad_norm": 11.85542025290746, + "learning_rate": 8.868643518863773e-06, + "loss": 1.1544, + "step": 15790 + }, + { + "epoch": 0.89, + "grad_norm": 16.51186349299305, + "learning_rate": 8.86760496892753e-06, + "loss": 1.1807, + "step": 15795 + }, + { + "epoch": 0.89, + "grad_norm": 19.91108235467044, + "learning_rate": 8.866566003400063e-06, + "loss": 1.1473, + "step": 15800 + }, + { + "epoch": 0.89, + "grad_norm": 10.732440182991025, + "learning_rate": 8.865526622393014e-06, + "loss": 1.1547, + "step": 15805 + }, + { + "epoch": 0.89, + "grad_norm": 15.109443017549859, + "learning_rate": 8.864486826018071e-06, + "loss": 1.0966, + "step": 15810 + }, + { + "epoch": 0.89, + "grad_norm": 14.255265210407417, + "learning_rate": 8.86344661438696e-06, + "loss": 1.1086, + "step": 15815 + }, + { + "epoch": 0.89, + "grad_norm": 7.0010913863033375, + "learning_rate": 8.862405987611462e-06, + "loss": 1.1323, + "step": 15820 + }, + { + "epoch": 0.89, + "grad_norm": 12.027189162578985, + "learning_rate": 8.861364945803395e-06, + "loss": 1.1185, + "step": 15825 + }, + { + "epoch": 0.89, + "grad_norm": 5.516654026014469, + "learning_rate": 8.860323489074623e-06, + "loss": 1.1079, + "step": 15830 + }, + { + "epoch": 0.89, + "grad_norm": 7.650182475404735, + "learning_rate": 8.859281617537054e-06, + "loss": 1.1647, + "step": 15835 + }, + { + "epoch": 0.89, + "grad_norm": 10.986235347722728, + "learning_rate": 8.858239331302644e-06, + "loss": 1.1173, + "step": 15840 + }, + { + "epoch": 0.89, + "grad_norm": 7.23175185845024, + "learning_rate": 8.857196630483389e-06, + "loss": 1.1243, + "step": 15845 + }, + { + "epoch": 0.89, + "grad_norm": 12.945782923891434, + "learning_rate": 8.856153515191335e-06, + "loss": 1.1062, + "step": 15850 + }, + { + "epoch": 0.89, + "grad_norm": 5.685618784108, + "learning_rate": 8.855109985538566e-06, + "loss": 1.1132, + "step": 15855 + }, + { + "epoch": 0.89, + "grad_norm": 25.11690911364613, + "learning_rate": 8.854066041637213e-06, + "loss": 1.1178, + "step": 15860 + }, + { + "epoch": 0.89, + "grad_norm": 8.213614309771685, + "learning_rate": 8.853021683599459e-06, + "loss": 1.1257, + "step": 15865 + }, + { + "epoch": 0.89, + "grad_norm": 6.781984162951699, + "learning_rate": 8.851976911537517e-06, + "loss": 1.128, + "step": 15870 + }, + { + "epoch": 0.89, + "grad_norm": 18.240618915281846, + "learning_rate": 8.850931725563656e-06, + "loss": 1.1809, + "step": 15875 + }, + { + "epoch": 0.89, + "grad_norm": 17.52773367410937, + "learning_rate": 8.849886125790186e-06, + "loss": 1.1418, + "step": 15880 + }, + { + "epoch": 0.9, + "grad_norm": 6.509552966492867, + "learning_rate": 8.848840112329462e-06, + "loss": 1.0787, + "step": 15885 + }, + { + "epoch": 0.9, + "grad_norm": 6.612424373792031, + "learning_rate": 8.847793685293882e-06, + "loss": 1.1333, + "step": 15890 + }, + { + "epoch": 0.9, + "grad_norm": 7.985888668119721, + "learning_rate": 8.846746844795888e-06, + "loss": 1.0613, + "step": 15895 + }, + { + "epoch": 0.9, + "grad_norm": 18.08355129876919, + "learning_rate": 8.845699590947968e-06, + "loss": 1.0849, + "step": 15900 + }, + { + "epoch": 0.9, + "grad_norm": 9.862136138661226, + "learning_rate": 8.844651923862655e-06, + "loss": 1.1323, + "step": 15905 + }, + { + "epoch": 0.9, + "grad_norm": 42.95501916755341, + "learning_rate": 8.843603843652526e-06, + "loss": 1.1118, + "step": 15910 + }, + { + "epoch": 0.9, + "grad_norm": 17.72326961263032, + "learning_rate": 8.8425553504302e-06, + "loss": 1.1186, + "step": 15915 + }, + { + "epoch": 0.9, + "grad_norm": 17.028465241187952, + "learning_rate": 8.841506444308344e-06, + "loss": 1.1104, + "step": 15920 + }, + { + "epoch": 0.9, + "grad_norm": 11.453218113393806, + "learning_rate": 8.840457125399664e-06, + "loss": 1.0451, + "step": 15925 + }, + { + "epoch": 0.9, + "grad_norm": 11.375243031040855, + "learning_rate": 8.83940739381692e-06, + "loss": 1.1673, + "step": 15930 + }, + { + "epoch": 0.9, + "grad_norm": 9.496741585782315, + "learning_rate": 8.838357249672904e-06, + "loss": 1.1325, + "step": 15935 + }, + { + "epoch": 0.9, + "grad_norm": 10.022342603648964, + "learning_rate": 8.837306693080463e-06, + "loss": 1.1297, + "step": 15940 + }, + { + "epoch": 0.9, + "grad_norm": 9.239180693152152, + "learning_rate": 8.836255724152482e-06, + "loss": 1.1209, + "step": 15945 + }, + { + "epoch": 0.9, + "grad_norm": 5.476427111269794, + "learning_rate": 8.835204343001895e-06, + "loss": 1.047, + "step": 15950 + }, + { + "epoch": 0.9, + "grad_norm": 11.23621621332625, + "learning_rate": 8.834152549741672e-06, + "loss": 1.0892, + "step": 15955 + }, + { + "epoch": 0.9, + "grad_norm": 7.9072491904722595, + "learning_rate": 8.833100344484839e-06, + "loss": 1.1609, + "step": 15960 + }, + { + "epoch": 0.9, + "grad_norm": 17.171899439262887, + "learning_rate": 8.832047727344456e-06, + "loss": 1.1218, + "step": 15965 + }, + { + "epoch": 0.9, + "grad_norm": 8.660014783667418, + "learning_rate": 8.830994698433633e-06, + "loss": 1.0618, + "step": 15970 + }, + { + "epoch": 0.9, + "grad_norm": 6.965348345133702, + "learning_rate": 8.82994125786552e-06, + "loss": 1.1037, + "step": 15975 + }, + { + "epoch": 0.9, + "grad_norm": 34.27148989398509, + "learning_rate": 8.828887405753319e-06, + "loss": 1.1039, + "step": 15980 + }, + { + "epoch": 0.9, + "grad_norm": 16.381648603300448, + "learning_rate": 8.827833142210266e-06, + "loss": 1.1186, + "step": 15985 + }, + { + "epoch": 0.9, + "grad_norm": 12.703178732356198, + "learning_rate": 8.826778467349649e-06, + "loss": 1.1484, + "step": 15990 + }, + { + "epoch": 0.9, + "grad_norm": 17.094715932962195, + "learning_rate": 8.825723381284797e-06, + "loss": 1.0973, + "step": 15995 + }, + { + "epoch": 0.9, + "grad_norm": 12.756907025277915, + "learning_rate": 8.824667884129084e-06, + "loss": 1.0878, + "step": 16000 + }, + { + "epoch": 0.9, + "grad_norm": 21.993517813012506, + "learning_rate": 8.823611975995926e-06, + "loss": 1.1696, + "step": 16005 + }, + { + "epoch": 0.9, + "grad_norm": 10.46932570998332, + "learning_rate": 8.822555656998787e-06, + "loss": 1.0832, + "step": 16010 + }, + { + "epoch": 0.9, + "grad_norm": 6.043052096345904, + "learning_rate": 8.821498927251173e-06, + "loss": 1.1203, + "step": 16015 + }, + { + "epoch": 0.9, + "grad_norm": 8.103538532001767, + "learning_rate": 8.820441786866633e-06, + "loss": 1.0753, + "step": 16020 + }, + { + "epoch": 0.9, + "grad_norm": 12.18136565211477, + "learning_rate": 8.819384235958763e-06, + "loss": 1.1236, + "step": 16025 + }, + { + "epoch": 0.9, + "grad_norm": 15.248905393904629, + "learning_rate": 8.8183262746412e-06, + "loss": 1.1073, + "step": 16030 + }, + { + "epoch": 0.9, + "grad_norm": 8.340618674349047, + "learning_rate": 8.817267903027626e-06, + "loss": 1.1415, + "step": 16035 + }, + { + "epoch": 0.9, + "grad_norm": 11.372859072118066, + "learning_rate": 8.81620912123177e-06, + "loss": 1.1508, + "step": 16040 + }, + { + "epoch": 0.9, + "grad_norm": 13.326172721309334, + "learning_rate": 8.815149929367401e-06, + "loss": 1.1244, + "step": 16045 + }, + { + "epoch": 0.9, + "grad_norm": 5.236125571217685, + "learning_rate": 8.814090327548337e-06, + "loss": 1.1394, + "step": 16050 + }, + { + "epoch": 0.9, + "grad_norm": 7.296148214222332, + "learning_rate": 8.813030315888433e-06, + "loss": 1.1009, + "step": 16055 + }, + { + "epoch": 0.9, + "grad_norm": 6.182229401804242, + "learning_rate": 8.811969894501592e-06, + "loss": 1.0911, + "step": 16060 + }, + { + "epoch": 0.91, + "grad_norm": 6.998399376154743, + "learning_rate": 8.810909063501764e-06, + "loss": 1.0595, + "step": 16065 + }, + { + "epoch": 0.91, + "grad_norm": 23.497996454399594, + "learning_rate": 8.809847823002938e-06, + "loss": 1.1009, + "step": 16070 + }, + { + "epoch": 0.91, + "grad_norm": 13.322854357171817, + "learning_rate": 8.808786173119148e-06, + "loss": 1.0992, + "step": 16075 + }, + { + "epoch": 0.91, + "grad_norm": 11.196916653237418, + "learning_rate": 8.807724113964476e-06, + "loss": 1.0928, + "step": 16080 + }, + { + "epoch": 0.91, + "grad_norm": 7.100637163787419, + "learning_rate": 8.806661645653042e-06, + "loss": 1.0811, + "step": 16085 + }, + { + "epoch": 0.91, + "grad_norm": 6.223392754663758, + "learning_rate": 8.805598768299016e-06, + "loss": 1.0792, + "step": 16090 + }, + { + "epoch": 0.91, + "grad_norm": 6.232467412039565, + "learning_rate": 8.804535482016603e-06, + "loss": 1.0971, + "step": 16095 + }, + { + "epoch": 0.91, + "grad_norm": 9.587657103940197, + "learning_rate": 8.803471786920066e-06, + "loss": 1.0939, + "step": 16100 + }, + { + "epoch": 0.91, + "grad_norm": 11.405831133128997, + "learning_rate": 8.802407683123696e-06, + "loss": 1.0998, + "step": 16105 + }, + { + "epoch": 0.91, + "grad_norm": 9.139479921017692, + "learning_rate": 8.801343170741841e-06, + "loss": 1.1771, + "step": 16110 + }, + { + "epoch": 0.91, + "grad_norm": 12.091588251831105, + "learning_rate": 8.800278249888888e-06, + "loss": 1.1341, + "step": 16115 + }, + { + "epoch": 0.91, + "grad_norm": 15.732088914984313, + "learning_rate": 8.79921292067926e-06, + "loss": 1.1608, + "step": 16120 + }, + { + "epoch": 0.91, + "grad_norm": 22.358520755565852, + "learning_rate": 8.798147183227441e-06, + "loss": 1.0895, + "step": 16125 + }, + { + "epoch": 0.91, + "grad_norm": 11.021681756582822, + "learning_rate": 8.797081037647944e-06, + "loss": 1.1165, + "step": 16130 + }, + { + "epoch": 0.91, + "grad_norm": 5.9118619006907, + "learning_rate": 8.796014484055328e-06, + "loss": 1.0875, + "step": 16135 + }, + { + "epoch": 0.91, + "grad_norm": 16.21697134216012, + "learning_rate": 8.794947522564205e-06, + "loss": 1.1072, + "step": 16140 + }, + { + "epoch": 0.91, + "grad_norm": 7.7939752733004815, + "learning_rate": 8.793880153289224e-06, + "loss": 1.0968, + "step": 16145 + }, + { + "epoch": 0.91, + "grad_norm": 8.830503959797658, + "learning_rate": 8.792812376345074e-06, + "loss": 1.1274, + "step": 16150 + }, + { + "epoch": 0.91, + "grad_norm": 9.685047451008934, + "learning_rate": 8.7917441918465e-06, + "loss": 1.0992, + "step": 16155 + }, + { + "epoch": 0.91, + "grad_norm": 6.420774631204822, + "learning_rate": 8.790675599908271e-06, + "loss": 1.1241, + "step": 16160 + }, + { + "epoch": 0.91, + "grad_norm": 12.473468488099863, + "learning_rate": 8.789606600645225e-06, + "loss": 1.1349, + "step": 16165 + }, + { + "epoch": 0.91, + "grad_norm": 5.817799435816081, + "learning_rate": 8.788537194172224e-06, + "loss": 1.1052, + "step": 16170 + }, + { + "epoch": 0.91, + "grad_norm": 21.580602261212043, + "learning_rate": 8.78746738060418e-06, + "loss": 1.0905, + "step": 16175 + }, + { + "epoch": 0.91, + "grad_norm": 9.060631817923278, + "learning_rate": 8.78639716005605e-06, + "loss": 1.1561, + "step": 16180 + }, + { + "epoch": 0.91, + "grad_norm": 15.815627525466809, + "learning_rate": 8.785326532642837e-06, + "loss": 1.1507, + "step": 16185 + }, + { + "epoch": 0.91, + "grad_norm": 6.026518156587827, + "learning_rate": 8.784255498479579e-06, + "loss": 1.157, + "step": 16190 + }, + { + "epoch": 0.91, + "grad_norm": 16.07346093619661, + "learning_rate": 8.783184057681369e-06, + "loss": 1.1179, + "step": 16195 + }, + { + "epoch": 0.91, + "grad_norm": 20.969875759523667, + "learning_rate": 8.782112210363334e-06, + "loss": 1.0961, + "step": 16200 + }, + { + "epoch": 0.91, + "grad_norm": 9.833205612253515, + "learning_rate": 8.781039956640652e-06, + "loss": 1.1373, + "step": 16205 + }, + { + "epoch": 0.91, + "grad_norm": 18.355534811501904, + "learning_rate": 8.779967296628538e-06, + "loss": 1.1, + "step": 16210 + }, + { + "epoch": 0.91, + "grad_norm": 6.374499462893863, + "learning_rate": 8.778894230442256e-06, + "loss": 1.1991, + "step": 16215 + }, + { + "epoch": 0.91, + "grad_norm": 18.895556724357824, + "learning_rate": 8.777820758197113e-06, + "loss": 1.2023, + "step": 16220 + }, + { + "epoch": 0.91, + "grad_norm": 12.190352446121144, + "learning_rate": 8.776746880008456e-06, + "loss": 1.0906, + "step": 16225 + }, + { + "epoch": 0.91, + "grad_norm": 13.572189145294844, + "learning_rate": 8.775672595991679e-06, + "loss": 1.0985, + "step": 16230 + }, + { + "epoch": 0.91, + "grad_norm": 30.780148334145984, + "learning_rate": 8.774597906262216e-06, + "loss": 1.0957, + "step": 16235 + }, + { + "epoch": 0.92, + "grad_norm": 20.840074732905205, + "learning_rate": 8.77352281093555e-06, + "loss": 1.1014, + "step": 16240 + }, + { + "epoch": 0.92, + "grad_norm": 25.14932494567834, + "learning_rate": 8.772447310127205e-06, + "loss": 1.1366, + "step": 16245 + }, + { + "epoch": 0.92, + "grad_norm": 6.524691081025394, + "learning_rate": 8.771371403952748e-06, + "loss": 1.1546, + "step": 16250 + }, + { + "epoch": 0.92, + "grad_norm": 8.458253430618536, + "learning_rate": 8.77029509252779e-06, + "loss": 1.1583, + "step": 16255 + }, + { + "epoch": 0.92, + "grad_norm": 6.343448104102179, + "learning_rate": 8.769218375967983e-06, + "loss": 1.0449, + "step": 16260 + }, + { + "epoch": 0.92, + "grad_norm": 7.282051545362369, + "learning_rate": 8.768141254389026e-06, + "loss": 1.1447, + "step": 16265 + }, + { + "epoch": 0.92, + "grad_norm": 5.273008482653508, + "learning_rate": 8.767063727906664e-06, + "loss": 1.1134, + "step": 16270 + }, + { + "epoch": 0.92, + "grad_norm": 7.052302981351795, + "learning_rate": 8.765985796636677e-06, + "loss": 1.1085, + "step": 16275 + }, + { + "epoch": 0.92, + "grad_norm": 8.170097022853318, + "learning_rate": 8.764907460694897e-06, + "loss": 1.1784, + "step": 16280 + }, + { + "epoch": 0.92, + "grad_norm": 6.256415269142373, + "learning_rate": 8.763828720197193e-06, + "loss": 1.108, + "step": 16285 + }, + { + "epoch": 0.92, + "grad_norm": 15.954914884126506, + "learning_rate": 8.76274957525948e-06, + "loss": 1.0677, + "step": 16290 + }, + { + "epoch": 0.92, + "grad_norm": 11.77344232994075, + "learning_rate": 8.761670025997724e-06, + "loss": 1.1047, + "step": 16295 + }, + { + "epoch": 0.92, + "grad_norm": 8.489888817776245, + "learning_rate": 8.76059007252792e-06, + "loss": 1.1786, + "step": 16300 + }, + { + "epoch": 0.92, + "grad_norm": 19.79756346741257, + "learning_rate": 8.759509714966114e-06, + "loss": 1.1012, + "step": 16305 + }, + { + "epoch": 0.92, + "grad_norm": 17.970560448934954, + "learning_rate": 8.758428953428399e-06, + "loss": 1.0937, + "step": 16310 + }, + { + "epoch": 0.92, + "grad_norm": 15.696285012759176, + "learning_rate": 8.757347788030906e-06, + "loss": 1.1387, + "step": 16315 + }, + { + "epoch": 0.92, + "grad_norm": 16.007036261122607, + "learning_rate": 8.75626621888981e-06, + "loss": 1.0979, + "step": 16320 + }, + { + "epoch": 0.92, + "grad_norm": 9.03476086094843, + "learning_rate": 8.755184246121332e-06, + "loss": 1.0754, + "step": 16325 + }, + { + "epoch": 0.92, + "grad_norm": 17.576555628021772, + "learning_rate": 8.754101869841733e-06, + "loss": 1.1414, + "step": 16330 + }, + { + "epoch": 0.92, + "grad_norm": 8.362484837905388, + "learning_rate": 8.753019090167319e-06, + "loss": 1.14, + "step": 16335 + }, + { + "epoch": 0.92, + "grad_norm": 76.12247768134962, + "learning_rate": 8.751935907214442e-06, + "loss": 1.139, + "step": 16340 + }, + { + "epoch": 0.92, + "grad_norm": 24.24045767611907, + "learning_rate": 8.750852321099492e-06, + "loss": 1.1291, + "step": 16345 + }, + { + "epoch": 0.92, + "grad_norm": 46.17592888130963, + "learning_rate": 8.749768331938908e-06, + "loss": 1.1227, + "step": 16350 + }, + { + "epoch": 0.92, + "grad_norm": 44.6076177145666, + "learning_rate": 8.748683939849168e-06, + "loss": 1.0924, + "step": 16355 + }, + { + "epoch": 0.92, + "grad_norm": 38.36581040746154, + "learning_rate": 8.747599144946792e-06, + "loss": 1.1413, + "step": 16360 + }, + { + "epoch": 0.92, + "grad_norm": 13.272535418452899, + "learning_rate": 8.746513947348349e-06, + "loss": 1.1449, + "step": 16365 + }, + { + "epoch": 0.92, + "grad_norm": 22.622182584692663, + "learning_rate": 8.745428347170449e-06, + "loss": 1.1333, + "step": 16370 + }, + { + "epoch": 0.92, + "grad_norm": 10.204687707319023, + "learning_rate": 8.744342344529743e-06, + "loss": 1.1012, + "step": 16375 + }, + { + "epoch": 0.92, + "grad_norm": 5.964796972326862, + "learning_rate": 8.743255939542928e-06, + "loss": 1.1267, + "step": 16380 + }, + { + "epoch": 0.92, + "grad_norm": 19.320735472324557, + "learning_rate": 8.74216913232674e-06, + "loss": 1.1239, + "step": 16385 + }, + { + "epoch": 0.92, + "grad_norm": 24.054811884597008, + "learning_rate": 8.741081922997964e-06, + "loss": 1.1229, + "step": 16390 + }, + { + "epoch": 0.92, + "grad_norm": 18.22144563252079, + "learning_rate": 8.739994311673425e-06, + "loss": 1.1444, + "step": 16395 + }, + { + "epoch": 0.92, + "grad_norm": 7.169098747662678, + "learning_rate": 8.73890629846999e-06, + "loss": 1.1194, + "step": 16400 + }, + { + "epoch": 0.92, + "grad_norm": 8.202104055040978, + "learning_rate": 8.737817883504574e-06, + "loss": 1.1159, + "step": 16405 + }, + { + "epoch": 0.92, + "grad_norm": 9.78528051776811, + "learning_rate": 8.73672906689413e-06, + "loss": 1.1106, + "step": 16410 + }, + { + "epoch": 0.92, + "grad_norm": 14.062906823839292, + "learning_rate": 8.735639848755655e-06, + "loss": 1.108, + "step": 16415 + }, + { + "epoch": 0.93, + "grad_norm": 6.7989081569689045, + "learning_rate": 8.734550229206193e-06, + "loss": 1.1217, + "step": 16420 + }, + { + "epoch": 0.93, + "grad_norm": 5.928192640795595, + "learning_rate": 8.733460208362827e-06, + "loss": 1.0494, + "step": 16425 + }, + { + "epoch": 0.93, + "grad_norm": 5.685317764958415, + "learning_rate": 8.732369786342684e-06, + "loss": 1.1242, + "step": 16430 + }, + { + "epoch": 0.93, + "grad_norm": 22.138909155666813, + "learning_rate": 8.731278963262934e-06, + "loss": 1.0998, + "step": 16435 + }, + { + "epoch": 0.93, + "grad_norm": 7.005040029562854, + "learning_rate": 8.730187739240794e-06, + "loss": 1.0938, + "step": 16440 + }, + { + "epoch": 0.93, + "grad_norm": 6.659180914035443, + "learning_rate": 8.729096114393519e-06, + "loss": 1.095, + "step": 16445 + }, + { + "epoch": 0.93, + "grad_norm": 9.040709143735745, + "learning_rate": 8.728004088838407e-06, + "loss": 1.1103, + "step": 16450 + }, + { + "epoch": 0.93, + "grad_norm": 7.620165078357946, + "learning_rate": 8.726911662692804e-06, + "loss": 1.0346, + "step": 16455 + }, + { + "epoch": 0.93, + "grad_norm": 6.4202464009306315, + "learning_rate": 8.725818836074094e-06, + "loss": 1.0585, + "step": 16460 + }, + { + "epoch": 0.93, + "grad_norm": 5.122519344833424, + "learning_rate": 8.724725609099708e-06, + "loss": 1.0677, + "step": 16465 + }, + { + "epoch": 0.93, + "grad_norm": 18.379173430495328, + "learning_rate": 8.723631981887115e-06, + "loss": 1.1095, + "step": 16470 + }, + { + "epoch": 0.93, + "grad_norm": 11.807500606971903, + "learning_rate": 8.722537954553834e-06, + "loss": 1.1456, + "step": 16475 + }, + { + "epoch": 0.93, + "grad_norm": 19.057686553939874, + "learning_rate": 8.72144352721742e-06, + "loss": 1.1618, + "step": 16480 + }, + { + "epoch": 0.93, + "grad_norm": 7.129235480390931, + "learning_rate": 8.720348699995475e-06, + "loss": 1.1248, + "step": 16485 + }, + { + "epoch": 0.93, + "grad_norm": 9.865586804101719, + "learning_rate": 8.719253473005642e-06, + "loss": 1.1253, + "step": 16490 + }, + { + "epoch": 0.93, + "grad_norm": 6.141786377717228, + "learning_rate": 8.718157846365612e-06, + "loss": 1.1272, + "step": 16495 + }, + { + "epoch": 0.93, + "grad_norm": 6.014185685581909, + "learning_rate": 8.717061820193109e-06, + "loss": 1.0713, + "step": 16500 + }, + { + "epoch": 0.93, + "grad_norm": 6.39974596188113, + "learning_rate": 8.71596539460591e-06, + "loss": 1.0618, + "step": 16505 + }, + { + "epoch": 0.93, + "grad_norm": 6.767146051283026, + "learning_rate": 8.714868569721829e-06, + "loss": 1.1561, + "step": 16510 + }, + { + "epoch": 0.93, + "grad_norm": 14.77005098166936, + "learning_rate": 8.713771345658725e-06, + "loss": 1.086, + "step": 16515 + }, + { + "epoch": 0.93, + "grad_norm": 8.483453547078433, + "learning_rate": 8.7126737225345e-06, + "loss": 1.1, + "step": 16520 + }, + { + "epoch": 0.93, + "grad_norm": 8.028485044351108, + "learning_rate": 8.711575700467099e-06, + "loss": 1.1149, + "step": 16525 + }, + { + "epoch": 0.93, + "grad_norm": 6.776660395252255, + "learning_rate": 8.710477279574508e-06, + "loss": 1.0807, + "step": 16530 + }, + { + "epoch": 0.93, + "grad_norm": 6.321915649913745, + "learning_rate": 8.709378459974757e-06, + "loss": 1.0956, + "step": 16535 + }, + { + "epoch": 0.93, + "grad_norm": 16.015565458827222, + "learning_rate": 8.708279241785921e-06, + "loss": 1.0931, + "step": 16540 + }, + { + "epoch": 0.93, + "grad_norm": 38.165963473472665, + "learning_rate": 8.707179625126115e-06, + "loss": 1.1324, + "step": 16545 + }, + { + "epoch": 0.93, + "grad_norm": 9.334238440334481, + "learning_rate": 8.706079610113496e-06, + "loss": 1.1376, + "step": 16550 + }, + { + "epoch": 0.93, + "grad_norm": 12.098952970685284, + "learning_rate": 8.704979196866266e-06, + "loss": 1.1392, + "step": 16555 + }, + { + "epoch": 0.93, + "grad_norm": 16.854497211266295, + "learning_rate": 8.703878385502671e-06, + "loss": 1.1182, + "step": 16560 + }, + { + "epoch": 0.93, + "grad_norm": 17.669911348817724, + "learning_rate": 8.702777176140997e-06, + "loss": 1.0911, + "step": 16565 + }, + { + "epoch": 0.93, + "grad_norm": 14.752468121840435, + "learning_rate": 8.701675568899574e-06, + "loss": 1.0768, + "step": 16570 + }, + { + "epoch": 0.93, + "grad_norm": 17.627644905402025, + "learning_rate": 8.700573563896774e-06, + "loss": 1.0934, + "step": 16575 + }, + { + "epoch": 0.93, + "grad_norm": 20.195405848785732, + "learning_rate": 8.699471161251012e-06, + "loss": 1.1107, + "step": 16580 + }, + { + "epoch": 0.93, + "grad_norm": 12.198940222449206, + "learning_rate": 8.698368361080749e-06, + "loss": 1.1203, + "step": 16585 + }, + { + "epoch": 0.93, + "grad_norm": 6.2067746874275365, + "learning_rate": 8.69726516350448e-06, + "loss": 1.0956, + "step": 16590 + }, + { + "epoch": 0.94, + "grad_norm": 6.733743717287036, + "learning_rate": 8.696161568640756e-06, + "loss": 1.0886, + "step": 16595 + }, + { + "epoch": 0.94, + "grad_norm": 12.89035233966049, + "learning_rate": 8.695057576608156e-06, + "loss": 1.1542, + "step": 16600 + }, + { + "epoch": 0.94, + "grad_norm": 15.117215892219331, + "learning_rate": 8.693953187525312e-06, + "loss": 1.1133, + "step": 16605 + }, + { + "epoch": 0.94, + "grad_norm": 12.567981163642601, + "learning_rate": 8.692848401510899e-06, + "loss": 1.1064, + "step": 16610 + }, + { + "epoch": 0.94, + "grad_norm": 8.234473257192498, + "learning_rate": 8.691743218683623e-06, + "loss": 1.0654, + "step": 16615 + }, + { + "epoch": 0.94, + "grad_norm": 12.476461636235864, + "learning_rate": 8.690637639162247e-06, + "loss": 1.0996, + "step": 16620 + }, + { + "epoch": 0.94, + "grad_norm": 13.89283221404281, + "learning_rate": 8.689531663065569e-06, + "loss": 1.0795, + "step": 16625 + }, + { + "epoch": 0.94, + "grad_norm": 7.6064564707496665, + "learning_rate": 8.68842529051243e-06, + "loss": 1.1061, + "step": 16630 + }, + { + "epoch": 0.94, + "grad_norm": 6.8832524553344046, + "learning_rate": 8.687318521621715e-06, + "loss": 1.1455, + "step": 16635 + }, + { + "epoch": 0.94, + "grad_norm": 5.96740972184527, + "learning_rate": 8.686211356512353e-06, + "loss": 1.1003, + "step": 16640 + }, + { + "epoch": 0.94, + "grad_norm": 7.563313662079871, + "learning_rate": 8.68510379530331e-06, + "loss": 1.1143, + "step": 16645 + }, + { + "epoch": 0.94, + "grad_norm": 9.39720690050603, + "learning_rate": 8.683995838113602e-06, + "loss": 1.1014, + "step": 16650 + }, + { + "epoch": 0.94, + "grad_norm": 10.758215647478176, + "learning_rate": 8.682887485062283e-06, + "loss": 1.1131, + "step": 16655 + }, + { + "epoch": 0.94, + "grad_norm": 5.8197066067424235, + "learning_rate": 8.68177873626845e-06, + "loss": 1.0837, + "step": 16660 + }, + { + "epoch": 0.94, + "grad_norm": 20.021803830672226, + "learning_rate": 8.680669591851241e-06, + "loss": 1.1323, + "step": 16665 + }, + { + "epoch": 0.94, + "grad_norm": 9.48333234748205, + "learning_rate": 8.679560051929841e-06, + "loss": 1.1795, + "step": 16670 + }, + { + "epoch": 0.94, + "grad_norm": 12.004186377201357, + "learning_rate": 8.678450116623475e-06, + "loss": 1.0931, + "step": 16675 + }, + { + "epoch": 0.94, + "grad_norm": 8.78110602933926, + "learning_rate": 8.677339786051409e-06, + "loss": 1.0697, + "step": 16680 + }, + { + "epoch": 0.94, + "grad_norm": 12.05618061588711, + "learning_rate": 8.676229060332955e-06, + "loss": 1.133, + "step": 16685 + }, + { + "epoch": 0.94, + "grad_norm": 8.34079703403976, + "learning_rate": 8.675117939587462e-06, + "loss": 1.0632, + "step": 16690 + }, + { + "epoch": 0.94, + "grad_norm": 9.57522360774888, + "learning_rate": 8.674006423934329e-06, + "loss": 1.0819, + "step": 16695 + }, + { + "epoch": 0.94, + "grad_norm": 6.981022926998485, + "learning_rate": 8.672894513492989e-06, + "loss": 1.0911, + "step": 16700 + }, + { + "epoch": 0.94, + "grad_norm": 10.377801934015054, + "learning_rate": 8.671782208382925e-06, + "loss": 1.0995, + "step": 16705 + }, + { + "epoch": 0.94, + "grad_norm": 10.55500338678948, + "learning_rate": 8.670669508723658e-06, + "loss": 1.1319, + "step": 16710 + }, + { + "epoch": 0.94, + "grad_norm": 5.287168833812008, + "learning_rate": 8.669556414634753e-06, + "loss": 1.093, + "step": 16715 + }, + { + "epoch": 0.94, + "grad_norm": 5.485193154142666, + "learning_rate": 8.668442926235815e-06, + "loss": 1.0344, + "step": 16720 + }, + { + "epoch": 0.94, + "grad_norm": 6.674636382508078, + "learning_rate": 8.667329043646495e-06, + "loss": 1.1413, + "step": 16725 + }, + { + "epoch": 0.94, + "grad_norm": 8.605226086662105, + "learning_rate": 8.666214766986484e-06, + "loss": 1.1142, + "step": 16730 + }, + { + "epoch": 0.94, + "grad_norm": 5.042553356278292, + "learning_rate": 8.665100096375517e-06, + "loss": 1.1072, + "step": 16735 + }, + { + "epoch": 0.94, + "grad_norm": 11.721074379071956, + "learning_rate": 8.663985031933367e-06, + "loss": 1.1298, + "step": 16740 + }, + { + "epoch": 0.94, + "grad_norm": 10.066261313152033, + "learning_rate": 8.662869573779856e-06, + "loss": 1.1151, + "step": 16745 + }, + { + "epoch": 0.94, + "grad_norm": 6.267548006539362, + "learning_rate": 8.661753722034844e-06, + "loss": 1.1654, + "step": 16750 + }, + { + "epoch": 0.94, + "grad_norm": 15.069320807507802, + "learning_rate": 8.660637476818235e-06, + "loss": 1.1548, + "step": 16755 + }, + { + "epoch": 0.94, + "grad_norm": 6.917268241092627, + "learning_rate": 8.659520838249972e-06, + "loss": 1.1078, + "step": 16760 + }, + { + "epoch": 0.94, + "grad_norm": 13.103795176539565, + "learning_rate": 8.658403806450045e-06, + "loss": 1.1569, + "step": 16765 + }, + { + "epoch": 0.94, + "grad_norm": 10.039259669749306, + "learning_rate": 8.657286381538484e-06, + "loss": 1.0719, + "step": 16770 + }, + { + "epoch": 0.95, + "grad_norm": 5.64818010374265, + "learning_rate": 8.656168563635358e-06, + "loss": 1.0916, + "step": 16775 + }, + { + "epoch": 0.95, + "grad_norm": 8.667193970299143, + "learning_rate": 8.655050352860786e-06, + "loss": 1.1076, + "step": 16780 + }, + { + "epoch": 0.95, + "grad_norm": 6.629460052604731, + "learning_rate": 8.653931749334922e-06, + "loss": 1.1023, + "step": 16785 + }, + { + "epoch": 0.95, + "grad_norm": 33.732872738539115, + "learning_rate": 8.652812753177965e-06, + "loss": 1.1522, + "step": 16790 + }, + { + "epoch": 0.95, + "grad_norm": 34.37900182726514, + "learning_rate": 8.65169336451016e-06, + "loss": 1.1751, + "step": 16795 + }, + { + "epoch": 0.95, + "grad_norm": 16.114958382608194, + "learning_rate": 8.650573583451782e-06, + "loss": 1.1227, + "step": 16800 + }, + { + "epoch": 0.95, + "grad_norm": 16.55020664542636, + "learning_rate": 8.649453410123164e-06, + "loss": 1.1111, + "step": 16805 + }, + { + "epoch": 0.95, + "grad_norm": 8.922271126707626, + "learning_rate": 8.648332844644669e-06, + "loss": 1.087, + "step": 16810 + }, + { + "epoch": 0.95, + "grad_norm": 5.1183304135241885, + "learning_rate": 8.64721188713671e-06, + "loss": 1.1104, + "step": 16815 + }, + { + "epoch": 0.95, + "grad_norm": 9.800394367832217, + "learning_rate": 8.646090537719738e-06, + "loss": 1.1132, + "step": 16820 + }, + { + "epoch": 0.95, + "grad_norm": 17.588142176645427, + "learning_rate": 8.644968796514244e-06, + "loss": 1.124, + "step": 16825 + }, + { + "epoch": 0.95, + "grad_norm": 37.79013112368904, + "learning_rate": 8.643846663640767e-06, + "loss": 1.0859, + "step": 16830 + }, + { + "epoch": 0.95, + "grad_norm": 14.83873331471025, + "learning_rate": 8.642724139219887e-06, + "loss": 1.0483, + "step": 16835 + }, + { + "epoch": 0.95, + "grad_norm": 5.748327254509929, + "learning_rate": 8.641601223372219e-06, + "loss": 1.1463, + "step": 16840 + }, + { + "epoch": 0.95, + "grad_norm": 16.32810310898289, + "learning_rate": 8.640477916218427e-06, + "loss": 1.147, + "step": 16845 + }, + { + "epoch": 0.95, + "grad_norm": 8.781404767906865, + "learning_rate": 8.639354217879219e-06, + "loss": 1.1185, + "step": 16850 + }, + { + "epoch": 0.95, + "grad_norm": 6.948261878606812, + "learning_rate": 8.638230128475339e-06, + "loss": 1.1286, + "step": 16855 + }, + { + "epoch": 0.95, + "grad_norm": 5.774906837685148, + "learning_rate": 8.637105648127572e-06, + "loss": 1.1657, + "step": 16860 + }, + { + "epoch": 0.95, + "grad_norm": 6.829624181780629, + "learning_rate": 8.635980776956753e-06, + "loss": 1.1155, + "step": 16865 + }, + { + "epoch": 0.95, + "grad_norm": 5.265451296320978, + "learning_rate": 8.634855515083754e-06, + "loss": 1.0956, + "step": 16870 + }, + { + "epoch": 0.95, + "grad_norm": 11.919765183096331, + "learning_rate": 8.633729862629485e-06, + "loss": 1.0738, + "step": 16875 + }, + { + "epoch": 0.95, + "grad_norm": 9.275255046385121, + "learning_rate": 8.632603819714907e-06, + "loss": 1.1066, + "step": 16880 + }, + { + "epoch": 0.95, + "grad_norm": 12.779714646987017, + "learning_rate": 8.631477386461017e-06, + "loss": 1.1275, + "step": 16885 + }, + { + "epoch": 0.95, + "grad_norm": 11.48483276220722, + "learning_rate": 8.630350562988854e-06, + "loss": 1.0948, + "step": 16890 + }, + { + "epoch": 0.95, + "grad_norm": 14.85938014572465, + "learning_rate": 8.629223349419502e-06, + "loss": 1.0943, + "step": 16895 + }, + { + "epoch": 0.95, + "grad_norm": 15.004591130647585, + "learning_rate": 8.628095745874084e-06, + "loss": 1.1074, + "step": 16900 + }, + { + "epoch": 0.95, + "grad_norm": 9.65457644471501, + "learning_rate": 8.626967752473766e-06, + "loss": 1.1077, + "step": 16905 + }, + { + "epoch": 0.95, + "grad_norm": 12.182654798327075, + "learning_rate": 8.625839369339756e-06, + "loss": 1.1352, + "step": 16910 + }, + { + "epoch": 0.95, + "grad_norm": 7.047457540280567, + "learning_rate": 8.624710596593304e-06, + "loss": 1.0868, + "step": 16915 + }, + { + "epoch": 0.95, + "grad_norm": 11.733693314136023, + "learning_rate": 8.6235814343557e-06, + "loss": 1.1602, + "step": 16920 + }, + { + "epoch": 0.95, + "grad_norm": 10.463943057759602, + "learning_rate": 8.62245188274828e-06, + "loss": 1.0792, + "step": 16925 + }, + { + "epoch": 0.95, + "grad_norm": 7.136447871619055, + "learning_rate": 8.621321941892418e-06, + "loss": 1.1297, + "step": 16930 + }, + { + "epoch": 0.95, + "grad_norm": 6.101580015915936, + "learning_rate": 8.62019161190953e-06, + "loss": 1.1202, + "step": 16935 + }, + { + "epoch": 0.95, + "grad_norm": 9.992471439897448, + "learning_rate": 8.619060892921077e-06, + "loss": 1.0861, + "step": 16940 + }, + { + "epoch": 0.95, + "grad_norm": 7.160694331840603, + "learning_rate": 8.61792978504856e-06, + "loss": 1.0824, + "step": 16945 + }, + { + "epoch": 0.96, + "grad_norm": 17.43779713325381, + "learning_rate": 8.616798288413519e-06, + "loss": 1.1127, + "step": 16950 + }, + { + "epoch": 0.96, + "grad_norm": 11.919756875258514, + "learning_rate": 8.61566640313754e-06, + "loss": 1.1164, + "step": 16955 + }, + { + "epoch": 0.96, + "grad_norm": 13.73141976572175, + "learning_rate": 8.61453412934225e-06, + "loss": 1.0591, + "step": 16960 + }, + { + "epoch": 0.96, + "grad_norm": 5.1210669962111535, + "learning_rate": 8.613401467149313e-06, + "loss": 1.0836, + "step": 16965 + }, + { + "epoch": 0.96, + "grad_norm": 10.10212590273476, + "learning_rate": 8.612268416680445e-06, + "loss": 1.0952, + "step": 16970 + }, + { + "epoch": 0.96, + "grad_norm": 5.103311016410487, + "learning_rate": 8.61113497805739e-06, + "loss": 1.04, + "step": 16975 + }, + { + "epoch": 0.96, + "grad_norm": 14.858408644862056, + "learning_rate": 8.610001151401947e-06, + "loss": 1.1391, + "step": 16980 + }, + { + "epoch": 0.96, + "grad_norm": 28.537734196563537, + "learning_rate": 8.608866936835946e-06, + "loss": 1.1008, + "step": 16985 + }, + { + "epoch": 0.96, + "grad_norm": 17.678920307043942, + "learning_rate": 8.607732334481268e-06, + "loss": 1.1211, + "step": 16990 + }, + { + "epoch": 0.96, + "grad_norm": 7.049879390681618, + "learning_rate": 8.606597344459826e-06, + "loss": 1.0908, + "step": 16995 + }, + { + "epoch": 0.96, + "grad_norm": 10.000289047398722, + "learning_rate": 8.605461966893582e-06, + "loss": 1.0672, + "step": 17000 + }, + { + "epoch": 0.96, + "grad_norm": 10.300366652184351, + "learning_rate": 8.604326201904541e-06, + "loss": 1.1107, + "step": 17005 + }, + { + "epoch": 0.96, + "grad_norm": 14.03427331057001, + "learning_rate": 8.60319004961474e-06, + "loss": 1.0685, + "step": 17010 + }, + { + "epoch": 0.96, + "grad_norm": 14.520653156003474, + "learning_rate": 8.602053510146265e-06, + "loss": 1.0649, + "step": 17015 + }, + { + "epoch": 0.96, + "grad_norm": 21.63507991906392, + "learning_rate": 8.600916583621247e-06, + "loss": 1.1288, + "step": 17020 + }, + { + "epoch": 0.96, + "grad_norm": 7.793669670278979, + "learning_rate": 8.599779270161846e-06, + "loss": 1.1012, + "step": 17025 + }, + { + "epoch": 0.96, + "grad_norm": 22.626520487692865, + "learning_rate": 8.598641569890275e-06, + "loss": 1.1016, + "step": 17030 + }, + { + "epoch": 0.96, + "grad_norm": 32.28364135134357, + "learning_rate": 8.597503482928788e-06, + "loss": 1.1309, + "step": 17035 + }, + { + "epoch": 0.96, + "grad_norm": 32.0488042790708, + "learning_rate": 8.596365009399676e-06, + "loss": 1.114, + "step": 17040 + }, + { + "epoch": 0.96, + "grad_norm": 16.97518127775392, + "learning_rate": 8.595226149425268e-06, + "loss": 1.1284, + "step": 17045 + }, + { + "epoch": 0.96, + "grad_norm": 8.59692849599178, + "learning_rate": 8.594086903127946e-06, + "loss": 1.1647, + "step": 17050 + }, + { + "epoch": 0.96, + "grad_norm": 7.677751460142839, + "learning_rate": 8.592947270630122e-06, + "loss": 1.0799, + "step": 17055 + }, + { + "epoch": 0.96, + "grad_norm": 56.47850036664735, + "learning_rate": 8.591807252054261e-06, + "loss": 1.1001, + "step": 17060 + }, + { + "epoch": 0.96, + "grad_norm": 16.033786859529016, + "learning_rate": 8.590666847522855e-06, + "loss": 1.0932, + "step": 17065 + }, + { + "epoch": 0.96, + "grad_norm": 15.922058186030087, + "learning_rate": 8.58952605715845e-06, + "loss": 1.1408, + "step": 17070 + }, + { + "epoch": 0.96, + "grad_norm": 11.695337545016002, + "learning_rate": 8.58838488108363e-06, + "loss": 1.0902, + "step": 17075 + }, + { + "epoch": 0.96, + "grad_norm": 6.853955323983569, + "learning_rate": 8.587243319421018e-06, + "loss": 1.0831, + "step": 17080 + }, + { + "epoch": 0.96, + "grad_norm": 9.282878247017411, + "learning_rate": 8.586101372293278e-06, + "loss": 1.1114, + "step": 17085 + }, + { + "epoch": 0.96, + "grad_norm": 6.932947242973346, + "learning_rate": 8.584959039823118e-06, + "loss": 1.1423, + "step": 17090 + }, + { + "epoch": 0.96, + "grad_norm": 6.271938691365637, + "learning_rate": 8.583816322133289e-06, + "loss": 1.1269, + "step": 17095 + }, + { + "epoch": 0.96, + "grad_norm": 9.445713789115437, + "learning_rate": 8.58267321934658e-06, + "loss": 1.1257, + "step": 17100 + }, + { + "epoch": 0.96, + "grad_norm": 29.421390018795922, + "learning_rate": 8.581529731585823e-06, + "loss": 1.0923, + "step": 17105 + }, + { + "epoch": 0.96, + "grad_norm": 8.998397195340639, + "learning_rate": 8.580385858973888e-06, + "loss": 1.133, + "step": 17110 + }, + { + "epoch": 0.96, + "grad_norm": 5.831929265163076, + "learning_rate": 8.57924160163369e-06, + "loss": 1.0752, + "step": 17115 + }, + { + "epoch": 0.96, + "grad_norm": 5.415976045958236, + "learning_rate": 8.578096959688186e-06, + "loss": 1.1306, + "step": 17120 + }, + { + "epoch": 0.96, + "grad_norm": 5.972343347910804, + "learning_rate": 8.576951933260375e-06, + "loss": 1.1042, + "step": 17125 + }, + { + "epoch": 0.97, + "grad_norm": 5.943121935071054, + "learning_rate": 8.57580652247329e-06, + "loss": 1.1209, + "step": 17130 + }, + { + "epoch": 0.97, + "grad_norm": 10.278290573093377, + "learning_rate": 8.574660727450015e-06, + "loss": 1.1442, + "step": 17135 + }, + { + "epoch": 0.97, + "grad_norm": 5.003901926627689, + "learning_rate": 8.573514548313666e-06, + "loss": 1.089, + "step": 17140 + }, + { + "epoch": 0.97, + "grad_norm": 6.367489844559957, + "learning_rate": 8.572367985187408e-06, + "loss": 1.126, + "step": 17145 + }, + { + "epoch": 0.97, + "grad_norm": 6.023590407704637, + "learning_rate": 8.571221038194446e-06, + "loss": 1.0789, + "step": 17150 + }, + { + "epoch": 0.97, + "grad_norm": 7.418039829453431, + "learning_rate": 8.570073707458022e-06, + "loss": 1.1212, + "step": 17155 + }, + { + "epoch": 0.97, + "grad_norm": 7.534951519053513, + "learning_rate": 8.56892599310142e-06, + "loss": 1.0669, + "step": 17160 + }, + { + "epoch": 0.97, + "grad_norm": 7.211321714989818, + "learning_rate": 8.56777789524797e-06, + "loss": 1.0577, + "step": 17165 + }, + { + "epoch": 0.97, + "grad_norm": 13.349987216251385, + "learning_rate": 8.56662941402104e-06, + "loss": 1.12, + "step": 17170 + }, + { + "epoch": 0.97, + "grad_norm": 6.514759203836478, + "learning_rate": 8.56548054954404e-06, + "loss": 1.0546, + "step": 17175 + }, + { + "epoch": 0.97, + "grad_norm": 8.264870038261684, + "learning_rate": 8.564331301940419e-06, + "loss": 1.0375, + "step": 17180 + }, + { + "epoch": 0.97, + "grad_norm": 9.773221916226845, + "learning_rate": 8.563181671333666e-06, + "loss": 1.1053, + "step": 17185 + }, + { + "epoch": 0.97, + "grad_norm": 5.392172356282338, + "learning_rate": 8.56203165784732e-06, + "loss": 1.1423, + "step": 17190 + }, + { + "epoch": 0.97, + "grad_norm": 10.528523758799187, + "learning_rate": 8.560881261604951e-06, + "loss": 1.143, + "step": 17195 + }, + { + "epoch": 0.97, + "grad_norm": 11.933484178575647, + "learning_rate": 8.559730482730176e-06, + "loss": 1.096, + "step": 17200 + }, + { + "epoch": 0.97, + "grad_norm": 6.325635838846004, + "learning_rate": 8.55857932134665e-06, + "loss": 1.0839, + "step": 17205 + }, + { + "epoch": 0.97, + "grad_norm": 7.721837717096488, + "learning_rate": 8.55742777757807e-06, + "loss": 1.122, + "step": 17210 + }, + { + "epoch": 0.97, + "grad_norm": 7.7142419398959134, + "learning_rate": 8.556275851548173e-06, + "loss": 1.0708, + "step": 17215 + }, + { + "epoch": 0.97, + "grad_norm": 15.614369692298798, + "learning_rate": 8.555123543380743e-06, + "loss": 1.0669, + "step": 17220 + }, + { + "epoch": 0.97, + "grad_norm": 6.420334429553925, + "learning_rate": 8.553970853199599e-06, + "loss": 1.1087, + "step": 17225 + }, + { + "epoch": 0.97, + "grad_norm": 21.343029747736015, + "learning_rate": 8.552817781128602e-06, + "loss": 1.1398, + "step": 17230 + }, + { + "epoch": 0.97, + "grad_norm": 18.319859430286797, + "learning_rate": 8.551664327291653e-06, + "loss": 1.1277, + "step": 17235 + }, + { + "epoch": 0.97, + "grad_norm": 12.849825184667681, + "learning_rate": 8.550510491812697e-06, + "loss": 1.1342, + "step": 17240 + }, + { + "epoch": 0.97, + "grad_norm": 8.964489914643135, + "learning_rate": 8.549356274815721e-06, + "loss": 1.1359, + "step": 17245 + }, + { + "epoch": 0.97, + "grad_norm": 10.42300877290792, + "learning_rate": 8.548201676424748e-06, + "loss": 1.1076, + "step": 17250 + }, + { + "epoch": 0.97, + "grad_norm": 8.462190775163863, + "learning_rate": 8.547046696763845e-06, + "loss": 1.1187, + "step": 17255 + }, + { + "epoch": 0.97, + "grad_norm": 19.216531661712807, + "learning_rate": 8.54589133595712e-06, + "loss": 1.1116, + "step": 17260 + }, + { + "epoch": 0.97, + "grad_norm": 24.40952282571262, + "learning_rate": 8.544735594128723e-06, + "loss": 1.0711, + "step": 17265 + }, + { + "epoch": 0.97, + "grad_norm": 23.459383355754376, + "learning_rate": 8.54357947140284e-06, + "loss": 1.126, + "step": 17270 + }, + { + "epoch": 0.97, + "grad_norm": 19.065887056262643, + "learning_rate": 8.542422967903707e-06, + "loss": 1.1083, + "step": 17275 + }, + { + "epoch": 0.97, + "grad_norm": 10.537942460173479, + "learning_rate": 8.541266083755591e-06, + "loss": 1.1231, + "step": 17280 + }, + { + "epoch": 0.97, + "grad_norm": 19.351557397708405, + "learning_rate": 8.540108819082805e-06, + "loss": 1.1319, + "step": 17285 + }, + { + "epoch": 0.97, + "grad_norm": 16.852202759882676, + "learning_rate": 8.538951174009702e-06, + "loss": 1.0799, + "step": 17290 + }, + { + "epoch": 0.97, + "grad_norm": 10.61855316270117, + "learning_rate": 8.537793148660678e-06, + "loss": 1.1442, + "step": 17295 + }, + { + "epoch": 0.97, + "grad_norm": 8.2349835295622, + "learning_rate": 8.536634743160166e-06, + "loss": 1.0929, + "step": 17300 + }, + { + "epoch": 0.98, + "grad_norm": 9.587824493928512, + "learning_rate": 8.535475957632643e-06, + "loss": 1.0586, + "step": 17305 + }, + { + "epoch": 0.98, + "grad_norm": 12.667751947461904, + "learning_rate": 8.534316792202625e-06, + "loss": 1.1205, + "step": 17310 + }, + { + "epoch": 0.98, + "grad_norm": 26.370449991021072, + "learning_rate": 8.533157246994669e-06, + "loss": 1.0946, + "step": 17315 + }, + { + "epoch": 0.98, + "grad_norm": 36.973689379481165, + "learning_rate": 8.531997322133375e-06, + "loss": 1.1253, + "step": 17320 + }, + { + "epoch": 0.98, + "grad_norm": 10.53314149309355, + "learning_rate": 8.530837017743378e-06, + "loss": 1.1086, + "step": 17325 + }, + { + "epoch": 0.98, + "grad_norm": 25.817016166659737, + "learning_rate": 8.529676333949363e-06, + "loss": 1.1238, + "step": 17330 + }, + { + "epoch": 0.98, + "grad_norm": 6.6761358372072435, + "learning_rate": 8.528515270876048e-06, + "loss": 1.0697, + "step": 17335 + }, + { + "epoch": 0.98, + "grad_norm": 9.469243813776712, + "learning_rate": 8.527353828648194e-06, + "loss": 1.1164, + "step": 17340 + }, + { + "epoch": 0.98, + "grad_norm": 5.649172251010545, + "learning_rate": 8.526192007390606e-06, + "loss": 1.1045, + "step": 17345 + }, + { + "epoch": 0.98, + "grad_norm": 6.4949183545598075, + "learning_rate": 8.525029807228123e-06, + "loss": 1.0692, + "step": 17350 + }, + { + "epoch": 0.98, + "grad_norm": 6.811352187241923, + "learning_rate": 8.52386722828563e-06, + "loss": 1.0709, + "step": 17355 + }, + { + "epoch": 0.98, + "grad_norm": 17.980562540623275, + "learning_rate": 8.522704270688052e-06, + "loss": 1.1251, + "step": 17360 + }, + { + "epoch": 0.98, + "grad_norm": 22.60227073390142, + "learning_rate": 8.52154093456035e-06, + "loss": 1.1097, + "step": 17365 + }, + { + "epoch": 0.98, + "grad_norm": 12.706177546209313, + "learning_rate": 8.520377220027536e-06, + "loss": 1.1025, + "step": 17370 + }, + { + "epoch": 0.98, + "grad_norm": 24.111639874634594, + "learning_rate": 8.519213127214652e-06, + "loss": 1.0489, + "step": 17375 + }, + { + "epoch": 0.98, + "grad_norm": 6.64188633456058, + "learning_rate": 8.518048656246787e-06, + "loss": 1.1185, + "step": 17380 + }, + { + "epoch": 0.98, + "grad_norm": 7.108092664502759, + "learning_rate": 8.516883807249068e-06, + "loss": 1.122, + "step": 17385 + }, + { + "epoch": 0.98, + "grad_norm": 7.117524805180457, + "learning_rate": 8.515718580346661e-06, + "loss": 1.1076, + "step": 17390 + }, + { + "epoch": 0.98, + "grad_norm": 22.24577824837758, + "learning_rate": 8.514552975664777e-06, + "loss": 1.1248, + "step": 17395 + }, + { + "epoch": 0.98, + "grad_norm": 10.026810677379471, + "learning_rate": 8.513386993328665e-06, + "loss": 1.1077, + "step": 17400 + }, + { + "epoch": 0.98, + "grad_norm": 6.1915382204436025, + "learning_rate": 8.512220633463614e-06, + "loss": 1.0916, + "step": 17405 + }, + { + "epoch": 0.98, + "grad_norm": 6.839240989711329, + "learning_rate": 8.511053896194957e-06, + "loss": 1.1187, + "step": 17410 + }, + { + "epoch": 0.98, + "grad_norm": 26.15965631789435, + "learning_rate": 8.509886781648063e-06, + "loss": 1.1202, + "step": 17415 + }, + { + "epoch": 0.98, + "grad_norm": 22.241398466093468, + "learning_rate": 8.508719289948344e-06, + "loss": 1.1342, + "step": 17420 + }, + { + "epoch": 0.98, + "grad_norm": 20.247589317940527, + "learning_rate": 8.507551421221249e-06, + "loss": 1.0671, + "step": 17425 + }, + { + "epoch": 0.98, + "grad_norm": 22.653854170506236, + "learning_rate": 8.506383175592277e-06, + "loss": 1.1051, + "step": 17430 + }, + { + "epoch": 0.98, + "grad_norm": 19.541347250509766, + "learning_rate": 8.505214553186958e-06, + "loss": 1.1214, + "step": 17435 + }, + { + "epoch": 0.98, + "grad_norm": 25.45608404430628, + "learning_rate": 8.504045554130865e-06, + "loss": 1.1242, + "step": 17440 + }, + { + "epoch": 0.98, + "grad_norm": 16.94343309854376, + "learning_rate": 8.502876178549613e-06, + "loss": 1.0825, + "step": 17445 + }, + { + "epoch": 0.98, + "grad_norm": 15.730566313918818, + "learning_rate": 8.501706426568855e-06, + "loss": 1.0683, + "step": 17450 + }, + { + "epoch": 0.98, + "grad_norm": 18.819286866830012, + "learning_rate": 8.500536298314289e-06, + "loss": 1.036, + "step": 17455 + }, + { + "epoch": 0.98, + "grad_norm": 7.960477254420302, + "learning_rate": 8.499365793911647e-06, + "loss": 1.1349, + "step": 17460 + }, + { + "epoch": 0.98, + "grad_norm": 11.312382476193674, + "learning_rate": 8.498194913486708e-06, + "loss": 1.0864, + "step": 17465 + }, + { + "epoch": 0.98, + "grad_norm": 13.997993720101222, + "learning_rate": 8.497023657165286e-06, + "loss": 1.1126, + "step": 17470 + }, + { + "epoch": 0.98, + "grad_norm": 7.806390923878613, + "learning_rate": 8.495852025073239e-06, + "loss": 1.089, + "step": 17475 + }, + { + "epoch": 0.98, + "grad_norm": 27.91487422067381, + "learning_rate": 8.494680017336463e-06, + "loss": 1.0606, + "step": 17480 + }, + { + "epoch": 0.99, + "grad_norm": 5.6369038730092225, + "learning_rate": 8.493507634080897e-06, + "loss": 1.0521, + "step": 17485 + }, + { + "epoch": 0.99, + "grad_norm": 15.734492768079987, + "learning_rate": 8.492334875432515e-06, + "loss": 1.1017, + "step": 17490 + }, + { + "epoch": 0.99, + "grad_norm": 23.74806671070999, + "learning_rate": 8.491161741517339e-06, + "loss": 1.0697, + "step": 17495 + }, + { + "epoch": 0.99, + "grad_norm": 23.599203453778514, + "learning_rate": 8.489988232461429e-06, + "loss": 1.1092, + "step": 17500 + }, + { + "epoch": 0.99, + "grad_norm": 6.452834439130612, + "learning_rate": 8.48881434839088e-06, + "loss": 1.132, + "step": 17505 + }, + { + "epoch": 0.99, + "grad_norm": 23.06522444093552, + "learning_rate": 8.487640089431828e-06, + "loss": 1.0897, + "step": 17510 + }, + { + "epoch": 0.99, + "grad_norm": 15.220613719612215, + "learning_rate": 8.48646545571046e-06, + "loss": 1.1332, + "step": 17515 + }, + { + "epoch": 0.99, + "grad_norm": 14.881418264358748, + "learning_rate": 8.485290447352991e-06, + "loss": 1.0482, + "step": 17520 + }, + { + "epoch": 0.99, + "grad_norm": 15.721331269933362, + "learning_rate": 8.48411506448568e-06, + "loss": 1.0661, + "step": 17525 + }, + { + "epoch": 0.99, + "grad_norm": 38.611690420170696, + "learning_rate": 8.482939307234832e-06, + "loss": 1.1265, + "step": 17530 + }, + { + "epoch": 0.99, + "grad_norm": 20.76774680312566, + "learning_rate": 8.481763175726783e-06, + "loss": 1.1176, + "step": 17535 + }, + { + "epoch": 0.99, + "grad_norm": 6.548987196078617, + "learning_rate": 8.480586670087912e-06, + "loss": 1.1401, + "step": 17540 + }, + { + "epoch": 0.99, + "grad_norm": 58.47056233784023, + "learning_rate": 8.479409790444645e-06, + "loss": 1.0948, + "step": 17545 + }, + { + "epoch": 0.99, + "grad_norm": 39.16992664642986, + "learning_rate": 8.478232536923439e-06, + "loss": 1.0975, + "step": 17550 + }, + { + "epoch": 0.99, + "grad_norm": 14.935799863947528, + "learning_rate": 8.477054909650798e-06, + "loss": 1.136, + "step": 17555 + }, + { + "epoch": 0.99, + "grad_norm": 44.17679517978767, + "learning_rate": 8.47587690875326e-06, + "loss": 1.0761, + "step": 17560 + }, + { + "epoch": 0.99, + "grad_norm": 28.713438754897968, + "learning_rate": 8.474698534357408e-06, + "loss": 1.1264, + "step": 17565 + }, + { + "epoch": 0.99, + "grad_norm": 27.395694336471735, + "learning_rate": 8.473519786589863e-06, + "loss": 1.0881, + "step": 17570 + }, + { + "epoch": 0.99, + "grad_norm": 32.9082789260856, + "learning_rate": 8.472340665577288e-06, + "loss": 1.1478, + "step": 17575 + }, + { + "epoch": 0.99, + "grad_norm": 12.325827596095246, + "learning_rate": 8.471161171446383e-06, + "loss": 1.0463, + "step": 17580 + }, + { + "epoch": 0.99, + "grad_norm": 13.935308252137894, + "learning_rate": 8.469981304323893e-06, + "loss": 1.0782, + "step": 17585 + }, + { + "epoch": 0.99, + "grad_norm": 27.91690380967443, + "learning_rate": 8.468801064336596e-06, + "loss": 1.1201, + "step": 17590 + }, + { + "epoch": 0.99, + "grad_norm": 17.16370033358599, + "learning_rate": 8.467620451611314e-06, + "loss": 1.0942, + "step": 17595 + }, + { + "epoch": 0.99, + "grad_norm": 29.914032628110014, + "learning_rate": 8.466439466274914e-06, + "loss": 1.1052, + "step": 17600 + }, + { + "epoch": 0.99, + "grad_norm": 25.549628881727934, + "learning_rate": 8.465258108454292e-06, + "loss": 1.1478, + "step": 17605 + }, + { + "epoch": 0.99, + "grad_norm": 34.184125937523916, + "learning_rate": 8.464076378276393e-06, + "loss": 1.1315, + "step": 17610 + }, + { + "epoch": 0.99, + "grad_norm": 17.90086949768453, + "learning_rate": 8.4628942758682e-06, + "loss": 1.12, + "step": 17615 + }, + { + "epoch": 0.99, + "grad_norm": 38.545213076593434, + "learning_rate": 8.461711801356735e-06, + "loss": 1.1479, + "step": 17620 + }, + { + "epoch": 0.99, + "grad_norm": 12.061326289715497, + "learning_rate": 8.460528954869059e-06, + "loss": 1.0875, + "step": 17625 + }, + { + "epoch": 0.99, + "grad_norm": 13.976899938074336, + "learning_rate": 8.459345736532273e-06, + "loss": 1.1053, + "step": 17630 + }, + { + "epoch": 0.99, + "grad_norm": 11.303790615050954, + "learning_rate": 8.45816214647352e-06, + "loss": 1.1428, + "step": 17635 + }, + { + "epoch": 0.99, + "grad_norm": 10.00709108138671, + "learning_rate": 8.456978184819986e-06, + "loss": 1.1406, + "step": 17640 + }, + { + "epoch": 0.99, + "grad_norm": 10.59102350372654, + "learning_rate": 8.455793851698886e-06, + "loss": 1.0858, + "step": 17645 + }, + { + "epoch": 0.99, + "grad_norm": 8.443895983664765, + "learning_rate": 8.454609147237486e-06, + "loss": 1.1019, + "step": 17650 + }, + { + "epoch": 0.99, + "grad_norm": 6.518424946798726, + "learning_rate": 8.453424071563086e-06, + "loss": 1.0569, + "step": 17655 + }, + { + "epoch": 1.0, + "grad_norm": 7.922045969264012, + "learning_rate": 8.45223862480303e-06, + "loss": 1.0906, + "step": 17660 + }, + { + "epoch": 1.0, + "grad_norm": 7.311494406030087, + "learning_rate": 8.451052807084696e-06, + "loss": 1.1251, + "step": 17665 + }, + { + "epoch": 1.0, + "grad_norm": 17.631810292606946, + "learning_rate": 8.449866618535509e-06, + "loss": 1.1309, + "step": 17670 + }, + { + "epoch": 1.0, + "grad_norm": 13.296432872445173, + "learning_rate": 8.448680059282926e-06, + "loss": 1.078, + "step": 17675 + }, + { + "epoch": 1.0, + "grad_norm": 25.743316666721856, + "learning_rate": 8.447493129454452e-06, + "loss": 1.0751, + "step": 17680 + }, + { + "epoch": 1.0, + "grad_norm": 8.243500906845636, + "learning_rate": 8.446305829177626e-06, + "loss": 1.0927, + "step": 17685 + }, + { + "epoch": 1.0, + "grad_norm": 16.192924157932385, + "learning_rate": 8.44511815858003e-06, + "loss": 1.0935, + "step": 17690 + }, + { + "epoch": 1.0, + "grad_norm": 13.157845779594874, + "learning_rate": 8.443930117789282e-06, + "loss": 1.0926, + "step": 17695 + }, + { + "epoch": 1.0, + "grad_norm": 6.384549416657215, + "learning_rate": 8.442741706933044e-06, + "loss": 1.1331, + "step": 17700 + }, + { + "epoch": 1.0, + "grad_norm": 9.405902870706582, + "learning_rate": 8.441552926139015e-06, + "loss": 1.0611, + "step": 17705 + }, + { + "epoch": 1.0, + "grad_norm": 7.270131696636666, + "learning_rate": 8.440363775534937e-06, + "loss": 1.0998, + "step": 17710 + }, + { + "epoch": 1.0, + "grad_norm": 5.934156379264266, + "learning_rate": 8.439174255248587e-06, + "loss": 1.1216, + "step": 17715 + }, + { + "epoch": 1.0, + "grad_norm": 16.03286631358554, + "learning_rate": 8.437984365407785e-06, + "loss": 1.1187, + "step": 17720 + }, + { + "epoch": 1.0, + "grad_norm": 9.442715788040404, + "learning_rate": 8.43679410614039e-06, + "loss": 1.0706, + "step": 17725 + }, + { + "epoch": 1.0, + "grad_norm": 21.182337587915736, + "learning_rate": 8.435603477574302e-06, + "loss": 1.0991, + "step": 17730 + }, + { + "epoch": 1.0, + "grad_norm": 6.9859308497457615, + "learning_rate": 8.434412479837456e-06, + "loss": 1.089, + "step": 17735 + }, + { + "epoch": 1.0, + "grad_norm": 18.37699563985268, + "learning_rate": 8.43322111305783e-06, + "loss": 1.0968, + "step": 17740 + }, + { + "epoch": 1.0, + "grad_norm": 7.438308011952263, + "learning_rate": 8.432029377363447e-06, + "loss": 1.0785, + "step": 17745 + }, + { + "epoch": 1.0, + "eval_loss": 1.260210394859314, + "eval_runtime": 25.4511, + "eval_samples_per_second": 31.669, + "eval_steps_per_second": 3.968, + "step": 17748 + }, + { + "epoch": 1.0, + "grad_norm": 5.626869052426889, + "learning_rate": 8.430837272882359e-06, + "loss": 1.0274, + "step": 17750 + }, + { + "epoch": 1.0, + "grad_norm": 9.182969299055262, + "learning_rate": 8.429644799742664e-06, + "loss": 0.9516, + "step": 17755 + }, + { + "epoch": 1.0, + "grad_norm": 8.716810807239526, + "learning_rate": 8.4284519580725e-06, + "loss": 0.9606, + "step": 17760 + }, + { + "epoch": 1.0, + "grad_norm": 6.131151759499285, + "learning_rate": 8.427258748000042e-06, + "loss": 0.9358, + "step": 17765 + }, + { + "epoch": 1.0, + "grad_norm": 6.035344289845632, + "learning_rate": 8.426065169653506e-06, + "loss": 0.9879, + "step": 17770 + }, + { + "epoch": 1.0, + "grad_norm": 8.266042187503562, + "learning_rate": 8.424871223161149e-06, + "loss": 0.9095, + "step": 17775 + }, + { + "epoch": 1.0, + "grad_norm": 12.179896703542951, + "learning_rate": 8.423676908651262e-06, + "loss": 0.9865, + "step": 17780 + }, + { + "epoch": 1.0, + "grad_norm": 18.362839299506145, + "learning_rate": 8.42248222625218e-06, + "loss": 0.9546, + "step": 17785 + }, + { + "epoch": 1.0, + "grad_norm": 14.888365966924951, + "learning_rate": 8.421287176092281e-06, + "loss": 0.945, + "step": 17790 + }, + { + "epoch": 1.0, + "grad_norm": 10.672762024171263, + "learning_rate": 8.420091758299973e-06, + "loss": 0.9378, + "step": 17795 + }, + { + "epoch": 1.0, + "grad_norm": 15.506551720105456, + "learning_rate": 8.418895973003714e-06, + "loss": 0.9938, + "step": 17800 + }, + { + "epoch": 1.0, + "grad_norm": 5.486516634692079, + "learning_rate": 8.417699820331993e-06, + "loss": 0.9082, + "step": 17805 + }, + { + "epoch": 1.0, + "grad_norm": 6.364703652262363, + "learning_rate": 8.416503300413343e-06, + "loss": 0.9372, + "step": 17810 + }, + { + "epoch": 1.0, + "grad_norm": 9.545462843248188, + "learning_rate": 8.415306413376334e-06, + "loss": 0.9836, + "step": 17815 + }, + { + "epoch": 1.0, + "grad_norm": 11.605583686936086, + "learning_rate": 8.41410915934958e-06, + "loss": 0.9373, + "step": 17820 + }, + { + "epoch": 1.0, + "grad_norm": 9.517101393133185, + "learning_rate": 8.412911538461728e-06, + "loss": 0.9467, + "step": 17825 + }, + { + "epoch": 1.0, + "grad_norm": 26.77141370776423, + "learning_rate": 8.411713550841468e-06, + "loss": 1.0081, + "step": 17830 + }, + { + "epoch": 1.0, + "grad_norm": 9.113379449744265, + "learning_rate": 8.410515196617531e-06, + "loss": 0.9514, + "step": 17835 + }, + { + "epoch": 1.01, + "grad_norm": 14.14320939335982, + "learning_rate": 8.409316475918685e-06, + "loss": 0.9376, + "step": 17840 + }, + { + "epoch": 1.01, + "grad_norm": 6.796556111361008, + "learning_rate": 8.408117388873736e-06, + "loss": 0.9535, + "step": 17845 + }, + { + "epoch": 1.01, + "grad_norm": 13.034232956285297, + "learning_rate": 8.406917935611534e-06, + "loss": 0.9787, + "step": 17850 + }, + { + "epoch": 1.01, + "grad_norm": 13.086630477089237, + "learning_rate": 8.405718116260961e-06, + "loss": 0.984, + "step": 17855 + }, + { + "epoch": 1.01, + "grad_norm": 9.455304663864185, + "learning_rate": 8.404517930950948e-06, + "loss": 0.9526, + "step": 17860 + }, + { + "epoch": 1.01, + "grad_norm": 24.500988484822788, + "learning_rate": 8.403317379810457e-06, + "loss": 0.9772, + "step": 17865 + }, + { + "epoch": 1.01, + "grad_norm": 19.7152269612736, + "learning_rate": 8.402116462968494e-06, + "loss": 0.9556, + "step": 17870 + }, + { + "epoch": 1.01, + "grad_norm": 6.355661289210014, + "learning_rate": 8.400915180554103e-06, + "loss": 0.9564, + "step": 17875 + }, + { + "epoch": 1.01, + "grad_norm": 14.78776367650486, + "learning_rate": 8.399713532696367e-06, + "loss": 0.9458, + "step": 17880 + }, + { + "epoch": 1.01, + "grad_norm": 6.81553224748409, + "learning_rate": 8.398511519524407e-06, + "loss": 0.9935, + "step": 17885 + }, + { + "epoch": 1.01, + "grad_norm": 15.93516160483249, + "learning_rate": 8.397309141167385e-06, + "loss": 0.8953, + "step": 17890 + }, + { + "epoch": 1.01, + "grad_norm": 11.285809425161123, + "learning_rate": 8.396106397754502e-06, + "loss": 0.9396, + "step": 17895 + }, + { + "epoch": 1.01, + "grad_norm": 9.369381116123431, + "learning_rate": 8.394903289415e-06, + "loss": 0.9678, + "step": 17900 + }, + { + "epoch": 1.01, + "grad_norm": 10.775482011076765, + "learning_rate": 8.393699816278156e-06, + "loss": 0.9888, + "step": 17905 + }, + { + "epoch": 1.01, + "grad_norm": 21.196293851671367, + "learning_rate": 8.392495978473288e-06, + "loss": 0.9507, + "step": 17910 + }, + { + "epoch": 1.01, + "grad_norm": 12.04637881645689, + "learning_rate": 8.391291776129756e-06, + "loss": 1.0118, + "step": 17915 + }, + { + "epoch": 1.01, + "grad_norm": 6.197501902986017, + "learning_rate": 8.390087209376955e-06, + "loss": 1.0271, + "step": 17920 + }, + { + "epoch": 1.01, + "grad_norm": 5.8033078762451735, + "learning_rate": 8.388882278344323e-06, + "loss": 0.9333, + "step": 17925 + }, + { + "epoch": 1.01, + "grad_norm": 6.189558038761195, + "learning_rate": 8.387676983161334e-06, + "loss": 0.9384, + "step": 17930 + }, + { + "epoch": 1.01, + "grad_norm": 5.855087387346607, + "learning_rate": 8.3864713239575e-06, + "loss": 0.9485, + "step": 17935 + }, + { + "epoch": 1.01, + "grad_norm": 6.760669896597583, + "learning_rate": 8.38526530086238e-06, + "loss": 0.9504, + "step": 17940 + }, + { + "epoch": 1.01, + "grad_norm": 10.160041078365477, + "learning_rate": 8.384058914005559e-06, + "loss": 1.002, + "step": 17945 + }, + { + "epoch": 1.01, + "grad_norm": 6.497187362194074, + "learning_rate": 8.382852163516674e-06, + "loss": 0.918, + "step": 17950 + }, + { + "epoch": 1.01, + "grad_norm": 5.454154753435161, + "learning_rate": 8.381645049525397e-06, + "loss": 0.9583, + "step": 17955 + }, + { + "epoch": 1.01, + "grad_norm": 5.243586210893909, + "learning_rate": 8.380437572161433e-06, + "loss": 0.9782, + "step": 17960 + }, + { + "epoch": 1.01, + "grad_norm": 5.544112672005654, + "learning_rate": 8.379229731554535e-06, + "loss": 0.9446, + "step": 17965 + }, + { + "epoch": 1.01, + "grad_norm": 6.091305152673275, + "learning_rate": 8.378021527834486e-06, + "loss": 0.9595, + "step": 17970 + }, + { + "epoch": 1.01, + "grad_norm": 5.483751533615201, + "learning_rate": 8.376812961131116e-06, + "loss": 0.9638, + "step": 17975 + }, + { + "epoch": 1.01, + "grad_norm": 6.784214784890503, + "learning_rate": 8.37560403157429e-06, + "loss": 0.9441, + "step": 17980 + }, + { + "epoch": 1.01, + "grad_norm": 23.59859939988344, + "learning_rate": 8.374394739293915e-06, + "loss": 0.974, + "step": 17985 + }, + { + "epoch": 1.01, + "grad_norm": 6.229782337216466, + "learning_rate": 8.37318508441993e-06, + "loss": 1.0004, + "step": 17990 + }, + { + "epoch": 1.01, + "grad_norm": 9.72974584502807, + "learning_rate": 8.371975067082323e-06, + "loss": 0.9712, + "step": 17995 + }, + { + "epoch": 1.01, + "grad_norm": 21.746766732841987, + "learning_rate": 8.370764687411112e-06, + "loss": 0.9699, + "step": 18000 + }, + { + "epoch": 1.01, + "grad_norm": 12.286187714329804, + "learning_rate": 8.369553945536362e-06, + "loss": 1.003, + "step": 18005 + }, + { + "epoch": 1.01, + "grad_norm": 7.928871579151746, + "learning_rate": 8.368342841588166e-06, + "loss": 0.9202, + "step": 18010 + }, + { + "epoch": 1.02, + "grad_norm": 7.274170476041927, + "learning_rate": 8.36713137569667e-06, + "loss": 0.9862, + "step": 18015 + }, + { + "epoch": 1.02, + "grad_norm": 16.512660323560134, + "learning_rate": 8.365919547992044e-06, + "loss": 0.9561, + "step": 18020 + }, + { + "epoch": 1.02, + "grad_norm": 6.195093630974645, + "learning_rate": 8.364707358604507e-06, + "loss": 0.924, + "step": 18025 + }, + { + "epoch": 1.02, + "grad_norm": 6.1052810062634, + "learning_rate": 8.363494807664317e-06, + "loss": 0.9424, + "step": 18030 + }, + { + "epoch": 1.02, + "grad_norm": 5.602480660263411, + "learning_rate": 8.362281895301764e-06, + "loss": 0.9831, + "step": 18035 + }, + { + "epoch": 1.02, + "grad_norm": 6.421941861714285, + "learning_rate": 8.361068621647183e-06, + "loss": 0.9708, + "step": 18040 + }, + { + "epoch": 1.02, + "grad_norm": 6.941533958485444, + "learning_rate": 8.359854986830946e-06, + "loss": 0.9572, + "step": 18045 + }, + { + "epoch": 1.02, + "grad_norm": 5.174726028923948, + "learning_rate": 8.358640990983462e-06, + "loss": 0.9572, + "step": 18050 + }, + { + "epoch": 1.02, + "grad_norm": 5.8112198563692115, + "learning_rate": 8.357426634235181e-06, + "loss": 0.9365, + "step": 18055 + }, + { + "epoch": 1.02, + "grad_norm": 9.010485347737228, + "learning_rate": 8.356211916716591e-06, + "loss": 0.9303, + "step": 18060 + }, + { + "epoch": 1.02, + "grad_norm": 7.629349724775571, + "learning_rate": 8.354996838558216e-06, + "loss": 1.0019, + "step": 18065 + }, + { + "epoch": 1.02, + "grad_norm": 4.8006384018527415, + "learning_rate": 8.353781399890627e-06, + "loss": 0.955, + "step": 18070 + }, + { + "epoch": 1.02, + "grad_norm": 7.691838965415852, + "learning_rate": 8.352565600844423e-06, + "loss": 0.9841, + "step": 18075 + }, + { + "epoch": 1.02, + "grad_norm": 17.90274075326026, + "learning_rate": 8.35134944155025e-06, + "loss": 0.9797, + "step": 18080 + }, + { + "epoch": 1.02, + "grad_norm": 8.722774412280813, + "learning_rate": 8.35013292213879e-06, + "loss": 0.9849, + "step": 18085 + }, + { + "epoch": 1.02, + "grad_norm": 8.918455321710201, + "learning_rate": 8.34891604274076e-06, + "loss": 0.9636, + "step": 18090 + }, + { + "epoch": 1.02, + "grad_norm": 17.268835468399153, + "learning_rate": 8.347698803486923e-06, + "loss": 0.9758, + "step": 18095 + }, + { + "epoch": 1.02, + "grad_norm": 30.127043018401352, + "learning_rate": 8.346481204508074e-06, + "loss": 0.919, + "step": 18100 + }, + { + "epoch": 1.02, + "grad_norm": 39.89111038215658, + "learning_rate": 8.34526324593505e-06, + "loss": 1.0071, + "step": 18105 + }, + { + "epoch": 1.02, + "grad_norm": 11.074029187578066, + "learning_rate": 8.34404492789873e-06, + "loss": 0.9443, + "step": 18110 + }, + { + "epoch": 1.02, + "grad_norm": 21.07756951105243, + "learning_rate": 8.34282625053002e-06, + "loss": 0.9777, + "step": 18115 + }, + { + "epoch": 1.02, + "grad_norm": 6.261502736007123, + "learning_rate": 8.341607213959878e-06, + "loss": 0.9467, + "step": 18120 + }, + { + "epoch": 1.02, + "grad_norm": 45.67904376279292, + "learning_rate": 8.340387818319293e-06, + "loss": 0.9187, + "step": 18125 + }, + { + "epoch": 1.02, + "grad_norm": 24.91464582420456, + "learning_rate": 8.339168063739296e-06, + "loss": 0.9602, + "step": 18130 + }, + { + "epoch": 1.02, + "grad_norm": 20.608723898861903, + "learning_rate": 8.337947950350952e-06, + "loss": 0.9051, + "step": 18135 + }, + { + "epoch": 1.02, + "grad_norm": 30.400324583792425, + "learning_rate": 8.33672747828537e-06, + "loss": 0.9359, + "step": 18140 + }, + { + "epoch": 1.02, + "grad_norm": 17.50665466740123, + "learning_rate": 8.335506647673694e-06, + "loss": 0.9758, + "step": 18145 + }, + { + "epoch": 1.02, + "grad_norm": 39.0997047554433, + "learning_rate": 8.334285458647107e-06, + "loss": 0.9474, + "step": 18150 + }, + { + "epoch": 1.02, + "grad_norm": 16.64631163623313, + "learning_rate": 8.333063911336834e-06, + "loss": 0.9864, + "step": 18155 + }, + { + "epoch": 1.02, + "grad_norm": 30.842942128044452, + "learning_rate": 8.331842005874133e-06, + "loss": 0.9267, + "step": 18160 + }, + { + "epoch": 1.02, + "grad_norm": 7.724000587486835, + "learning_rate": 8.330619742390305e-06, + "loss": 0.9693, + "step": 18165 + }, + { + "epoch": 1.02, + "grad_norm": 28.029245836093526, + "learning_rate": 8.329397121016685e-06, + "loss": 0.9846, + "step": 18170 + }, + { + "epoch": 1.02, + "grad_norm": 12.986480048543006, + "learning_rate": 8.32817414188465e-06, + "loss": 0.9658, + "step": 18175 + }, + { + "epoch": 1.02, + "grad_norm": 14.736614094456346, + "learning_rate": 8.326950805125616e-06, + "loss": 0.9817, + "step": 18180 + }, + { + "epoch": 1.02, + "grad_norm": 12.629078013752862, + "learning_rate": 8.325727110871037e-06, + "loss": 0.9667, + "step": 18185 + }, + { + "epoch": 1.02, + "grad_norm": 25.056818404039877, + "learning_rate": 8.3245030592524e-06, + "loss": 0.9677, + "step": 18190 + }, + { + "epoch": 1.03, + "grad_norm": 16.78687448613767, + "learning_rate": 8.323278650401239e-06, + "loss": 0.9261, + "step": 18195 + }, + { + "epoch": 1.03, + "grad_norm": 15.668641636699055, + "learning_rate": 8.322053884449118e-06, + "loss": 0.9773, + "step": 18200 + }, + { + "epoch": 1.03, + "grad_norm": 6.144517806073648, + "learning_rate": 8.320828761527646e-06, + "loss": 0.9642, + "step": 18205 + }, + { + "epoch": 1.03, + "grad_norm": 19.87183113438539, + "learning_rate": 8.319603281768468e-06, + "loss": 0.9635, + "step": 18210 + }, + { + "epoch": 1.03, + "grad_norm": 10.087429965457005, + "learning_rate": 8.31837744530327e-06, + "loss": 0.9548, + "step": 18215 + }, + { + "epoch": 1.03, + "grad_norm": 60.09733804016439, + "learning_rate": 8.317151252263767e-06, + "loss": 0.93, + "step": 18220 + }, + { + "epoch": 1.03, + "grad_norm": 39.00904417089919, + "learning_rate": 8.315924702781722e-06, + "loss": 0.9532, + "step": 18225 + }, + { + "epoch": 1.03, + "grad_norm": 11.029073412497254, + "learning_rate": 8.314697796988935e-06, + "loss": 0.9248, + "step": 18230 + }, + { + "epoch": 1.03, + "grad_norm": 42.28286385404939, + "learning_rate": 8.313470535017238e-06, + "loss": 0.9838, + "step": 18235 + }, + { + "epoch": 1.03, + "grad_norm": 12.955384374997312, + "learning_rate": 8.312242916998512e-06, + "loss": 0.9115, + "step": 18240 + }, + { + "epoch": 1.03, + "grad_norm": 34.29604175273556, + "learning_rate": 8.311014943064666e-06, + "loss": 0.9836, + "step": 18245 + }, + { + "epoch": 1.03, + "grad_norm": 15.155447289367002, + "learning_rate": 8.30978661334765e-06, + "loss": 0.9535, + "step": 18250 + }, + { + "epoch": 1.03, + "grad_norm": 22.786695422905964, + "learning_rate": 8.308557927979457e-06, + "loss": 0.9502, + "step": 18255 + }, + { + "epoch": 1.03, + "grad_norm": 12.835472742132069, + "learning_rate": 8.307328887092113e-06, + "loss": 0.9517, + "step": 18260 + }, + { + "epoch": 1.03, + "grad_norm": 23.41340268501469, + "learning_rate": 8.30609949081768e-06, + "loss": 1.0081, + "step": 18265 + }, + { + "epoch": 1.03, + "grad_norm": 14.790843337054666, + "learning_rate": 8.30486973928827e-06, + "loss": 0.9768, + "step": 18270 + }, + { + "epoch": 1.03, + "grad_norm": 10.022962350746162, + "learning_rate": 8.30363963263602e-06, + "loss": 0.9343, + "step": 18275 + }, + { + "epoch": 1.03, + "grad_norm": 10.901891764383109, + "learning_rate": 8.302409170993111e-06, + "loss": 0.9641, + "step": 18280 + }, + { + "epoch": 1.03, + "grad_norm": 6.76624124271663, + "learning_rate": 8.301178354491761e-06, + "loss": 0.8977, + "step": 18285 + }, + { + "epoch": 1.03, + "grad_norm": 21.27823795057321, + "learning_rate": 8.299947183264228e-06, + "loss": 0.9807, + "step": 18290 + }, + { + "epoch": 1.03, + "grad_norm": 21.249876163768274, + "learning_rate": 8.298715657442806e-06, + "loss": 0.9157, + "step": 18295 + }, + { + "epoch": 1.03, + "grad_norm": 13.004007173788033, + "learning_rate": 8.297483777159832e-06, + "loss": 0.9421, + "step": 18300 + }, + { + "epoch": 1.03, + "grad_norm": 6.392554791068047, + "learning_rate": 8.296251542547669e-06, + "loss": 0.986, + "step": 18305 + }, + { + "epoch": 1.03, + "grad_norm": 11.615683155154661, + "learning_rate": 8.295018953738731e-06, + "loss": 0.9661, + "step": 18310 + }, + { + "epoch": 1.03, + "grad_norm": 10.114232138557812, + "learning_rate": 8.293786010865466e-06, + "loss": 0.988, + "step": 18315 + }, + { + "epoch": 1.03, + "grad_norm": 10.30042125520741, + "learning_rate": 8.292552714060356e-06, + "loss": 0.9409, + "step": 18320 + }, + { + "epoch": 1.03, + "grad_norm": 8.5353023364251, + "learning_rate": 8.291319063455926e-06, + "loss": 0.9558, + "step": 18325 + }, + { + "epoch": 1.03, + "grad_norm": 25.610488752863407, + "learning_rate": 8.290085059184738e-06, + "loss": 0.9496, + "step": 18330 + }, + { + "epoch": 1.03, + "grad_norm": 23.270991061708862, + "learning_rate": 8.288850701379389e-06, + "loss": 0.9814, + "step": 18335 + }, + { + "epoch": 1.03, + "grad_norm": 5.308313373842472, + "learning_rate": 8.287615990172518e-06, + "loss": 0.9542, + "step": 18340 + }, + { + "epoch": 1.03, + "grad_norm": 19.33781984905678, + "learning_rate": 8.286380925696798e-06, + "loss": 0.967, + "step": 18345 + }, + { + "epoch": 1.03, + "grad_norm": 6.208328854455573, + "learning_rate": 8.285145508084946e-06, + "loss": 0.9375, + "step": 18350 + }, + { + "epoch": 1.03, + "grad_norm": 22.416980708087756, + "learning_rate": 8.283909737469708e-06, + "loss": 0.9893, + "step": 18355 + }, + { + "epoch": 1.03, + "grad_norm": 5.192097478384558, + "learning_rate": 8.282673613983879e-06, + "loss": 0.9131, + "step": 18360 + }, + { + "epoch": 1.03, + "grad_norm": 15.378178719432283, + "learning_rate": 8.28143713776028e-06, + "loss": 0.9457, + "step": 18365 + }, + { + "epoch": 1.04, + "grad_norm": 12.290966136744645, + "learning_rate": 8.28020030893178e-06, + "loss": 0.9822, + "step": 18370 + }, + { + "epoch": 1.04, + "grad_norm": 6.307433316113864, + "learning_rate": 8.278963127631279e-06, + "loss": 1.0093, + "step": 18375 + }, + { + "epoch": 1.04, + "grad_norm": 6.957628210901851, + "learning_rate": 8.27772559399172e-06, + "loss": 0.9393, + "step": 18380 + }, + { + "epoch": 1.04, + "grad_norm": 7.356604342839569, + "learning_rate": 8.276487708146079e-06, + "loss": 0.9358, + "step": 18385 + }, + { + "epoch": 1.04, + "grad_norm": 6.7197882949054195, + "learning_rate": 8.275249470227375e-06, + "loss": 1.0173, + "step": 18390 + }, + { + "epoch": 1.04, + "grad_norm": 9.923678298763505, + "learning_rate": 8.274010880368659e-06, + "loss": 0.9534, + "step": 18395 + }, + { + "epoch": 1.04, + "grad_norm": 8.614201875233425, + "learning_rate": 8.272771938703026e-06, + "loss": 0.9382, + "step": 18400 + }, + { + "epoch": 1.04, + "grad_norm": 13.504115565168725, + "learning_rate": 8.271532645363603e-06, + "loss": 0.959, + "step": 18405 + }, + { + "epoch": 1.04, + "grad_norm": 6.04604956495506, + "learning_rate": 8.270293000483558e-06, + "loss": 0.9537, + "step": 18410 + }, + { + "epoch": 1.04, + "grad_norm": 5.5964264965602855, + "learning_rate": 8.269053004196101e-06, + "loss": 0.9786, + "step": 18415 + }, + { + "epoch": 1.04, + "grad_norm": 23.66660192681161, + "learning_rate": 8.26781265663447e-06, + "loss": 0.9706, + "step": 18420 + }, + { + "epoch": 1.04, + "grad_norm": 13.796563103202072, + "learning_rate": 8.266571957931944e-06, + "loss": 0.9459, + "step": 18425 + }, + { + "epoch": 1.04, + "grad_norm": 5.570452622301191, + "learning_rate": 8.265330908221847e-06, + "loss": 0.9281, + "step": 18430 + }, + { + "epoch": 1.04, + "grad_norm": 9.176551398651894, + "learning_rate": 8.264089507637532e-06, + "loss": 0.9126, + "step": 18435 + }, + { + "epoch": 1.04, + "grad_norm": 14.338693718246894, + "learning_rate": 8.262847756312395e-06, + "loss": 0.9521, + "step": 18440 + }, + { + "epoch": 1.04, + "grad_norm": 5.637525785879936, + "learning_rate": 8.261605654379867e-06, + "loss": 1.017, + "step": 18445 + }, + { + "epoch": 1.04, + "grad_norm": 16.01039001176703, + "learning_rate": 8.260363201973414e-06, + "loss": 0.9709, + "step": 18450 + }, + { + "epoch": 1.04, + "grad_norm": 7.14051163935431, + "learning_rate": 8.259120399226549e-06, + "loss": 0.997, + "step": 18455 + }, + { + "epoch": 1.04, + "grad_norm": 6.98945406377979, + "learning_rate": 8.257877246272812e-06, + "loss": 0.9537, + "step": 18460 + }, + { + "epoch": 1.04, + "grad_norm": 7.090962525715208, + "learning_rate": 8.256633743245785e-06, + "loss": 0.9723, + "step": 18465 + }, + { + "epoch": 1.04, + "grad_norm": 6.515364227793765, + "learning_rate": 8.255389890279092e-06, + "loss": 0.9487, + "step": 18470 + }, + { + "epoch": 1.04, + "grad_norm": 14.510355697987137, + "learning_rate": 8.25414568750639e-06, + "loss": 0.9522, + "step": 18475 + }, + { + "epoch": 1.04, + "grad_norm": 10.103611961373405, + "learning_rate": 8.252901135061368e-06, + "loss": 0.988, + "step": 18480 + }, + { + "epoch": 1.04, + "grad_norm": 4.770202819196264, + "learning_rate": 8.251656233077765e-06, + "loss": 0.9149, + "step": 18485 + }, + { + "epoch": 1.04, + "grad_norm": 5.991801820714467, + "learning_rate": 8.25041098168935e-06, + "loss": 0.9593, + "step": 18490 + }, + { + "epoch": 1.04, + "grad_norm": 6.650659264452085, + "learning_rate": 8.24916538102993e-06, + "loss": 0.9414, + "step": 18495 + }, + { + "epoch": 1.04, + "grad_norm": 7.966841558848007, + "learning_rate": 8.247919431233348e-06, + "loss": 0.955, + "step": 18500 + }, + { + "epoch": 1.04, + "grad_norm": 15.225653690132798, + "learning_rate": 8.24667313243349e-06, + "loss": 0.9534, + "step": 18505 + }, + { + "epoch": 1.04, + "grad_norm": 5.631953567911231, + "learning_rate": 8.245426484764276e-06, + "loss": 0.8882, + "step": 18510 + }, + { + "epoch": 1.04, + "grad_norm": 7.726119781198639, + "learning_rate": 8.244179488359662e-06, + "loss": 1.0197, + "step": 18515 + }, + { + "epoch": 1.04, + "grad_norm": 16.174482327233644, + "learning_rate": 8.242932143353644e-06, + "loss": 0.9582, + "step": 18520 + }, + { + "epoch": 1.04, + "grad_norm": 8.534930144614199, + "learning_rate": 8.241684449880256e-06, + "loss": 0.976, + "step": 18525 + }, + { + "epoch": 1.04, + "grad_norm": 6.856211269291136, + "learning_rate": 8.240436408073567e-06, + "loss": 0.9495, + "step": 18530 + }, + { + "epoch": 1.04, + "grad_norm": 5.98076858251849, + "learning_rate": 8.239188018067685e-06, + "loss": 1.0105, + "step": 18535 + }, + { + "epoch": 1.04, + "grad_norm": 9.07406647910944, + "learning_rate": 8.237939279996756e-06, + "loss": 0.8891, + "step": 18540 + }, + { + "epoch": 1.04, + "grad_norm": 7.515147583058341, + "learning_rate": 8.236690193994958e-06, + "loss": 0.9179, + "step": 18545 + }, + { + "epoch": 1.05, + "grad_norm": 5.714080446754478, + "learning_rate": 8.235440760196515e-06, + "loss": 0.9494, + "step": 18550 + }, + { + "epoch": 1.05, + "grad_norm": 11.492272147032065, + "learning_rate": 8.234190978735685e-06, + "loss": 0.9619, + "step": 18555 + }, + { + "epoch": 1.05, + "grad_norm": 9.568265001262244, + "learning_rate": 8.23294084974676e-06, + "loss": 0.9696, + "step": 18560 + }, + { + "epoch": 1.05, + "grad_norm": 6.3419706985076845, + "learning_rate": 8.231690373364073e-06, + "loss": 1.0006, + "step": 18565 + }, + { + "epoch": 1.05, + "grad_norm": 5.564488376137859, + "learning_rate": 8.23043954972199e-06, + "loss": 0.9454, + "step": 18570 + }, + { + "epoch": 1.05, + "grad_norm": 8.048791559594555, + "learning_rate": 8.229188378954924e-06, + "loss": 0.9429, + "step": 18575 + }, + { + "epoch": 1.05, + "grad_norm": 10.597372581230177, + "learning_rate": 8.227936861197315e-06, + "loss": 0.8992, + "step": 18580 + }, + { + "epoch": 1.05, + "grad_norm": 7.154848619959518, + "learning_rate": 8.226684996583643e-06, + "loss": 0.9725, + "step": 18585 + }, + { + "epoch": 1.05, + "grad_norm": 5.668712866256631, + "learning_rate": 8.225432785248428e-06, + "loss": 0.924, + "step": 18590 + }, + { + "epoch": 1.05, + "grad_norm": 14.1545260742609, + "learning_rate": 8.224180227326225e-06, + "loss": 0.9462, + "step": 18595 + }, + { + "epoch": 1.05, + "grad_norm": 9.001727228534417, + "learning_rate": 8.222927322951627e-06, + "loss": 0.9561, + "step": 18600 + }, + { + "epoch": 1.05, + "grad_norm": 7.339464771272855, + "learning_rate": 8.221674072259264e-06, + "loss": 0.9805, + "step": 18605 + }, + { + "epoch": 1.05, + "grad_norm": 5.465257003196277, + "learning_rate": 8.220420475383806e-06, + "loss": 0.9831, + "step": 18610 + }, + { + "epoch": 1.05, + "grad_norm": 5.309880652239877, + "learning_rate": 8.219166532459952e-06, + "loss": 0.9339, + "step": 18615 + }, + { + "epoch": 1.05, + "grad_norm": 5.303498663406126, + "learning_rate": 8.217912243622448e-06, + "loss": 0.9385, + "step": 18620 + }, + { + "epoch": 1.05, + "grad_norm": 7.601022428278913, + "learning_rate": 8.216657609006074e-06, + "loss": 0.9489, + "step": 18625 + }, + { + "epoch": 1.05, + "grad_norm": 12.263553704963337, + "learning_rate": 8.215402628745641e-06, + "loss": 0.9579, + "step": 18630 + }, + { + "epoch": 1.05, + "grad_norm": 9.934401613463754, + "learning_rate": 8.214147302976006e-06, + "loss": 0.939, + "step": 18635 + }, + { + "epoch": 1.05, + "grad_norm": 5.7049324252921885, + "learning_rate": 8.212891631832056e-06, + "loss": 0.9588, + "step": 18640 + }, + { + "epoch": 1.05, + "grad_norm": 12.209531753799407, + "learning_rate": 8.211635615448724e-06, + "loss": 0.9399, + "step": 18645 + }, + { + "epoch": 1.05, + "grad_norm": 8.827967546412326, + "learning_rate": 8.210379253960968e-06, + "loss": 0.9697, + "step": 18650 + }, + { + "epoch": 1.05, + "grad_norm": 10.527337209997317, + "learning_rate": 8.209122547503794e-06, + "loss": 0.928, + "step": 18655 + }, + { + "epoch": 1.05, + "grad_norm": 5.3975283433171315, + "learning_rate": 8.20786549621224e-06, + "loss": 0.9518, + "step": 18660 + }, + { + "epoch": 1.05, + "grad_norm": 5.86373472604526, + "learning_rate": 8.206608100221381e-06, + "loss": 0.9493, + "step": 18665 + }, + { + "epoch": 1.05, + "grad_norm": 5.305465192759861, + "learning_rate": 8.205350359666328e-06, + "loss": 0.9427, + "step": 18670 + }, + { + "epoch": 1.05, + "grad_norm": 7.213564348863166, + "learning_rate": 8.204092274682234e-06, + "loss": 0.9364, + "step": 18675 + }, + { + "epoch": 1.05, + "grad_norm": 8.333924477842656, + "learning_rate": 8.202833845404283e-06, + "loss": 0.9285, + "step": 18680 + }, + { + "epoch": 1.05, + "grad_norm": 5.860249703771888, + "learning_rate": 8.201575071967702e-06, + "loss": 0.9049, + "step": 18685 + }, + { + "epoch": 1.05, + "grad_norm": 8.474337294939785, + "learning_rate": 8.20031595450775e-06, + "loss": 0.9035, + "step": 18690 + }, + { + "epoch": 1.05, + "grad_norm": 6.196275356019304, + "learning_rate": 8.199056493159723e-06, + "loss": 0.964, + "step": 18695 + }, + { + "epoch": 1.05, + "grad_norm": 6.4508756345650315, + "learning_rate": 8.197796688058958e-06, + "loss": 0.9324, + "step": 18700 + }, + { + "epoch": 1.05, + "grad_norm": 9.844182868137905, + "learning_rate": 8.196536539340826e-06, + "loss": 0.9888, + "step": 18705 + }, + { + "epoch": 1.05, + "grad_norm": 18.547983933573764, + "learning_rate": 8.195276047140736e-06, + "loss": 1.0005, + "step": 18710 + }, + { + "epoch": 1.05, + "grad_norm": 13.488927970940656, + "learning_rate": 8.194015211594132e-06, + "loss": 0.9491, + "step": 18715 + }, + { + "epoch": 1.05, + "grad_norm": 8.705863112768338, + "learning_rate": 8.192754032836497e-06, + "loss": 0.9034, + "step": 18720 + }, + { + "epoch": 1.06, + "grad_norm": 9.145966580637015, + "learning_rate": 8.191492511003352e-06, + "loss": 0.9655, + "step": 18725 + }, + { + "epoch": 1.06, + "grad_norm": 6.20031713580108, + "learning_rate": 8.19023064623025e-06, + "loss": 0.9553, + "step": 18730 + }, + { + "epoch": 1.06, + "grad_norm": 13.639991001447758, + "learning_rate": 8.188968438652786e-06, + "loss": 0.9624, + "step": 18735 + }, + { + "epoch": 1.06, + "grad_norm": 21.608267164285888, + "learning_rate": 8.18770588840659e-06, + "loss": 0.9002, + "step": 18740 + }, + { + "epoch": 1.06, + "grad_norm": 5.945583228113203, + "learning_rate": 8.186442995627326e-06, + "loss": 0.9304, + "step": 18745 + }, + { + "epoch": 1.06, + "grad_norm": 28.34890652585614, + "learning_rate": 8.185179760450701e-06, + "loss": 0.9739, + "step": 18750 + }, + { + "epoch": 1.06, + "grad_norm": 6.069300970650084, + "learning_rate": 8.183916183012454e-06, + "loss": 0.8806, + "step": 18755 + }, + { + "epoch": 1.06, + "grad_norm": 36.11615174510507, + "learning_rate": 8.18265226344836e-06, + "loss": 0.9402, + "step": 18760 + }, + { + "epoch": 1.06, + "grad_norm": 16.215858061573503, + "learning_rate": 8.181388001894235e-06, + "loss": 0.9577, + "step": 18765 + }, + { + "epoch": 1.06, + "grad_norm": 22.219075572387343, + "learning_rate": 8.180123398485928e-06, + "loss": 0.9962, + "step": 18770 + }, + { + "epoch": 1.06, + "grad_norm": 30.866794792385654, + "learning_rate": 8.178858453359325e-06, + "loss": 0.9723, + "step": 18775 + }, + { + "epoch": 1.06, + "grad_norm": 9.349053682449533, + "learning_rate": 8.177593166650355e-06, + "loss": 1.0173, + "step": 18780 + }, + { + "epoch": 1.06, + "grad_norm": 18.764661962599007, + "learning_rate": 8.176327538494974e-06, + "loss": 0.9737, + "step": 18785 + }, + { + "epoch": 1.06, + "grad_norm": 5.157296726846901, + "learning_rate": 8.175061569029184e-06, + "loss": 0.967, + "step": 18790 + }, + { + "epoch": 1.06, + "grad_norm": 9.996580215077445, + "learning_rate": 8.17379525838901e-06, + "loss": 0.9716, + "step": 18795 + }, + { + "epoch": 1.06, + "grad_norm": 6.957251562807729, + "learning_rate": 8.172528606710532e-06, + "loss": 0.9563, + "step": 18800 + }, + { + "epoch": 1.06, + "grad_norm": 7.233281480523723, + "learning_rate": 8.171261614129853e-06, + "loss": 0.9245, + "step": 18805 + }, + { + "epoch": 1.06, + "grad_norm": 5.3928794184713915, + "learning_rate": 8.169994280783118e-06, + "loss": 0.9789, + "step": 18810 + }, + { + "epoch": 1.06, + "grad_norm": 7.849178797058468, + "learning_rate": 8.168726606806509e-06, + "loss": 0.941, + "step": 18815 + }, + { + "epoch": 1.06, + "grad_norm": 4.978465793251911, + "learning_rate": 8.16745859233624e-06, + "loss": 0.9328, + "step": 18820 + }, + { + "epoch": 1.06, + "grad_norm": 6.995401138400932, + "learning_rate": 8.166190237508564e-06, + "loss": 0.976, + "step": 18825 + }, + { + "epoch": 1.06, + "grad_norm": 15.280665478113042, + "learning_rate": 8.164921542459775e-06, + "loss": 0.9521, + "step": 18830 + }, + { + "epoch": 1.06, + "grad_norm": 6.471328976224394, + "learning_rate": 8.1636525073262e-06, + "loss": 0.9743, + "step": 18835 + }, + { + "epoch": 1.06, + "grad_norm": 12.82507813033722, + "learning_rate": 8.1623831322442e-06, + "loss": 1.003, + "step": 18840 + }, + { + "epoch": 1.06, + "grad_norm": 13.591937122079878, + "learning_rate": 8.161113417350175e-06, + "loss": 0.949, + "step": 18845 + }, + { + "epoch": 1.06, + "grad_norm": 5.920017579385835, + "learning_rate": 8.159843362780561e-06, + "loss": 0.9835, + "step": 18850 + }, + { + "epoch": 1.06, + "grad_norm": 6.151195001545371, + "learning_rate": 8.158572968671836e-06, + "loss": 0.9278, + "step": 18855 + }, + { + "epoch": 1.06, + "grad_norm": 6.16530122699321, + "learning_rate": 8.157302235160503e-06, + "loss": 0.9589, + "step": 18860 + }, + { + "epoch": 1.06, + "grad_norm": 5.795817208230676, + "learning_rate": 8.156031162383112e-06, + "loss": 0.9317, + "step": 18865 + }, + { + "epoch": 1.06, + "grad_norm": 5.669749041217251, + "learning_rate": 8.154759750476243e-06, + "loss": 0.9238, + "step": 18870 + }, + { + "epoch": 1.06, + "grad_norm": 6.223008124934531, + "learning_rate": 8.153487999576515e-06, + "loss": 0.9893, + "step": 18875 + }, + { + "epoch": 1.06, + "grad_norm": 6.195875131379625, + "learning_rate": 8.152215909820585e-06, + "loss": 0.9693, + "step": 18880 + }, + { + "epoch": 1.06, + "grad_norm": 11.460823359114162, + "learning_rate": 8.150943481345144e-06, + "loss": 0.9553, + "step": 18885 + }, + { + "epoch": 1.06, + "grad_norm": 6.194043017006326, + "learning_rate": 8.149670714286918e-06, + "loss": 0.9884, + "step": 18890 + }, + { + "epoch": 1.06, + "grad_norm": 6.370120778961474, + "learning_rate": 8.148397608782674e-06, + "loss": 0.9534, + "step": 18895 + }, + { + "epoch": 1.06, + "grad_norm": 5.44001417886006, + "learning_rate": 8.147124164969215e-06, + "loss": 0.9226, + "step": 18900 + }, + { + "epoch": 1.07, + "grad_norm": 5.367211857488165, + "learning_rate": 8.145850382983371e-06, + "loss": 0.97, + "step": 18905 + }, + { + "epoch": 1.07, + "grad_norm": 5.771808296700199, + "learning_rate": 8.144576262962021e-06, + "loss": 0.9357, + "step": 18910 + }, + { + "epoch": 1.07, + "grad_norm": 8.366267655412884, + "learning_rate": 8.143301805042075e-06, + "loss": 0.9368, + "step": 18915 + }, + { + "epoch": 1.07, + "grad_norm": 9.554192544212176, + "learning_rate": 8.142027009360476e-06, + "loss": 0.9708, + "step": 18920 + }, + { + "epoch": 1.07, + "grad_norm": 6.221417115603707, + "learning_rate": 8.14075187605421e-06, + "loss": 0.9195, + "step": 18925 + }, + { + "epoch": 1.07, + "grad_norm": 6.9617101973673, + "learning_rate": 8.139476405260291e-06, + "loss": 0.9513, + "step": 18930 + }, + { + "epoch": 1.07, + "grad_norm": 9.965495849954547, + "learning_rate": 8.138200597115777e-06, + "loss": 0.9652, + "step": 18935 + }, + { + "epoch": 1.07, + "grad_norm": 11.128574959625924, + "learning_rate": 8.13692445175776e-06, + "loss": 0.9638, + "step": 18940 + }, + { + "epoch": 1.07, + "grad_norm": 11.027231065166058, + "learning_rate": 8.135647969323365e-06, + "loss": 0.9341, + "step": 18945 + }, + { + "epoch": 1.07, + "grad_norm": 10.670297375594766, + "learning_rate": 8.13437114994976e-06, + "loss": 0.906, + "step": 18950 + }, + { + "epoch": 1.07, + "grad_norm": 8.137314314312876, + "learning_rate": 8.133093993774139e-06, + "loss": 0.9647, + "step": 18955 + }, + { + "epoch": 1.07, + "grad_norm": 4.7281437061946265, + "learning_rate": 8.131816500933743e-06, + "loss": 0.9215, + "step": 18960 + }, + { + "epoch": 1.07, + "grad_norm": 23.789460071290314, + "learning_rate": 8.130538671565842e-06, + "loss": 0.9854, + "step": 18965 + }, + { + "epoch": 1.07, + "grad_norm": 30.65871741662231, + "learning_rate": 8.129260505807743e-06, + "loss": 0.9303, + "step": 18970 + }, + { + "epoch": 1.07, + "grad_norm": 6.632581448626001, + "learning_rate": 8.127982003796793e-06, + "loss": 0.9178, + "step": 18975 + }, + { + "epoch": 1.07, + "grad_norm": 17.618529898929967, + "learning_rate": 8.126703165670372e-06, + "loss": 1.0079, + "step": 18980 + }, + { + "epoch": 1.07, + "grad_norm": 6.324825991342937, + "learning_rate": 8.125423991565897e-06, + "loss": 0.9703, + "step": 18985 + }, + { + "epoch": 1.07, + "grad_norm": 5.506536800116093, + "learning_rate": 8.124144481620819e-06, + "loss": 0.9108, + "step": 18990 + }, + { + "epoch": 1.07, + "grad_norm": 7.205021872546387, + "learning_rate": 8.12286463597263e-06, + "loss": 0.9596, + "step": 18995 + }, + { + "epoch": 1.07, + "grad_norm": 8.12751175739675, + "learning_rate": 8.121584454758853e-06, + "loss": 0.9372, + "step": 19000 + }, + { + "epoch": 1.07, + "grad_norm": 14.204895926529103, + "learning_rate": 8.12030393811705e-06, + "loss": 0.9751, + "step": 19005 + }, + { + "epoch": 1.07, + "grad_norm": 7.641658872199885, + "learning_rate": 8.119023086184819e-06, + "loss": 0.9427, + "step": 19010 + }, + { + "epoch": 1.07, + "grad_norm": 8.460254379406491, + "learning_rate": 8.117741899099792e-06, + "loss": 0.9642, + "step": 19015 + }, + { + "epoch": 1.07, + "grad_norm": 9.496584675510707, + "learning_rate": 8.116460376999635e-06, + "loss": 0.9969, + "step": 19020 + }, + { + "epoch": 1.07, + "grad_norm": 5.332517697442171, + "learning_rate": 8.11517852002206e-06, + "loss": 0.9598, + "step": 19025 + }, + { + "epoch": 1.07, + "grad_norm": 14.114972784844499, + "learning_rate": 8.113896328304803e-06, + "loss": 0.9483, + "step": 19030 + }, + { + "epoch": 1.07, + "grad_norm": 9.348835767403838, + "learning_rate": 8.112613801985643e-06, + "loss": 0.9471, + "step": 19035 + }, + { + "epoch": 1.07, + "grad_norm": 30.686926455930507, + "learning_rate": 8.111330941202394e-06, + "loss": 0.9906, + "step": 19040 + }, + { + "epoch": 1.07, + "grad_norm": 64.61487049476504, + "learning_rate": 8.110047746092902e-06, + "loss": 0.9905, + "step": 19045 + }, + { + "epoch": 1.07, + "grad_norm": 26.250353886694356, + "learning_rate": 8.108764216795056e-06, + "loss": 0.9956, + "step": 19050 + }, + { + "epoch": 1.07, + "grad_norm": 60.48397806017447, + "learning_rate": 8.107480353446774e-06, + "loss": 1.0264, + "step": 19055 + }, + { + "epoch": 1.07, + "grad_norm": 15.496287744009525, + "learning_rate": 8.106196156186014e-06, + "loss": 0.9552, + "step": 19060 + }, + { + "epoch": 1.07, + "grad_norm": 46.872428608283386, + "learning_rate": 8.104911625150769e-06, + "loss": 1.0021, + "step": 19065 + }, + { + "epoch": 1.07, + "grad_norm": 12.489594361741183, + "learning_rate": 8.103626760479066e-06, + "loss": 0.9865, + "step": 19070 + }, + { + "epoch": 1.07, + "grad_norm": 25.851004185686303, + "learning_rate": 8.10234156230897e-06, + "loss": 0.9875, + "step": 19075 + }, + { + "epoch": 1.08, + "grad_norm": 7.685957859549898, + "learning_rate": 8.101056030778583e-06, + "loss": 0.961, + "step": 19080 + }, + { + "epoch": 1.08, + "grad_norm": 18.472298903540388, + "learning_rate": 8.099770166026038e-06, + "loss": 0.965, + "step": 19085 + }, + { + "epoch": 1.08, + "grad_norm": 15.045807887758574, + "learning_rate": 8.098483968189508e-06, + "loss": 1.0052, + "step": 19090 + }, + { + "epoch": 1.08, + "grad_norm": 6.657552807974352, + "learning_rate": 8.0971974374072e-06, + "loss": 0.9837, + "step": 19095 + }, + { + "epoch": 1.08, + "grad_norm": 8.024360037175807, + "learning_rate": 8.09591057381736e-06, + "loss": 0.9882, + "step": 19100 + }, + { + "epoch": 1.08, + "grad_norm": 6.421729973972606, + "learning_rate": 8.094623377558267e-06, + "loss": 0.9836, + "step": 19105 + }, + { + "epoch": 1.08, + "grad_norm": 5.870301147408619, + "learning_rate": 8.093335848768233e-06, + "loss": 0.9749, + "step": 19110 + }, + { + "epoch": 1.08, + "grad_norm": 5.307985263352527, + "learning_rate": 8.09204798758561e-06, + "loss": 0.9897, + "step": 19115 + }, + { + "epoch": 1.08, + "grad_norm": 30.308768334493212, + "learning_rate": 8.090759794148784e-06, + "loss": 0.9734, + "step": 19120 + }, + { + "epoch": 1.08, + "grad_norm": 11.812322916269569, + "learning_rate": 8.089471268596179e-06, + "loss": 0.9719, + "step": 19125 + }, + { + "epoch": 1.08, + "grad_norm": 18.281872072100057, + "learning_rate": 8.08818241106625e-06, + "loss": 0.9711, + "step": 19130 + }, + { + "epoch": 1.08, + "grad_norm": 17.350191912606665, + "learning_rate": 8.086893221697492e-06, + "loss": 0.9444, + "step": 19135 + }, + { + "epoch": 1.08, + "grad_norm": 7.905795692103847, + "learning_rate": 8.085603700628433e-06, + "loss": 0.9115, + "step": 19140 + }, + { + "epoch": 1.08, + "grad_norm": 7.494627237059104, + "learning_rate": 8.08431384799764e-06, + "loss": 0.8975, + "step": 19145 + }, + { + "epoch": 1.08, + "grad_norm": 7.504272272834612, + "learning_rate": 8.083023663943711e-06, + "loss": 0.9935, + "step": 19150 + }, + { + "epoch": 1.08, + "grad_norm": 10.466082915234127, + "learning_rate": 8.081733148605284e-06, + "loss": 0.9135, + "step": 19155 + }, + { + "epoch": 1.08, + "grad_norm": 6.128940782938963, + "learning_rate": 8.080442302121028e-06, + "loss": 0.9419, + "step": 19160 + }, + { + "epoch": 1.08, + "grad_norm": 14.905885961095287, + "learning_rate": 8.079151124629651e-06, + "loss": 0.9847, + "step": 19165 + }, + { + "epoch": 1.08, + "grad_norm": 5.14963058167278, + "learning_rate": 8.077859616269899e-06, + "loss": 0.9679, + "step": 19170 + }, + { + "epoch": 1.08, + "grad_norm": 16.156502034570472, + "learning_rate": 8.076567777180547e-06, + "loss": 0.9999, + "step": 19175 + }, + { + "epoch": 1.08, + "grad_norm": 4.999590755179119, + "learning_rate": 8.075275607500407e-06, + "loss": 0.9442, + "step": 19180 + }, + { + "epoch": 1.08, + "grad_norm": 12.474147826255185, + "learning_rate": 8.073983107368334e-06, + "loss": 0.9343, + "step": 19185 + }, + { + "epoch": 1.08, + "grad_norm": 16.978192324670545, + "learning_rate": 8.072690276923208e-06, + "loss": 0.9922, + "step": 19190 + }, + { + "epoch": 1.08, + "grad_norm": 8.040302910399815, + "learning_rate": 8.07139711630395e-06, + "loss": 0.9598, + "step": 19195 + }, + { + "epoch": 1.08, + "grad_norm": 7.3148028608519375, + "learning_rate": 8.070103625649518e-06, + "loss": 0.9404, + "step": 19200 + }, + { + "epoch": 1.08, + "grad_norm": 27.689607326731224, + "learning_rate": 8.068809805098903e-06, + "loss": 0.9692, + "step": 19205 + }, + { + "epoch": 1.08, + "grad_norm": 17.275963124357617, + "learning_rate": 8.06751565479113e-06, + "loss": 0.9202, + "step": 19210 + }, + { + "epoch": 1.08, + "grad_norm": 5.496705319574194, + "learning_rate": 8.066221174865261e-06, + "loss": 0.9988, + "step": 19215 + }, + { + "epoch": 1.08, + "grad_norm": 31.37702244184853, + "learning_rate": 8.064926365460394e-06, + "loss": 0.9333, + "step": 19220 + }, + { + "epoch": 1.08, + "grad_norm": 6.7848024561920335, + "learning_rate": 8.063631226715663e-06, + "loss": 0.8728, + "step": 19225 + }, + { + "epoch": 1.08, + "grad_norm": 8.981401505795201, + "learning_rate": 8.062335758770237e-06, + "loss": 0.9123, + "step": 19230 + }, + { + "epoch": 1.08, + "grad_norm": 5.463761471424651, + "learning_rate": 8.061039961763317e-06, + "loss": 0.9598, + "step": 19235 + }, + { + "epoch": 1.08, + "grad_norm": 10.227621295533597, + "learning_rate": 8.059743835834146e-06, + "loss": 0.9562, + "step": 19240 + }, + { + "epoch": 1.08, + "grad_norm": 22.251037260477297, + "learning_rate": 8.058447381121994e-06, + "loss": 0.9289, + "step": 19245 + }, + { + "epoch": 1.08, + "grad_norm": 5.459511512872369, + "learning_rate": 8.057150597766175e-06, + "loss": 0.9958, + "step": 19250 + }, + { + "epoch": 1.08, + "grad_norm": 6.330265059106776, + "learning_rate": 8.055853485906029e-06, + "loss": 0.9903, + "step": 19255 + }, + { + "epoch": 1.09, + "grad_norm": 8.670618505402853, + "learning_rate": 8.054556045680941e-06, + "loss": 0.9569, + "step": 19260 + }, + { + "epoch": 1.09, + "grad_norm": 16.362562558245227, + "learning_rate": 8.053258277230324e-06, + "loss": 0.9019, + "step": 19265 + }, + { + "epoch": 1.09, + "grad_norm": 9.88995697002873, + "learning_rate": 8.051960180693633e-06, + "loss": 0.9523, + "step": 19270 + }, + { + "epoch": 1.09, + "grad_norm": 7.741444655475432, + "learning_rate": 8.05066175621035e-06, + "loss": 0.9264, + "step": 19275 + }, + { + "epoch": 1.09, + "grad_norm": 6.072142501407967, + "learning_rate": 8.049363003919997e-06, + "loss": 0.946, + "step": 19280 + }, + { + "epoch": 1.09, + "grad_norm": 7.649458524454594, + "learning_rate": 8.04806392396213e-06, + "loss": 0.9676, + "step": 19285 + }, + { + "epoch": 1.09, + "grad_norm": 10.008869414362337, + "learning_rate": 8.046764516476344e-06, + "loss": 0.9662, + "step": 19290 + }, + { + "epoch": 1.09, + "grad_norm": 9.940476007854292, + "learning_rate": 8.045464781602265e-06, + "loss": 0.9511, + "step": 19295 + }, + { + "epoch": 1.09, + "grad_norm": 5.533736787794461, + "learning_rate": 8.044164719479555e-06, + "loss": 0.9082, + "step": 19300 + }, + { + "epoch": 1.09, + "grad_norm": 8.97256028421361, + "learning_rate": 8.042864330247908e-06, + "loss": 0.9621, + "step": 19305 + }, + { + "epoch": 1.09, + "grad_norm": 6.719141334024216, + "learning_rate": 8.041563614047062e-06, + "loss": 0.965, + "step": 19310 + }, + { + "epoch": 1.09, + "grad_norm": 5.192133825637401, + "learning_rate": 8.040262571016781e-06, + "loss": 0.959, + "step": 19315 + }, + { + "epoch": 1.09, + "grad_norm": 6.743848115116697, + "learning_rate": 8.03896120129687e-06, + "loss": 0.944, + "step": 19320 + }, + { + "epoch": 1.09, + "grad_norm": 7.112480487988186, + "learning_rate": 8.037659505027166e-06, + "loss": 0.9175, + "step": 19325 + }, + { + "epoch": 1.09, + "grad_norm": 8.368858588572888, + "learning_rate": 8.03635748234754e-06, + "loss": 0.9366, + "step": 19330 + }, + { + "epoch": 1.09, + "grad_norm": 11.698209830999122, + "learning_rate": 8.035055133397905e-06, + "loss": 0.9499, + "step": 19335 + }, + { + "epoch": 1.09, + "grad_norm": 10.464124207181301, + "learning_rate": 8.033752458318199e-06, + "loss": 0.9452, + "step": 19340 + }, + { + "epoch": 1.09, + "grad_norm": 14.195499561112465, + "learning_rate": 8.032449457248402e-06, + "loss": 0.9832, + "step": 19345 + }, + { + "epoch": 1.09, + "grad_norm": 30.492675598290408, + "learning_rate": 8.031146130328529e-06, + "loss": 0.9359, + "step": 19350 + }, + { + "epoch": 1.09, + "grad_norm": 11.567005081859069, + "learning_rate": 8.029842477698627e-06, + "loss": 0.9332, + "step": 19355 + }, + { + "epoch": 1.09, + "grad_norm": 8.461193392688758, + "learning_rate": 8.02853849949878e-06, + "loss": 1.0088, + "step": 19360 + }, + { + "epoch": 1.09, + "grad_norm": 11.56565444359108, + "learning_rate": 8.027234195869103e-06, + "loss": 0.9748, + "step": 19365 + }, + { + "epoch": 1.09, + "grad_norm": 15.050053366170115, + "learning_rate": 8.025929566949751e-06, + "loss": 0.9733, + "step": 19370 + }, + { + "epoch": 1.09, + "grad_norm": 6.653637879963531, + "learning_rate": 8.024624612880913e-06, + "loss": 0.9393, + "step": 19375 + }, + { + "epoch": 1.09, + "grad_norm": 6.798955707180229, + "learning_rate": 8.023319333802813e-06, + "loss": 0.9355, + "step": 19380 + }, + { + "epoch": 1.09, + "grad_norm": 7.565570627134732, + "learning_rate": 8.022013729855708e-06, + "loss": 0.9703, + "step": 19385 + }, + { + "epoch": 1.09, + "grad_norm": 7.826299823673436, + "learning_rate": 8.02070780117989e-06, + "loss": 0.9325, + "step": 19390 + }, + { + "epoch": 1.09, + "grad_norm": 7.0159554325501094, + "learning_rate": 8.019401547915685e-06, + "loss": 0.9844, + "step": 19395 + }, + { + "epoch": 1.09, + "grad_norm": 6.261569313375878, + "learning_rate": 8.018094970203461e-06, + "loss": 0.9494, + "step": 19400 + }, + { + "epoch": 1.09, + "grad_norm": 10.004110579814553, + "learning_rate": 8.016788068183613e-06, + "loss": 0.9833, + "step": 19405 + }, + { + "epoch": 1.09, + "grad_norm": 7.451316943774587, + "learning_rate": 8.01548084199657e-06, + "loss": 0.967, + "step": 19410 + }, + { + "epoch": 1.09, + "grad_norm": 7.630238035382916, + "learning_rate": 8.014173291782806e-06, + "loss": 0.9702, + "step": 19415 + }, + { + "epoch": 1.09, + "grad_norm": 24.126467039884865, + "learning_rate": 8.012865417682815e-06, + "loss": 0.951, + "step": 19420 + }, + { + "epoch": 1.09, + "grad_norm": 6.683487608165553, + "learning_rate": 8.011557219837141e-06, + "loss": 0.9963, + "step": 19425 + }, + { + "epoch": 1.09, + "grad_norm": 10.349705191691237, + "learning_rate": 8.010248698386352e-06, + "loss": 0.9462, + "step": 19430 + }, + { + "epoch": 1.1, + "grad_norm": 12.711331585817193, + "learning_rate": 8.008939853471055e-06, + "loss": 0.9605, + "step": 19435 + }, + { + "epoch": 1.1, + "grad_norm": 18.98077388159023, + "learning_rate": 8.007630685231892e-06, + "loss": 0.9497, + "step": 19440 + }, + { + "epoch": 1.1, + "grad_norm": 17.91240858152615, + "learning_rate": 8.006321193809538e-06, + "loss": 0.9629, + "step": 19445 + }, + { + "epoch": 1.1, + "grad_norm": 8.066176753467188, + "learning_rate": 8.005011379344702e-06, + "loss": 0.9564, + "step": 19450 + }, + { + "epoch": 1.1, + "grad_norm": 17.698201041761696, + "learning_rate": 8.003701241978133e-06, + "loss": 1.0054, + "step": 19455 + }, + { + "epoch": 1.1, + "grad_norm": 7.421053631687196, + "learning_rate": 8.002390781850608e-06, + "loss": 0.9293, + "step": 19460 + }, + { + "epoch": 1.1, + "grad_norm": 7.13309559011947, + "learning_rate": 8.001079999102944e-06, + "loss": 0.953, + "step": 19465 + }, + { + "epoch": 1.1, + "grad_norm": 13.496890683342123, + "learning_rate": 7.99976889387599e-06, + "loss": 0.9352, + "step": 19470 + }, + { + "epoch": 1.1, + "grad_norm": 14.474961844980022, + "learning_rate": 7.998457466310627e-06, + "loss": 0.9317, + "step": 19475 + }, + { + "epoch": 1.1, + "grad_norm": 6.439084447002402, + "learning_rate": 7.997145716547777e-06, + "loss": 0.992, + "step": 19480 + }, + { + "epoch": 1.1, + "grad_norm": 24.455108082467582, + "learning_rate": 7.995833644728392e-06, + "loss": 0.9719, + "step": 19485 + }, + { + "epoch": 1.1, + "grad_norm": 6.766452783205073, + "learning_rate": 7.994521250993462e-06, + "loss": 0.9482, + "step": 19490 + }, + { + "epoch": 1.1, + "grad_norm": 36.68519370259944, + "learning_rate": 7.993208535484007e-06, + "loss": 0.9433, + "step": 19495 + }, + { + "epoch": 1.1, + "grad_norm": 8.86956301167978, + "learning_rate": 7.991895498341084e-06, + "loss": 0.9447, + "step": 19500 + }, + { + "epoch": 1.1, + "grad_norm": 25.656579481434097, + "learning_rate": 7.990582139705788e-06, + "loss": 0.9273, + "step": 19505 + }, + { + "epoch": 1.1, + "grad_norm": 11.770256508544099, + "learning_rate": 7.98926845971924e-06, + "loss": 0.9583, + "step": 19510 + }, + { + "epoch": 1.1, + "grad_norm": 6.550009874566956, + "learning_rate": 7.987954458522602e-06, + "loss": 0.9801, + "step": 19515 + }, + { + "epoch": 1.1, + "grad_norm": 18.906244823367583, + "learning_rate": 7.986640136257073e-06, + "loss": 0.8999, + "step": 19520 + }, + { + "epoch": 1.1, + "grad_norm": 6.8235374691748785, + "learning_rate": 7.98532549306388e-06, + "loss": 0.9643, + "step": 19525 + }, + { + "epoch": 1.1, + "grad_norm": 10.699269384621902, + "learning_rate": 7.984010529084288e-06, + "loss": 0.8983, + "step": 19530 + }, + { + "epoch": 1.1, + "grad_norm": 6.227718940194455, + "learning_rate": 7.982695244459596e-06, + "loss": 0.9525, + "step": 19535 + }, + { + "epoch": 1.1, + "grad_norm": 9.713407954850574, + "learning_rate": 7.981379639331133e-06, + "loss": 0.97, + "step": 19540 + }, + { + "epoch": 1.1, + "grad_norm": 6.503803093212527, + "learning_rate": 7.980063713840271e-06, + "loss": 0.9277, + "step": 19545 + }, + { + "epoch": 1.1, + "grad_norm": 10.05706236845638, + "learning_rate": 7.978747468128412e-06, + "loss": 0.9435, + "step": 19550 + }, + { + "epoch": 1.1, + "grad_norm": 5.36127654938038, + "learning_rate": 7.977430902336992e-06, + "loss": 0.9456, + "step": 19555 + }, + { + "epoch": 1.1, + "grad_norm": 6.428830630164985, + "learning_rate": 7.97611401660748e-06, + "loss": 0.9216, + "step": 19560 + }, + { + "epoch": 1.1, + "grad_norm": 11.011920096570101, + "learning_rate": 7.974796811081382e-06, + "loss": 0.9783, + "step": 19565 + }, + { + "epoch": 1.1, + "grad_norm": 5.199607915822191, + "learning_rate": 7.973479285900239e-06, + "loss": 0.8955, + "step": 19570 + }, + { + "epoch": 1.1, + "grad_norm": 8.890940547817857, + "learning_rate": 7.972161441205623e-06, + "loss": 0.9498, + "step": 19575 + }, + { + "epoch": 1.1, + "grad_norm": 13.70694765558659, + "learning_rate": 7.970843277139142e-06, + "loss": 0.9534, + "step": 19580 + }, + { + "epoch": 1.1, + "grad_norm": 5.305971650187844, + "learning_rate": 7.969524793842442e-06, + "loss": 0.964, + "step": 19585 + }, + { + "epoch": 1.1, + "grad_norm": 16.13277345477748, + "learning_rate": 7.968205991457196e-06, + "loss": 0.9355, + "step": 19590 + }, + { + "epoch": 1.1, + "grad_norm": 15.825923515805263, + "learning_rate": 7.966886870125117e-06, + "loss": 0.9677, + "step": 19595 + }, + { + "epoch": 1.1, + "grad_norm": 20.975135735608088, + "learning_rate": 7.96556742998795e-06, + "loss": 0.9352, + "step": 19600 + }, + { + "epoch": 1.1, + "grad_norm": 9.591421233230836, + "learning_rate": 7.964247671187474e-06, + "loss": 0.9637, + "step": 19605 + }, + { + "epoch": 1.1, + "grad_norm": 12.718634106108007, + "learning_rate": 7.962927593865504e-06, + "loss": 0.9085, + "step": 19610 + }, + { + "epoch": 1.11, + "grad_norm": 16.955638635767677, + "learning_rate": 7.961607198163888e-06, + "loss": 0.9228, + "step": 19615 + }, + { + "epoch": 1.11, + "grad_norm": 43.602053972987015, + "learning_rate": 7.960286484224506e-06, + "loss": 0.9427, + "step": 19620 + }, + { + "epoch": 1.11, + "grad_norm": 12.16533227672473, + "learning_rate": 7.958965452189278e-06, + "loss": 0.9279, + "step": 19625 + }, + { + "epoch": 1.11, + "grad_norm": 18.829481821488702, + "learning_rate": 7.957644102200153e-06, + "loss": 0.9973, + "step": 19630 + }, + { + "epoch": 1.11, + "grad_norm": 6.970687551034499, + "learning_rate": 7.956322434399116e-06, + "loss": 0.9837, + "step": 19635 + }, + { + "epoch": 1.11, + "grad_norm": 18.190972374786007, + "learning_rate": 7.955000448928188e-06, + "loss": 0.9744, + "step": 19640 + }, + { + "epoch": 1.11, + "grad_norm": 12.588181451304402, + "learning_rate": 7.953678145929417e-06, + "loss": 0.926, + "step": 19645 + }, + { + "epoch": 1.11, + "grad_norm": 5.381918459585516, + "learning_rate": 7.952355525544896e-06, + "loss": 0.9735, + "step": 19650 + }, + { + "epoch": 1.11, + "grad_norm": 9.80693524917704, + "learning_rate": 7.951032587916742e-06, + "loss": 1.0078, + "step": 19655 + }, + { + "epoch": 1.11, + "grad_norm": 5.3503338628040575, + "learning_rate": 7.949709333187116e-06, + "loss": 0.959, + "step": 19660 + }, + { + "epoch": 1.11, + "grad_norm": 7.237006720925747, + "learning_rate": 7.9483857614982e-06, + "loss": 0.9856, + "step": 19665 + }, + { + "epoch": 1.11, + "grad_norm": 13.816087659148016, + "learning_rate": 7.947061872992226e-06, + "loss": 0.9693, + "step": 19670 + }, + { + "epoch": 1.11, + "grad_norm": 7.677939183677492, + "learning_rate": 7.945737667811444e-06, + "loss": 0.9488, + "step": 19675 + }, + { + "epoch": 1.11, + "grad_norm": 5.773405593237955, + "learning_rate": 7.94441314609815e-06, + "loss": 0.8974, + "step": 19680 + }, + { + "epoch": 1.11, + "grad_norm": 10.885395413739726, + "learning_rate": 7.94308830799467e-06, + "loss": 0.9466, + "step": 19685 + }, + { + "epoch": 1.11, + "grad_norm": 8.230162314206446, + "learning_rate": 7.94176315364336e-06, + "loss": 0.9619, + "step": 19690 + }, + { + "epoch": 1.11, + "grad_norm": 9.010389732362329, + "learning_rate": 7.94043768318662e-06, + "loss": 0.9265, + "step": 19695 + }, + { + "epoch": 1.11, + "grad_norm": 8.326671113480245, + "learning_rate": 7.939111896766871e-06, + "loss": 0.9616, + "step": 19700 + }, + { + "epoch": 1.11, + "grad_norm": 7.940817824927529, + "learning_rate": 7.937785794526579e-06, + "loss": 0.9498, + "step": 19705 + }, + { + "epoch": 1.11, + "grad_norm": 7.123994633748377, + "learning_rate": 7.936459376608235e-06, + "loss": 0.9683, + "step": 19710 + }, + { + "epoch": 1.11, + "grad_norm": 6.732863798232363, + "learning_rate": 7.935132643154372e-06, + "loss": 0.9558, + "step": 19715 + }, + { + "epoch": 1.11, + "grad_norm": 21.54114130340925, + "learning_rate": 7.933805594307554e-06, + "loss": 0.9622, + "step": 19720 + }, + { + "epoch": 1.11, + "grad_norm": 14.338570610213885, + "learning_rate": 7.932478230210379e-06, + "loss": 0.9975, + "step": 19725 + }, + { + "epoch": 1.11, + "grad_norm": 7.1643460718766, + "learning_rate": 7.931150551005474e-06, + "loss": 0.9754, + "step": 19730 + }, + { + "epoch": 1.11, + "grad_norm": 6.794767634573762, + "learning_rate": 7.929822556835504e-06, + "loss": 0.9992, + "step": 19735 + }, + { + "epoch": 1.11, + "grad_norm": 6.358733421042912, + "learning_rate": 7.928494247843173e-06, + "loss": 0.962, + "step": 19740 + }, + { + "epoch": 1.11, + "grad_norm": 6.441516126333959, + "learning_rate": 7.92716562417121e-06, + "loss": 0.9343, + "step": 19745 + }, + { + "epoch": 1.11, + "grad_norm": 5.005697348918134, + "learning_rate": 7.925836685962381e-06, + "loss": 0.94, + "step": 19750 + }, + { + "epoch": 1.11, + "grad_norm": 5.954779456667186, + "learning_rate": 7.92450743335949e-06, + "loss": 0.9297, + "step": 19755 + }, + { + "epoch": 1.11, + "grad_norm": 21.96543735644006, + "learning_rate": 7.923177866505367e-06, + "loss": 0.9999, + "step": 19760 + }, + { + "epoch": 1.11, + "grad_norm": 14.439114656591249, + "learning_rate": 7.92184798554288e-06, + "loss": 0.9381, + "step": 19765 + }, + { + "epoch": 1.11, + "grad_norm": 11.437159688487576, + "learning_rate": 7.920517790614932e-06, + "loss": 0.9548, + "step": 19770 + }, + { + "epoch": 1.11, + "grad_norm": 18.31164624093142, + "learning_rate": 7.91918728186446e-06, + "loss": 0.9739, + "step": 19775 + }, + { + "epoch": 1.11, + "grad_norm": 7.568427451030278, + "learning_rate": 7.91785645943443e-06, + "loss": 0.9471, + "step": 19780 + }, + { + "epoch": 1.11, + "grad_norm": 25.524585604423766, + "learning_rate": 7.916525323467847e-06, + "loss": 0.9716, + "step": 19785 + }, + { + "epoch": 1.12, + "grad_norm": 35.19814614535263, + "learning_rate": 7.915193874107745e-06, + "loss": 0.8945, + "step": 19790 + }, + { + "epoch": 1.12, + "grad_norm": 10.81733098741581, + "learning_rate": 7.913862111497197e-06, + "loss": 0.9891, + "step": 19795 + }, + { + "epoch": 1.12, + "grad_norm": 16.765431762253932, + "learning_rate": 7.912530035779305e-06, + "loss": 0.9364, + "step": 19800 + }, + { + "epoch": 1.12, + "grad_norm": 11.650495804124496, + "learning_rate": 7.911197647097205e-06, + "loss": 0.9527, + "step": 19805 + }, + { + "epoch": 1.12, + "grad_norm": 8.759189748283042, + "learning_rate": 7.90986494559407e-06, + "loss": 0.9488, + "step": 19810 + }, + { + "epoch": 1.12, + "grad_norm": 10.432197284445616, + "learning_rate": 7.908531931413106e-06, + "loss": 0.9614, + "step": 19815 + }, + { + "epoch": 1.12, + "grad_norm": 6.144157754736919, + "learning_rate": 7.907198604697548e-06, + "loss": 0.9442, + "step": 19820 + }, + { + "epoch": 1.12, + "grad_norm": 11.58622891155811, + "learning_rate": 7.90586496559067e-06, + "loss": 0.9407, + "step": 19825 + }, + { + "epoch": 1.12, + "grad_norm": 13.256606032959596, + "learning_rate": 7.904531014235778e-06, + "loss": 0.9402, + "step": 19830 + }, + { + "epoch": 1.12, + "grad_norm": 18.995752133576723, + "learning_rate": 7.90319675077621e-06, + "loss": 0.9276, + "step": 19835 + }, + { + "epoch": 1.12, + "grad_norm": 16.08826606164148, + "learning_rate": 7.901862175355336e-06, + "loss": 0.9613, + "step": 19840 + }, + { + "epoch": 1.12, + "grad_norm": 6.823681467049705, + "learning_rate": 7.900527288116567e-06, + "loss": 0.9438, + "step": 19845 + }, + { + "epoch": 1.12, + "grad_norm": 12.458283910186196, + "learning_rate": 7.899192089203339e-06, + "loss": 0.9377, + "step": 19850 + }, + { + "epoch": 1.12, + "grad_norm": 20.56108461369566, + "learning_rate": 7.897856578759125e-06, + "loss": 0.9592, + "step": 19855 + }, + { + "epoch": 1.12, + "grad_norm": 11.117862529409534, + "learning_rate": 7.896520756927433e-06, + "loss": 0.9397, + "step": 19860 + }, + { + "epoch": 1.12, + "grad_norm": 13.540676078214242, + "learning_rate": 7.895184623851802e-06, + "loss": 0.9409, + "step": 19865 + }, + { + "epoch": 1.12, + "grad_norm": 9.922870762478146, + "learning_rate": 7.893848179675805e-06, + "loss": 0.945, + "step": 19870 + }, + { + "epoch": 1.12, + "grad_norm": 8.026419476200202, + "learning_rate": 7.892511424543052e-06, + "loss": 0.961, + "step": 19875 + }, + { + "epoch": 1.12, + "grad_norm": 7.098333436086616, + "learning_rate": 7.891174358597177e-06, + "loss": 0.9516, + "step": 19880 + }, + { + "epoch": 1.12, + "grad_norm": 12.174111229345392, + "learning_rate": 7.889836981981862e-06, + "loss": 0.9242, + "step": 19885 + }, + { + "epoch": 1.12, + "grad_norm": 15.583304008671309, + "learning_rate": 7.888499294840807e-06, + "loss": 0.9575, + "step": 19890 + }, + { + "epoch": 1.12, + "grad_norm": 5.976771479838358, + "learning_rate": 7.887161297317756e-06, + "loss": 0.9566, + "step": 19895 + }, + { + "epoch": 1.12, + "grad_norm": 6.552608534180628, + "learning_rate": 7.88582298955648e-06, + "loss": 0.9596, + "step": 19900 + }, + { + "epoch": 1.12, + "grad_norm": 6.614054283870222, + "learning_rate": 7.884484371700788e-06, + "loss": 0.9002, + "step": 19905 + }, + { + "epoch": 1.12, + "grad_norm": 7.567773616178767, + "learning_rate": 7.88314544389452e-06, + "loss": 0.9542, + "step": 19910 + }, + { + "epoch": 1.12, + "grad_norm": 6.335168192737689, + "learning_rate": 7.88180620628155e-06, + "loss": 0.9427, + "step": 19915 + }, + { + "epoch": 1.12, + "grad_norm": 7.476534656304151, + "learning_rate": 7.880466659005786e-06, + "loss": 0.9583, + "step": 19920 + }, + { + "epoch": 1.12, + "grad_norm": 5.664795105240772, + "learning_rate": 7.879126802211165e-06, + "loss": 0.9185, + "step": 19925 + }, + { + "epoch": 1.12, + "grad_norm": 26.065707131841684, + "learning_rate": 7.877786636041664e-06, + "loss": 0.912, + "step": 19930 + }, + { + "epoch": 1.12, + "grad_norm": 17.60635611885136, + "learning_rate": 7.87644616064129e-06, + "loss": 0.9218, + "step": 19935 + }, + { + "epoch": 1.12, + "grad_norm": 10.444135083948037, + "learning_rate": 7.875105376154078e-06, + "loss": 0.9589, + "step": 19940 + }, + { + "epoch": 1.12, + "grad_norm": 7.963626629764778, + "learning_rate": 7.873764282724104e-06, + "loss": 0.9531, + "step": 19945 + }, + { + "epoch": 1.12, + "grad_norm": 6.953867406038498, + "learning_rate": 7.872422880495476e-06, + "loss": 0.9422, + "step": 19950 + }, + { + "epoch": 1.12, + "grad_norm": 10.281832810997978, + "learning_rate": 7.871081169612335e-06, + "loss": 0.9363, + "step": 19955 + }, + { + "epoch": 1.12, + "grad_norm": 13.67327295981892, + "learning_rate": 7.869739150218849e-06, + "loss": 0.973, + "step": 19960 + }, + { + "epoch": 1.12, + "grad_norm": 7.492111430740662, + "learning_rate": 7.868396822459227e-06, + "loss": 0.9387, + "step": 19965 + }, + { + "epoch": 1.13, + "grad_norm": 7.82283212288816, + "learning_rate": 7.867054186477707e-06, + "loss": 0.9438, + "step": 19970 + }, + { + "epoch": 1.13, + "grad_norm": 6.228653455876002, + "learning_rate": 7.865711242418561e-06, + "loss": 0.9475, + "step": 19975 + }, + { + "epoch": 1.13, + "grad_norm": 11.178307091181455, + "learning_rate": 7.864367990426095e-06, + "loss": 0.9576, + "step": 19980 + }, + { + "epoch": 1.13, + "grad_norm": 5.142175396053066, + "learning_rate": 7.863024430644647e-06, + "loss": 0.9847, + "step": 19985 + }, + { + "epoch": 1.13, + "grad_norm": 6.199772689796203, + "learning_rate": 7.861680563218588e-06, + "loss": 0.9865, + "step": 19990 + }, + { + "epoch": 1.13, + "grad_norm": 6.44718736131562, + "learning_rate": 7.860336388292324e-06, + "loss": 0.9453, + "step": 19995 + }, + { + "epoch": 1.13, + "grad_norm": 11.565393743290027, + "learning_rate": 7.858991906010289e-06, + "loss": 0.9078, + "step": 20000 + }, + { + "epoch": 1.13, + "grad_norm": 7.229254186701492, + "learning_rate": 7.857647116516959e-06, + "loss": 0.9054, + "step": 20005 + }, + { + "epoch": 1.13, + "grad_norm": 9.699017336532892, + "learning_rate": 7.856302019956833e-06, + "loss": 0.9626, + "step": 20010 + }, + { + "epoch": 1.13, + "grad_norm": 7.219672531293912, + "learning_rate": 7.85495661647445e-06, + "loss": 0.9441, + "step": 20015 + }, + { + "epoch": 1.13, + "grad_norm": 9.426445358055107, + "learning_rate": 7.853610906214376e-06, + "loss": 0.9438, + "step": 20020 + }, + { + "epoch": 1.13, + "grad_norm": 9.130086153119944, + "learning_rate": 7.852264889321219e-06, + "loss": 0.9131, + "step": 20025 + }, + { + "epoch": 1.13, + "grad_norm": 4.950594775885904, + "learning_rate": 7.850918565939611e-06, + "loss": 0.9071, + "step": 20030 + }, + { + "epoch": 1.13, + "grad_norm": 13.683234471569929, + "learning_rate": 7.849571936214221e-06, + "loss": 0.9831, + "step": 20035 + }, + { + "epoch": 1.13, + "grad_norm": 5.918927231712517, + "learning_rate": 7.84822500028975e-06, + "loss": 0.9563, + "step": 20040 + }, + { + "epoch": 1.13, + "grad_norm": 6.723994762414341, + "learning_rate": 7.846877758310933e-06, + "loss": 0.933, + "step": 20045 + }, + { + "epoch": 1.13, + "grad_norm": 6.704931324078806, + "learning_rate": 7.845530210422537e-06, + "loss": 0.953, + "step": 20050 + }, + { + "epoch": 1.13, + "grad_norm": 28.593829438922224, + "learning_rate": 7.84418235676936e-06, + "loss": 0.9777, + "step": 20055 + }, + { + "epoch": 1.13, + "grad_norm": 7.884887942381204, + "learning_rate": 7.842834197496238e-06, + "loss": 0.9455, + "step": 20060 + }, + { + "epoch": 1.13, + "grad_norm": 5.992617975755011, + "learning_rate": 7.841485732748035e-06, + "loss": 0.9018, + "step": 20065 + }, + { + "epoch": 1.13, + "grad_norm": 9.638986263329572, + "learning_rate": 7.84013696266965e-06, + "loss": 0.9249, + "step": 20070 + }, + { + "epoch": 1.13, + "grad_norm": 8.268277699407994, + "learning_rate": 7.838787887406012e-06, + "loss": 0.9544, + "step": 20075 + }, + { + "epoch": 1.13, + "grad_norm": 6.252205588716973, + "learning_rate": 7.83743850710209e-06, + "loss": 0.9261, + "step": 20080 + }, + { + "epoch": 1.13, + "grad_norm": 28.813291291039274, + "learning_rate": 7.836088821902874e-06, + "loss": 1.047, + "step": 20085 + }, + { + "epoch": 1.13, + "grad_norm": 13.072403660756706, + "learning_rate": 7.8347388319534e-06, + "loss": 0.954, + "step": 20090 + }, + { + "epoch": 1.13, + "grad_norm": 5.777456113095121, + "learning_rate": 7.833388537398728e-06, + "loss": 0.9591, + "step": 20095 + }, + { + "epoch": 1.13, + "grad_norm": 7.021113854385633, + "learning_rate": 7.832037938383951e-06, + "loss": 0.9556, + "step": 20100 + }, + { + "epoch": 1.13, + "grad_norm": 6.788626586060873, + "learning_rate": 7.830687035054202e-06, + "loss": 0.9563, + "step": 20105 + }, + { + "epoch": 1.13, + "grad_norm": 8.205718897745614, + "learning_rate": 7.829335827554634e-06, + "loss": 0.9358, + "step": 20110 + }, + { + "epoch": 1.13, + "grad_norm": 4.827859640256731, + "learning_rate": 7.827984316030447e-06, + "loss": 0.9815, + "step": 20115 + }, + { + "epoch": 1.13, + "grad_norm": 18.496962409757696, + "learning_rate": 7.826632500626863e-06, + "loss": 0.9493, + "step": 20120 + }, + { + "epoch": 1.13, + "grad_norm": 9.612688104431891, + "learning_rate": 7.825280381489145e-06, + "loss": 0.9321, + "step": 20125 + }, + { + "epoch": 1.13, + "grad_norm": 9.301062602979739, + "learning_rate": 7.823927958762578e-06, + "loss": 1.0011, + "step": 20130 + }, + { + "epoch": 1.13, + "grad_norm": 9.476625104434346, + "learning_rate": 7.822575232592491e-06, + "loss": 0.8694, + "step": 20135 + }, + { + "epoch": 1.13, + "grad_norm": 7.1728480126820635, + "learning_rate": 7.821222203124238e-06, + "loss": 0.9675, + "step": 20140 + }, + { + "epoch": 1.14, + "grad_norm": 6.186787079320183, + "learning_rate": 7.819868870503207e-06, + "loss": 0.9378, + "step": 20145 + }, + { + "epoch": 1.14, + "grad_norm": 9.02753075537611, + "learning_rate": 7.81851523487482e-06, + "loss": 0.9412, + "step": 20150 + }, + { + "epoch": 1.14, + "grad_norm": 4.966718193736909, + "learning_rate": 7.817161296384535e-06, + "loss": 0.9182, + "step": 20155 + }, + { + "epoch": 1.14, + "grad_norm": 9.133481628479382, + "learning_rate": 7.815807055177833e-06, + "loss": 0.9422, + "step": 20160 + }, + { + "epoch": 1.14, + "grad_norm": 23.655816673631616, + "learning_rate": 7.814452511400236e-06, + "loss": 0.8711, + "step": 20165 + }, + { + "epoch": 1.14, + "grad_norm": 26.946670626715143, + "learning_rate": 7.813097665197298e-06, + "loss": 0.9733, + "step": 20170 + }, + { + "epoch": 1.14, + "grad_norm": 7.224702603212419, + "learning_rate": 7.811742516714597e-06, + "loss": 0.9133, + "step": 20175 + }, + { + "epoch": 1.14, + "grad_norm": 27.390459826633204, + "learning_rate": 7.810387066097757e-06, + "loss": 0.9404, + "step": 20180 + }, + { + "epoch": 1.14, + "grad_norm": 6.010382622704545, + "learning_rate": 7.809031313492424e-06, + "loss": 0.8973, + "step": 20185 + }, + { + "epoch": 1.14, + "grad_norm": 30.271574718468564, + "learning_rate": 7.807675259044276e-06, + "loss": 0.9459, + "step": 20190 + }, + { + "epoch": 1.14, + "grad_norm": 16.320562856419855, + "learning_rate": 7.80631890289903e-06, + "loss": 0.9574, + "step": 20195 + }, + { + "epoch": 1.14, + "grad_norm": 47.9206358179213, + "learning_rate": 7.804962245202435e-06, + "loss": 0.9591, + "step": 20200 + }, + { + "epoch": 1.14, + "grad_norm": 20.098770244061356, + "learning_rate": 7.803605286100267e-06, + "loss": 0.9812, + "step": 20205 + }, + { + "epoch": 1.14, + "grad_norm": 25.01687722783874, + "learning_rate": 7.802248025738337e-06, + "loss": 0.949, + "step": 20210 + }, + { + "epoch": 1.14, + "grad_norm": 22.772754727830986, + "learning_rate": 7.80089046426249e-06, + "loss": 0.9643, + "step": 20215 + }, + { + "epoch": 1.14, + "grad_norm": 20.757226003243538, + "learning_rate": 7.7995326018186e-06, + "loss": 0.9686, + "step": 20220 + }, + { + "epoch": 1.14, + "grad_norm": 23.62684872274855, + "learning_rate": 7.798174438552579e-06, + "loss": 0.9553, + "step": 20225 + }, + { + "epoch": 1.14, + "grad_norm": 13.882771029211606, + "learning_rate": 7.796815974610364e-06, + "loss": 0.9107, + "step": 20230 + }, + { + "epoch": 1.14, + "grad_norm": 20.457454530687464, + "learning_rate": 7.79545721013793e-06, + "loss": 0.997, + "step": 20235 + }, + { + "epoch": 1.14, + "grad_norm": 10.00567752149711, + "learning_rate": 7.794098145281278e-06, + "loss": 0.9093, + "step": 20240 + }, + { + "epoch": 1.14, + "grad_norm": 16.015518065801285, + "learning_rate": 7.792738780186454e-06, + "loss": 0.9648, + "step": 20245 + }, + { + "epoch": 1.14, + "grad_norm": 20.447413868315554, + "learning_rate": 7.791379114999519e-06, + "loss": 0.8944, + "step": 20250 + }, + { + "epoch": 1.14, + "grad_norm": 9.709060533492773, + "learning_rate": 7.790019149866581e-06, + "loss": 0.9712, + "step": 20255 + }, + { + "epoch": 1.14, + "grad_norm": 21.06159144898778, + "learning_rate": 7.78865888493377e-06, + "loss": 0.8869, + "step": 20260 + }, + { + "epoch": 1.14, + "grad_norm": 8.418773761436006, + "learning_rate": 7.787298320347256e-06, + "loss": 0.9299, + "step": 20265 + }, + { + "epoch": 1.14, + "grad_norm": 22.04331772266648, + "learning_rate": 7.785937456253237e-06, + "loss": 0.9819, + "step": 20270 + }, + { + "epoch": 1.14, + "grad_norm": 20.04280913224276, + "learning_rate": 7.784576292797942e-06, + "loss": 0.9749, + "step": 20275 + }, + { + "epoch": 1.14, + "grad_norm": 19.096545780104982, + "learning_rate": 7.783214830127635e-06, + "loss": 0.9623, + "step": 20280 + }, + { + "epoch": 1.14, + "grad_norm": 7.837282050928299, + "learning_rate": 7.781853068388612e-06, + "loss": 0.948, + "step": 20285 + }, + { + "epoch": 1.14, + "grad_norm": 6.823047837922731, + "learning_rate": 7.780491007727199e-06, + "loss": 0.9361, + "step": 20290 + }, + { + "epoch": 1.14, + "grad_norm": 7.69324400255971, + "learning_rate": 7.779128648289756e-06, + "loss": 0.9205, + "step": 20295 + }, + { + "epoch": 1.14, + "grad_norm": 7.876811524927135, + "learning_rate": 7.777765990222674e-06, + "loss": 0.9432, + "step": 20300 + }, + { + "epoch": 1.14, + "grad_norm": 8.62810775402767, + "learning_rate": 7.776403033672377e-06, + "loss": 0.972, + "step": 20305 + }, + { + "epoch": 1.14, + "grad_norm": 10.855485166436244, + "learning_rate": 7.775039778785323e-06, + "loss": 0.9392, + "step": 20310 + }, + { + "epoch": 1.14, + "grad_norm": 17.91010169971048, + "learning_rate": 7.773676225707996e-06, + "loss": 0.9639, + "step": 20315 + }, + { + "epoch": 1.14, + "grad_norm": 8.493402247240592, + "learning_rate": 7.772312374586917e-06, + "loss": 0.9441, + "step": 20320 + }, + { + "epoch": 1.15, + "grad_norm": 9.117611514101663, + "learning_rate": 7.770948225568638e-06, + "loss": 0.9031, + "step": 20325 + }, + { + "epoch": 1.15, + "grad_norm": 11.4333155763605, + "learning_rate": 7.769583778799743e-06, + "loss": 0.9402, + "step": 20330 + }, + { + "epoch": 1.15, + "grad_norm": 6.814839172345621, + "learning_rate": 7.76821903442685e-06, + "loss": 0.9168, + "step": 20335 + }, + { + "epoch": 1.15, + "grad_norm": 10.994237784498635, + "learning_rate": 7.766853992596603e-06, + "loss": 0.9239, + "step": 20340 + }, + { + "epoch": 1.15, + "grad_norm": 6.5864266968455745, + "learning_rate": 7.765488653455684e-06, + "loss": 0.8941, + "step": 20345 + }, + { + "epoch": 1.15, + "grad_norm": 8.030732770562809, + "learning_rate": 7.764123017150803e-06, + "loss": 0.9493, + "step": 20350 + }, + { + "epoch": 1.15, + "grad_norm": 12.812501584881364, + "learning_rate": 7.762757083828706e-06, + "loss": 0.8914, + "step": 20355 + }, + { + "epoch": 1.15, + "grad_norm": 7.5567430448249615, + "learning_rate": 7.761390853636166e-06, + "loss": 0.9132, + "step": 20360 + }, + { + "epoch": 1.15, + "grad_norm": 10.99304509738203, + "learning_rate": 7.760024326719992e-06, + "loss": 0.9529, + "step": 20365 + }, + { + "epoch": 1.15, + "grad_norm": 6.917609481835917, + "learning_rate": 7.758657503227026e-06, + "loss": 0.9552, + "step": 20370 + }, + { + "epoch": 1.15, + "grad_norm": 6.693355948175669, + "learning_rate": 7.757290383304134e-06, + "loss": 0.9275, + "step": 20375 + }, + { + "epoch": 1.15, + "grad_norm": 8.295712353407465, + "learning_rate": 7.755922967098222e-06, + "loss": 1.0089, + "step": 20380 + }, + { + "epoch": 1.15, + "grad_norm": 14.108495593307115, + "learning_rate": 7.754555254756223e-06, + "loss": 0.9478, + "step": 20385 + }, + { + "epoch": 1.15, + "grad_norm": 5.504904737179936, + "learning_rate": 7.753187246425107e-06, + "loss": 0.9633, + "step": 20390 + }, + { + "epoch": 1.15, + "grad_norm": 6.300866739301536, + "learning_rate": 7.75181894225187e-06, + "loss": 0.9455, + "step": 20395 + }, + { + "epoch": 1.15, + "grad_norm": 7.201812749460246, + "learning_rate": 7.750450342383543e-06, + "loss": 0.9413, + "step": 20400 + }, + { + "epoch": 1.15, + "grad_norm": 14.026197017140966, + "learning_rate": 7.749081446967185e-06, + "loss": 0.9313, + "step": 20405 + }, + { + "epoch": 1.15, + "grad_norm": 12.669060099154594, + "learning_rate": 7.747712256149899e-06, + "loss": 0.9103, + "step": 20410 + }, + { + "epoch": 1.15, + "grad_norm": 5.294498548513014, + "learning_rate": 7.7463427700788e-06, + "loss": 0.9381, + "step": 20415 + }, + { + "epoch": 1.15, + "grad_norm": 13.866788856883709, + "learning_rate": 7.744972988901052e-06, + "loss": 0.9512, + "step": 20420 + }, + { + "epoch": 1.15, + "grad_norm": 5.957223837013078, + "learning_rate": 7.74360291276384e-06, + "loss": 0.9544, + "step": 20425 + }, + { + "epoch": 1.15, + "grad_norm": 9.670840019573902, + "learning_rate": 7.742232541814387e-06, + "loss": 0.9358, + "step": 20430 + }, + { + "epoch": 1.15, + "grad_norm": 19.67131364133245, + "learning_rate": 7.740861876199945e-06, + "loss": 0.9053, + "step": 20435 + }, + { + "epoch": 1.15, + "grad_norm": 12.972472203850405, + "learning_rate": 7.7394909160678e-06, + "loss": 0.9314, + "step": 20440 + }, + { + "epoch": 1.15, + "grad_norm": 19.11134998831311, + "learning_rate": 7.738119661565265e-06, + "loss": 0.8935, + "step": 20445 + }, + { + "epoch": 1.15, + "grad_norm": 19.452260015075453, + "learning_rate": 7.736748112839687e-06, + "loss": 0.9954, + "step": 20450 + }, + { + "epoch": 1.15, + "grad_norm": 5.667512558201727, + "learning_rate": 7.735376270038445e-06, + "loss": 0.9535, + "step": 20455 + }, + { + "epoch": 1.15, + "grad_norm": 6.460760878877248, + "learning_rate": 7.734004133308954e-06, + "loss": 0.9174, + "step": 20460 + }, + { + "epoch": 1.15, + "grad_norm": 7.292102266339746, + "learning_rate": 7.73263170279865e-06, + "loss": 0.9333, + "step": 20465 + }, + { + "epoch": 1.15, + "grad_norm": 6.486230922070714, + "learning_rate": 7.731258978655011e-06, + "loss": 0.9345, + "step": 20470 + }, + { + "epoch": 1.15, + "grad_norm": 5.199946706587383, + "learning_rate": 7.729885961025541e-06, + "loss": 0.9325, + "step": 20475 + }, + { + "epoch": 1.15, + "grad_norm": 6.473331183840534, + "learning_rate": 7.728512650057775e-06, + "loss": 0.932, + "step": 20480 + }, + { + "epoch": 1.15, + "grad_norm": 5.312204523691917, + "learning_rate": 7.727139045899283e-06, + "loss": 0.9185, + "step": 20485 + }, + { + "epoch": 1.15, + "grad_norm": 10.14242500288437, + "learning_rate": 7.725765148697666e-06, + "loss": 0.9197, + "step": 20490 + }, + { + "epoch": 1.15, + "grad_norm": 10.2522951212072, + "learning_rate": 7.724390958600554e-06, + "loss": 0.9463, + "step": 20495 + }, + { + "epoch": 1.16, + "grad_norm": 6.519152645189666, + "learning_rate": 7.723016475755609e-06, + "loss": 0.9647, + "step": 20500 + }, + { + "epoch": 1.16, + "grad_norm": 8.05566160577291, + "learning_rate": 7.721641700310526e-06, + "loss": 0.9518, + "step": 20505 + }, + { + "epoch": 1.16, + "grad_norm": 13.178885415473305, + "learning_rate": 7.720266632413032e-06, + "loss": 0.9649, + "step": 20510 + }, + { + "epoch": 1.16, + "grad_norm": 7.391420205741028, + "learning_rate": 7.718891272210882e-06, + "loss": 0.9804, + "step": 20515 + }, + { + "epoch": 1.16, + "grad_norm": 15.572287710982629, + "learning_rate": 7.717515619851865e-06, + "loss": 0.9662, + "step": 20520 + }, + { + "epoch": 1.16, + "grad_norm": 6.462636354099574, + "learning_rate": 7.716139675483802e-06, + "loss": 0.9602, + "step": 20525 + }, + { + "epoch": 1.16, + "grad_norm": 5.124819557747461, + "learning_rate": 7.714763439254544e-06, + "loss": 0.9533, + "step": 20530 + }, + { + "epoch": 1.16, + "grad_norm": 5.1256298027688985, + "learning_rate": 7.713386911311973e-06, + "loss": 0.9414, + "step": 20535 + }, + { + "epoch": 1.16, + "grad_norm": 5.832376192386037, + "learning_rate": 7.712010091804003e-06, + "loss": 0.9107, + "step": 20540 + }, + { + "epoch": 1.16, + "grad_norm": 13.252263820472098, + "learning_rate": 7.710632980878579e-06, + "loss": 0.9456, + "step": 20545 + }, + { + "epoch": 1.16, + "grad_norm": 7.871561810712612, + "learning_rate": 7.70925557868368e-06, + "loss": 0.8975, + "step": 20550 + }, + { + "epoch": 1.16, + "grad_norm": 14.994001323908641, + "learning_rate": 7.707877885367312e-06, + "loss": 0.9204, + "step": 20555 + }, + { + "epoch": 1.16, + "grad_norm": 23.13124068629601, + "learning_rate": 7.706499901077514e-06, + "loss": 0.953, + "step": 20560 + }, + { + "epoch": 1.16, + "grad_norm": 21.7476257801919, + "learning_rate": 7.705121625962355e-06, + "loss": 0.9819, + "step": 20565 + }, + { + "epoch": 1.16, + "grad_norm": 23.1000944255777, + "learning_rate": 7.70374306016994e-06, + "loss": 0.929, + "step": 20570 + }, + { + "epoch": 1.16, + "grad_norm": 14.350684996938273, + "learning_rate": 7.702364203848402e-06, + "loss": 0.9646, + "step": 20575 + }, + { + "epoch": 1.16, + "grad_norm": 10.88530740213351, + "learning_rate": 7.700985057145901e-06, + "loss": 0.9111, + "step": 20580 + }, + { + "epoch": 1.16, + "grad_norm": 14.142530855593234, + "learning_rate": 7.69960562021064e-06, + "loss": 0.9244, + "step": 20585 + }, + { + "epoch": 1.16, + "grad_norm": 5.998818714740994, + "learning_rate": 7.698225893190836e-06, + "loss": 0.9895, + "step": 20590 + }, + { + "epoch": 1.16, + "grad_norm": 36.05026297211486, + "learning_rate": 7.696845876234754e-06, + "loss": 0.9407, + "step": 20595 + }, + { + "epoch": 1.16, + "grad_norm": 9.379932775451739, + "learning_rate": 7.69546556949068e-06, + "loss": 0.9594, + "step": 20600 + }, + { + "epoch": 1.16, + "grad_norm": 10.11858098548503, + "learning_rate": 7.694084973106935e-06, + "loss": 0.9896, + "step": 20605 + }, + { + "epoch": 1.16, + "grad_norm": 8.401269268204945, + "learning_rate": 7.69270408723187e-06, + "loss": 0.9432, + "step": 20610 + }, + { + "epoch": 1.16, + "grad_norm": 7.1797372417099155, + "learning_rate": 7.691322912013865e-06, + "loss": 0.9629, + "step": 20615 + }, + { + "epoch": 1.16, + "grad_norm": 10.705054453128703, + "learning_rate": 7.689941447601337e-06, + "loss": 0.9644, + "step": 20620 + }, + { + "epoch": 1.16, + "grad_norm": 6.516573079744466, + "learning_rate": 7.688559694142727e-06, + "loss": 0.8937, + "step": 20625 + }, + { + "epoch": 1.16, + "grad_norm": 20.21457343072996, + "learning_rate": 7.687177651786515e-06, + "loss": 0.9655, + "step": 20630 + }, + { + "epoch": 1.16, + "grad_norm": 6.996892522397096, + "learning_rate": 7.685795320681204e-06, + "loss": 0.9521, + "step": 20635 + }, + { + "epoch": 1.16, + "grad_norm": 6.237187821918866, + "learning_rate": 7.684412700975333e-06, + "loss": 0.9249, + "step": 20640 + }, + { + "epoch": 1.16, + "grad_norm": 9.159260906106853, + "learning_rate": 7.683029792817469e-06, + "loss": 0.9656, + "step": 20645 + }, + { + "epoch": 1.16, + "grad_norm": 11.635056787249885, + "learning_rate": 7.681646596356213e-06, + "loss": 0.966, + "step": 20650 + }, + { + "epoch": 1.16, + "grad_norm": 17.31157956588242, + "learning_rate": 7.680263111740196e-06, + "loss": 0.9198, + "step": 20655 + }, + { + "epoch": 1.16, + "grad_norm": 7.340323196905999, + "learning_rate": 7.678879339118076e-06, + "loss": 0.978, + "step": 20660 + }, + { + "epoch": 1.16, + "grad_norm": 9.83179833470517, + "learning_rate": 7.67749527863855e-06, + "loss": 0.9953, + "step": 20665 + }, + { + "epoch": 1.16, + "grad_norm": 7.0343859653169085, + "learning_rate": 7.676110930450341e-06, + "loss": 0.9748, + "step": 20670 + }, + { + "epoch": 1.16, + "grad_norm": 18.153752771498908, + "learning_rate": 7.6747262947022e-06, + "loss": 0.9653, + "step": 20675 + }, + { + "epoch": 1.17, + "grad_norm": 12.02771660668748, + "learning_rate": 7.673341371542915e-06, + "loss": 0.9989, + "step": 20680 + }, + { + "epoch": 1.17, + "grad_norm": 48.11076183925116, + "learning_rate": 7.671956161121301e-06, + "loss": 0.9655, + "step": 20685 + }, + { + "epoch": 1.17, + "grad_norm": 54.67378814486978, + "learning_rate": 7.670570663586206e-06, + "loss": 0.9601, + "step": 20690 + }, + { + "epoch": 1.17, + "grad_norm": 6.005885281122075, + "learning_rate": 7.669184879086506e-06, + "loss": 0.9371, + "step": 20695 + }, + { + "epoch": 1.17, + "grad_norm": 50.83827742882587, + "learning_rate": 7.667798807771113e-06, + "loss": 0.9901, + "step": 20700 + }, + { + "epoch": 1.17, + "grad_norm": 20.550982513404612, + "learning_rate": 7.666412449788962e-06, + "loss": 0.9272, + "step": 20705 + }, + { + "epoch": 1.17, + "grad_norm": 15.923425970068918, + "learning_rate": 7.665025805289024e-06, + "loss": 0.9607, + "step": 20710 + }, + { + "epoch": 1.17, + "grad_norm": 19.118786487252336, + "learning_rate": 7.663638874420304e-06, + "loss": 0.9325, + "step": 20715 + }, + { + "epoch": 1.17, + "grad_norm": 18.430024969069798, + "learning_rate": 7.66225165733183e-06, + "loss": 0.951, + "step": 20720 + }, + { + "epoch": 1.17, + "grad_norm": 12.617103972216178, + "learning_rate": 7.66086415417267e-06, + "loss": 0.9237, + "step": 20725 + }, + { + "epoch": 1.17, + "grad_norm": 7.905601849945198, + "learning_rate": 7.65947636509191e-06, + "loss": 0.9565, + "step": 20730 + }, + { + "epoch": 1.17, + "grad_norm": 11.404541839571632, + "learning_rate": 7.658088290238676e-06, + "loss": 0.9162, + "step": 20735 + }, + { + "epoch": 1.17, + "grad_norm": 10.419445163593089, + "learning_rate": 7.656699929762126e-06, + "loss": 0.9464, + "step": 20740 + }, + { + "epoch": 1.17, + "grad_norm": 5.430992931986451, + "learning_rate": 7.655311283811445e-06, + "loss": 0.8827, + "step": 20745 + }, + { + "epoch": 1.17, + "grad_norm": 8.065737646040784, + "learning_rate": 7.653922352535846e-06, + "loss": 0.9677, + "step": 20750 + }, + { + "epoch": 1.17, + "grad_norm": 18.77086705350957, + "learning_rate": 7.652533136084576e-06, + "loss": 1.0184, + "step": 20755 + }, + { + "epoch": 1.17, + "grad_norm": 24.260042994602436, + "learning_rate": 7.651143634606916e-06, + "loss": 0.9341, + "step": 20760 + }, + { + "epoch": 1.17, + "grad_norm": 10.426390430477827, + "learning_rate": 7.649753848252172e-06, + "loss": 0.928, + "step": 20765 + }, + { + "epoch": 1.17, + "grad_norm": 27.097812573409072, + "learning_rate": 7.648363777169683e-06, + "loss": 0.9227, + "step": 20770 + }, + { + "epoch": 1.17, + "grad_norm": 15.242724605152857, + "learning_rate": 7.646973421508817e-06, + "loss": 0.9526, + "step": 20775 + }, + { + "epoch": 1.17, + "grad_norm": 8.374244987877411, + "learning_rate": 7.645582781418976e-06, + "loss": 0.986, + "step": 20780 + }, + { + "epoch": 1.17, + "grad_norm": 5.228570739391233, + "learning_rate": 7.644191857049589e-06, + "loss": 0.9002, + "step": 20785 + }, + { + "epoch": 1.17, + "grad_norm": 11.83638250702849, + "learning_rate": 7.642800648550118e-06, + "loss": 0.9775, + "step": 20790 + }, + { + "epoch": 1.17, + "grad_norm": 8.336442751327782, + "learning_rate": 7.641409156070051e-06, + "loss": 0.9485, + "step": 20795 + }, + { + "epoch": 1.17, + "grad_norm": 19.797444900674478, + "learning_rate": 7.640017379758915e-06, + "loss": 0.9464, + "step": 20800 + }, + { + "epoch": 1.17, + "grad_norm": 7.420222215295391, + "learning_rate": 7.638625319766259e-06, + "loss": 0.9486, + "step": 20805 + }, + { + "epoch": 1.17, + "grad_norm": 10.96637359152699, + "learning_rate": 7.637232976241668e-06, + "loss": 0.9428, + "step": 20810 + }, + { + "epoch": 1.17, + "grad_norm": 5.5993339538563145, + "learning_rate": 7.635840349334756e-06, + "loss": 0.9472, + "step": 20815 + }, + { + "epoch": 1.17, + "grad_norm": 10.506197599095325, + "learning_rate": 7.634447439195163e-06, + "loss": 0.9636, + "step": 20820 + }, + { + "epoch": 1.17, + "grad_norm": 12.271958654692396, + "learning_rate": 7.633054245972568e-06, + "loss": 0.9427, + "step": 20825 + }, + { + "epoch": 1.17, + "grad_norm": 7.9127308558959, + "learning_rate": 7.631660769816674e-06, + "loss": 0.9478, + "step": 20830 + }, + { + "epoch": 1.17, + "grad_norm": 5.61839773462849, + "learning_rate": 7.630267010877214e-06, + "loss": 0.9005, + "step": 20835 + }, + { + "epoch": 1.17, + "grad_norm": 6.569140640620592, + "learning_rate": 7.628872969303957e-06, + "loss": 1.0022, + "step": 20840 + }, + { + "epoch": 1.17, + "grad_norm": 8.386916154329136, + "learning_rate": 7.6274786452466956e-06, + "loss": 0.971, + "step": 20845 + }, + { + "epoch": 1.17, + "grad_norm": 9.026513499182911, + "learning_rate": 7.626084038855259e-06, + "loss": 0.9737, + "step": 20850 + }, + { + "epoch": 1.18, + "grad_norm": 9.045049475414952, + "learning_rate": 7.624689150279503e-06, + "loss": 0.9591, + "step": 20855 + }, + { + "epoch": 1.18, + "grad_norm": 10.256779559712614, + "learning_rate": 7.6232939796693125e-06, + "loss": 0.8882, + "step": 20860 + }, + { + "epoch": 1.18, + "grad_norm": 8.319763241650598, + "learning_rate": 7.621898527174608e-06, + "loss": 0.9075, + "step": 20865 + }, + { + "epoch": 1.18, + "grad_norm": 4.832794111019263, + "learning_rate": 7.620502792945337e-06, + "loss": 0.9585, + "step": 20870 + }, + { + "epoch": 1.18, + "grad_norm": 24.556392955013226, + "learning_rate": 7.619106777131473e-06, + "loss": 0.9639, + "step": 20875 + }, + { + "epoch": 1.18, + "grad_norm": 10.460080309283446, + "learning_rate": 7.617710479883027e-06, + "loss": 0.9385, + "step": 20880 + }, + { + "epoch": 1.18, + "grad_norm": 14.226225363274759, + "learning_rate": 7.616313901350039e-06, + "loss": 0.9857, + "step": 20885 + }, + { + "epoch": 1.18, + "grad_norm": 21.772307778678126, + "learning_rate": 7.614917041682574e-06, + "loss": 0.9984, + "step": 20890 + }, + { + "epoch": 1.18, + "grad_norm": 7.614314404859782, + "learning_rate": 7.613519901030732e-06, + "loss": 0.9581, + "step": 20895 + }, + { + "epoch": 1.18, + "grad_norm": 7.963333458178771, + "learning_rate": 7.612122479544643e-06, + "loss": 0.9694, + "step": 20900 + }, + { + "epoch": 1.18, + "grad_norm": 8.695766001888146, + "learning_rate": 7.610724777374465e-06, + "loss": 0.8983, + "step": 20905 + }, + { + "epoch": 1.18, + "grad_norm": 37.120298451575536, + "learning_rate": 7.609326794670388e-06, + "loss": 0.9425, + "step": 20910 + }, + { + "epoch": 1.18, + "grad_norm": 21.478949909311023, + "learning_rate": 7.6079285315826315e-06, + "loss": 0.9318, + "step": 20915 + }, + { + "epoch": 1.18, + "grad_norm": 14.427274097955634, + "learning_rate": 7.606529988261444e-06, + "loss": 0.9863, + "step": 20920 + }, + { + "epoch": 1.18, + "grad_norm": 7.10229688250555, + "learning_rate": 7.605131164857107e-06, + "loss": 0.9543, + "step": 20925 + }, + { + "epoch": 1.18, + "grad_norm": 6.778828961352319, + "learning_rate": 7.603732061519928e-06, + "loss": 0.941, + "step": 20930 + }, + { + "epoch": 1.18, + "grad_norm": 5.1941642933224035, + "learning_rate": 7.602332678400248e-06, + "loss": 0.9274, + "step": 20935 + }, + { + "epoch": 1.18, + "grad_norm": 11.879156158554867, + "learning_rate": 7.6009330156484354e-06, + "loss": 0.94, + "step": 20940 + }, + { + "epoch": 1.18, + "grad_norm": 9.057378736263681, + "learning_rate": 7.5995330734148935e-06, + "loss": 0.9357, + "step": 20945 + }, + { + "epoch": 1.18, + "grad_norm": 7.615523835821663, + "learning_rate": 7.598132851850048e-06, + "loss": 0.9163, + "step": 20950 + }, + { + "epoch": 1.18, + "grad_norm": 10.296204281967132, + "learning_rate": 7.5967323511043635e-06, + "loss": 0.9588, + "step": 20955 + }, + { + "epoch": 1.18, + "grad_norm": 11.269780691696742, + "learning_rate": 7.595331571328326e-06, + "loss": 0.9373, + "step": 20960 + }, + { + "epoch": 1.18, + "grad_norm": 19.5791004987557, + "learning_rate": 7.593930512672459e-06, + "loss": 0.9454, + "step": 20965 + }, + { + "epoch": 1.18, + "grad_norm": 5.579961147436789, + "learning_rate": 7.592529175287309e-06, + "loss": 0.9707, + "step": 20970 + }, + { + "epoch": 1.18, + "grad_norm": 5.361535473011104, + "learning_rate": 7.591127559323457e-06, + "loss": 0.9777, + "step": 20975 + }, + { + "epoch": 1.18, + "grad_norm": 5.935288657655972, + "learning_rate": 7.589725664931515e-06, + "loss": 0.9889, + "step": 20980 + }, + { + "epoch": 1.18, + "grad_norm": 18.982290661280405, + "learning_rate": 7.588323492262119e-06, + "loss": 0.9921, + "step": 20985 + }, + { + "epoch": 1.18, + "grad_norm": 6.236125844110351, + "learning_rate": 7.586921041465942e-06, + "loss": 0.9336, + "step": 20990 + }, + { + "epoch": 1.18, + "grad_norm": 8.854237049488104, + "learning_rate": 7.58551831269368e-06, + "loss": 0.9502, + "step": 20995 + }, + { + "epoch": 1.18, + "grad_norm": 6.0312585768173905, + "learning_rate": 7.584115306096067e-06, + "loss": 0.9287, + "step": 21000 + }, + { + "epoch": 1.18, + "grad_norm": 5.346789162775415, + "learning_rate": 7.5827120218238595e-06, + "loss": 0.8808, + "step": 21005 + }, + { + "epoch": 1.18, + "grad_norm": 13.504551615242983, + "learning_rate": 7.581308460027847e-06, + "loss": 0.9754, + "step": 21010 + }, + { + "epoch": 1.18, + "grad_norm": 8.901105302557985, + "learning_rate": 7.579904620858847e-06, + "loss": 0.966, + "step": 21015 + }, + { + "epoch": 1.18, + "grad_norm": 11.525381604737314, + "learning_rate": 7.578500504467709e-06, + "loss": 0.9657, + "step": 21020 + }, + { + "epoch": 1.18, + "grad_norm": 9.326836924303326, + "learning_rate": 7.577096111005314e-06, + "loss": 0.9968, + "step": 21025 + }, + { + "epoch": 1.18, + "grad_norm": 10.398970644385333, + "learning_rate": 7.575691440622568e-06, + "loss": 0.9566, + "step": 21030 + }, + { + "epoch": 1.19, + "grad_norm": 11.001077947538999, + "learning_rate": 7.574286493470408e-06, + "loss": 0.9313, + "step": 21035 + }, + { + "epoch": 1.19, + "grad_norm": 5.791664139210824, + "learning_rate": 7.572881269699805e-06, + "loss": 0.888, + "step": 21040 + }, + { + "epoch": 1.19, + "grad_norm": 8.22368264307788, + "learning_rate": 7.571475769461752e-06, + "loss": 0.9074, + "step": 21045 + }, + { + "epoch": 1.19, + "grad_norm": 5.9213401332893145, + "learning_rate": 7.570069992907281e-06, + "loss": 0.8855, + "step": 21050 + }, + { + "epoch": 1.19, + "grad_norm": 7.82363035520253, + "learning_rate": 7.568663940187446e-06, + "loss": 0.9138, + "step": 21055 + }, + { + "epoch": 1.19, + "grad_norm": 6.680108664597871, + "learning_rate": 7.567257611453334e-06, + "loss": 0.9481, + "step": 21060 + }, + { + "epoch": 1.19, + "grad_norm": 6.627214895767383, + "learning_rate": 7.565851006856062e-06, + "loss": 0.9298, + "step": 21065 + }, + { + "epoch": 1.19, + "grad_norm": 6.12112975983704, + "learning_rate": 7.564444126546775e-06, + "loss": 0.9185, + "step": 21070 + }, + { + "epoch": 1.19, + "grad_norm": 9.583574689912782, + "learning_rate": 7.5630369706766475e-06, + "loss": 0.9142, + "step": 21075 + }, + { + "epoch": 1.19, + "grad_norm": 10.27272563823132, + "learning_rate": 7.561629539396888e-06, + "loss": 0.9386, + "step": 21080 + }, + { + "epoch": 1.19, + "grad_norm": 6.290102595138414, + "learning_rate": 7.5602218328587276e-06, + "loss": 0.9312, + "step": 21085 + }, + { + "epoch": 1.19, + "grad_norm": 16.313924833437394, + "learning_rate": 7.5588138512134336e-06, + "loss": 0.931, + "step": 21090 + }, + { + "epoch": 1.19, + "grad_norm": 13.190736544064523, + "learning_rate": 7.5574055946122985e-06, + "loss": 0.9084, + "step": 21095 + }, + { + "epoch": 1.19, + "grad_norm": 5.6721350480756465, + "learning_rate": 7.555997063206644e-06, + "loss": 0.9605, + "step": 21100 + }, + { + "epoch": 1.19, + "grad_norm": 11.521803459395151, + "learning_rate": 7.554588257147826e-06, + "loss": 0.9516, + "step": 21105 + }, + { + "epoch": 1.19, + "grad_norm": 20.17096644772755, + "learning_rate": 7.553179176587224e-06, + "loss": 0.9885, + "step": 21110 + }, + { + "epoch": 1.19, + "grad_norm": 6.025598792959729, + "learning_rate": 7.5517698216762524e-06, + "loss": 0.9559, + "step": 21115 + }, + { + "epoch": 1.19, + "grad_norm": 10.96985925137222, + "learning_rate": 7.550360192566352e-06, + "loss": 0.9067, + "step": 21120 + }, + { + "epoch": 1.19, + "grad_norm": 5.77244294940966, + "learning_rate": 7.548950289408992e-06, + "loss": 0.9356, + "step": 21125 + }, + { + "epoch": 1.19, + "grad_norm": 6.119061544556094, + "learning_rate": 7.547540112355676e-06, + "loss": 0.891, + "step": 21130 + }, + { + "epoch": 1.19, + "grad_norm": 5.777681245297325, + "learning_rate": 7.546129661557931e-06, + "loss": 0.9037, + "step": 21135 + }, + { + "epoch": 1.19, + "grad_norm": 6.032709466254947, + "learning_rate": 7.544718937167316e-06, + "loss": 0.8482, + "step": 21140 + }, + { + "epoch": 1.19, + "grad_norm": 8.529650137045095, + "learning_rate": 7.54330793933542e-06, + "loss": 0.9236, + "step": 21145 + }, + { + "epoch": 1.19, + "grad_norm": 5.360157463969724, + "learning_rate": 7.541896668213863e-06, + "loss": 0.9694, + "step": 21150 + }, + { + "epoch": 1.19, + "grad_norm": 7.318288730102102, + "learning_rate": 7.540485123954291e-06, + "loss": 0.9218, + "step": 21155 + }, + { + "epoch": 1.19, + "grad_norm": 6.789017705133796, + "learning_rate": 7.5390733067083795e-06, + "loss": 0.9922, + "step": 21160 + }, + { + "epoch": 1.19, + "grad_norm": 15.418787983032518, + "learning_rate": 7.537661216627836e-06, + "loss": 0.9248, + "step": 21165 + }, + { + "epoch": 1.19, + "grad_norm": 7.534560377496262, + "learning_rate": 7.536248853864394e-06, + "loss": 0.9383, + "step": 21170 + }, + { + "epoch": 1.19, + "grad_norm": 5.449451161814772, + "learning_rate": 7.5348362185698195e-06, + "loss": 0.932, + "step": 21175 + }, + { + "epoch": 1.19, + "grad_norm": 6.34518898711807, + "learning_rate": 7.533423310895909e-06, + "loss": 0.9081, + "step": 21180 + }, + { + "epoch": 1.19, + "grad_norm": 11.198989088263259, + "learning_rate": 7.53201013099448e-06, + "loss": 0.9204, + "step": 21185 + }, + { + "epoch": 1.19, + "grad_norm": 15.046402607382952, + "learning_rate": 7.530596679017388e-06, + "loss": 0.9475, + "step": 21190 + }, + { + "epoch": 1.19, + "grad_norm": 5.303387107951348, + "learning_rate": 7.529182955116516e-06, + "loss": 0.952, + "step": 21195 + }, + { + "epoch": 1.19, + "grad_norm": 6.185400883833296, + "learning_rate": 7.527768959443772e-06, + "loss": 0.9683, + "step": 21200 + }, + { + "epoch": 1.19, + "grad_norm": 18.214167831779232, + "learning_rate": 7.526354692151098e-06, + "loss": 0.9326, + "step": 21205 + }, + { + "epoch": 1.2, + "grad_norm": 5.274897879322006, + "learning_rate": 7.524940153390462e-06, + "loss": 0.9173, + "step": 21210 + }, + { + "epoch": 1.2, + "grad_norm": 20.37349961726135, + "learning_rate": 7.523525343313864e-06, + "loss": 0.8891, + "step": 21215 + }, + { + "epoch": 1.2, + "grad_norm": 9.138431594683539, + "learning_rate": 7.52211026207333e-06, + "loss": 0.948, + "step": 21220 + }, + { + "epoch": 1.2, + "grad_norm": 7.763648864022242, + "learning_rate": 7.520694909820918e-06, + "loss": 0.8954, + "step": 21225 + }, + { + "epoch": 1.2, + "grad_norm": 9.48402849122678, + "learning_rate": 7.519279286708713e-06, + "loss": 0.9023, + "step": 21230 + }, + { + "epoch": 1.2, + "grad_norm": 18.763390265065176, + "learning_rate": 7.517863392888829e-06, + "loss": 0.909, + "step": 21235 + }, + { + "epoch": 1.2, + "grad_norm": 5.440875544119596, + "learning_rate": 7.516447228513414e-06, + "loss": 0.932, + "step": 21240 + }, + { + "epoch": 1.2, + "grad_norm": 11.305933083943643, + "learning_rate": 7.515030793734637e-06, + "loss": 0.9577, + "step": 21245 + }, + { + "epoch": 1.2, + "grad_norm": 11.556069501823375, + "learning_rate": 7.5136140887047016e-06, + "loss": 0.9316, + "step": 21250 + }, + { + "epoch": 1.2, + "grad_norm": 9.41055193641902, + "learning_rate": 7.512197113575839e-06, + "loss": 0.9318, + "step": 21255 + }, + { + "epoch": 1.2, + "grad_norm": 6.195446472655102, + "learning_rate": 7.510779868500309e-06, + "loss": 0.9352, + "step": 21260 + }, + { + "epoch": 1.2, + "grad_norm": 6.844228105465982, + "learning_rate": 7.509362353630401e-06, + "loss": 0.9383, + "step": 21265 + }, + { + "epoch": 1.2, + "grad_norm": 6.837580825549168, + "learning_rate": 7.507944569118433e-06, + "loss": 0.9371, + "step": 21270 + }, + { + "epoch": 1.2, + "grad_norm": 10.876513523293854, + "learning_rate": 7.506526515116753e-06, + "loss": 0.9466, + "step": 21275 + }, + { + "epoch": 1.2, + "grad_norm": 22.946683256536083, + "learning_rate": 7.5051081917777364e-06, + "loss": 0.9951, + "step": 21280 + }, + { + "epoch": 1.2, + "grad_norm": 6.975980402928209, + "learning_rate": 7.5036895992537905e-06, + "loss": 0.9339, + "step": 21285 + }, + { + "epoch": 1.2, + "grad_norm": 17.188663623906173, + "learning_rate": 7.5022707376973455e-06, + "loss": 0.9763, + "step": 21290 + }, + { + "epoch": 1.2, + "grad_norm": 5.7174814513903005, + "learning_rate": 7.500851607260869e-06, + "loss": 0.9151, + "step": 21295 + }, + { + "epoch": 1.2, + "grad_norm": 5.202518259656641, + "learning_rate": 7.4994322080968486e-06, + "loss": 0.9213, + "step": 21300 + }, + { + "epoch": 1.2, + "grad_norm": 8.760280416397578, + "learning_rate": 7.498012540357806e-06, + "loss": 0.9213, + "step": 21305 + }, + { + "epoch": 1.2, + "grad_norm": 5.546815554784757, + "learning_rate": 7.4965926041962925e-06, + "loss": 0.934, + "step": 21310 + }, + { + "epoch": 1.2, + "grad_norm": 5.195872702862795, + "learning_rate": 7.495172399764884e-06, + "loss": 0.8817, + "step": 21315 + }, + { + "epoch": 1.2, + "grad_norm": 5.558399093100658, + "learning_rate": 7.49375192721619e-06, + "loss": 0.8942, + "step": 21320 + }, + { + "epoch": 1.2, + "grad_norm": 13.761034264753144, + "learning_rate": 7.4923311867028456e-06, + "loss": 0.9103, + "step": 21325 + }, + { + "epoch": 1.2, + "grad_norm": 5.009522992860807, + "learning_rate": 7.490910178377516e-06, + "loss": 0.9288, + "step": 21330 + }, + { + "epoch": 1.2, + "grad_norm": 29.430300437780726, + "learning_rate": 7.489488902392894e-06, + "loss": 0.9224, + "step": 21335 + }, + { + "epoch": 1.2, + "grad_norm": 15.850009935907973, + "learning_rate": 7.488067358901703e-06, + "loss": 0.9613, + "step": 21340 + }, + { + "epoch": 1.2, + "grad_norm": 23.770443191064665, + "learning_rate": 7.486645548056694e-06, + "loss": 0.9567, + "step": 21345 + }, + { + "epoch": 1.2, + "grad_norm": 5.809567856585966, + "learning_rate": 7.485223470010645e-06, + "loss": 0.8932, + "step": 21350 + }, + { + "epoch": 1.2, + "grad_norm": 14.508141063152426, + "learning_rate": 7.4838011249163665e-06, + "loss": 0.9094, + "step": 21355 + }, + { + "epoch": 1.2, + "grad_norm": 17.768026330311038, + "learning_rate": 7.482378512926696e-06, + "loss": 0.8966, + "step": 21360 + }, + { + "epoch": 1.2, + "grad_norm": 10.05579203141165, + "learning_rate": 7.480955634194498e-06, + "loss": 0.9502, + "step": 21365 + }, + { + "epoch": 1.2, + "grad_norm": 7.104265135011229, + "learning_rate": 7.479532488872669e-06, + "loss": 0.9326, + "step": 21370 + }, + { + "epoch": 1.2, + "grad_norm": 11.467875400644987, + "learning_rate": 7.4781090771141295e-06, + "loss": 0.9207, + "step": 21375 + }, + { + "epoch": 1.2, + "grad_norm": 7.78996501239046, + "learning_rate": 7.476685399071834e-06, + "loss": 0.9533, + "step": 21380 + }, + { + "epoch": 1.2, + "grad_norm": 5.328382881534821, + "learning_rate": 7.475261454898761e-06, + "loss": 0.9038, + "step": 21385 + }, + { + "epoch": 1.21, + "grad_norm": 7.785160044829789, + "learning_rate": 7.473837244747921e-06, + "loss": 0.9649, + "step": 21390 + }, + { + "epoch": 1.21, + "grad_norm": 10.399980164405193, + "learning_rate": 7.472412768772351e-06, + "loss": 0.9478, + "step": 21395 + }, + { + "epoch": 1.21, + "grad_norm": 8.415761600421625, + "learning_rate": 7.470988027125118e-06, + "loss": 0.9874, + "step": 21400 + }, + { + "epoch": 1.21, + "grad_norm": 5.732511549440401, + "learning_rate": 7.469563019959315e-06, + "loss": 0.96, + "step": 21405 + }, + { + "epoch": 1.21, + "grad_norm": 10.220960037971349, + "learning_rate": 7.468137747428068e-06, + "loss": 0.9026, + "step": 21410 + }, + { + "epoch": 1.21, + "grad_norm": 30.03476848739393, + "learning_rate": 7.466712209684527e-06, + "loss": 0.9466, + "step": 21415 + }, + { + "epoch": 1.21, + "grad_norm": 14.309195809786209, + "learning_rate": 7.465286406881872e-06, + "loss": 0.9114, + "step": 21420 + }, + { + "epoch": 1.21, + "grad_norm": 5.219077562920409, + "learning_rate": 7.463860339173312e-06, + "loss": 0.9188, + "step": 21425 + }, + { + "epoch": 1.21, + "grad_norm": 8.142719146553722, + "learning_rate": 7.462434006712086e-06, + "loss": 0.9276, + "step": 21430 + }, + { + "epoch": 1.21, + "grad_norm": 5.711057394672944, + "learning_rate": 7.461007409651458e-06, + "loss": 0.9247, + "step": 21435 + }, + { + "epoch": 1.21, + "grad_norm": 6.127355597514079, + "learning_rate": 7.459580548144722e-06, + "loss": 0.9162, + "step": 21440 + }, + { + "epoch": 1.21, + "grad_norm": 9.040901079276335, + "learning_rate": 7.458153422345201e-06, + "loss": 0.9605, + "step": 21445 + }, + { + "epoch": 1.21, + "grad_norm": 5.797276859772071, + "learning_rate": 7.456726032406248e-06, + "loss": 0.9631, + "step": 21450 + }, + { + "epoch": 1.21, + "grad_norm": 21.903885998288168, + "learning_rate": 7.455298378481239e-06, + "loss": 0.9342, + "step": 21455 + }, + { + "epoch": 1.21, + "grad_norm": 12.759976237902354, + "learning_rate": 7.453870460723582e-06, + "loss": 0.932, + "step": 21460 + }, + { + "epoch": 1.21, + "grad_norm": 13.48102783908614, + "learning_rate": 7.452442279286717e-06, + "loss": 0.8964, + "step": 21465 + }, + { + "epoch": 1.21, + "grad_norm": 5.488482415759527, + "learning_rate": 7.451013834324103e-06, + "loss": 0.9109, + "step": 21470 + }, + { + "epoch": 1.21, + "grad_norm": 5.331598137248048, + "learning_rate": 7.449585125989237e-06, + "loss": 0.9643, + "step": 21475 + }, + { + "epoch": 1.21, + "grad_norm": 8.654772828867728, + "learning_rate": 7.4481561544356385e-06, + "loss": 0.982, + "step": 21480 + }, + { + "epoch": 1.21, + "grad_norm": 11.096943034014457, + "learning_rate": 7.446726919816856e-06, + "loss": 0.8857, + "step": 21485 + }, + { + "epoch": 1.21, + "grad_norm": 14.639059797088008, + "learning_rate": 7.445297422286466e-06, + "loss": 0.9105, + "step": 21490 + }, + { + "epoch": 1.21, + "grad_norm": 9.773557868297447, + "learning_rate": 7.443867661998079e-06, + "loss": 0.8886, + "step": 21495 + }, + { + "epoch": 1.21, + "grad_norm": 16.9337824298745, + "learning_rate": 7.442437639105325e-06, + "loss": 0.9833, + "step": 21500 + }, + { + "epoch": 1.21, + "grad_norm": 8.773334279231266, + "learning_rate": 7.4410073537618675e-06, + "loss": 0.934, + "step": 21505 + }, + { + "epoch": 1.21, + "grad_norm": 26.863583338083775, + "learning_rate": 7.439576806121397e-06, + "loss": 0.9491, + "step": 21510 + }, + { + "epoch": 1.21, + "grad_norm": 12.421138787209415, + "learning_rate": 7.438145996337631e-06, + "loss": 0.9447, + "step": 21515 + }, + { + "epoch": 1.21, + "grad_norm": 18.72126609755657, + "learning_rate": 7.436714924564318e-06, + "loss": 0.9403, + "step": 21520 + }, + { + "epoch": 1.21, + "grad_norm": 8.682958017245936, + "learning_rate": 7.4352835909552335e-06, + "loss": 0.8809, + "step": 21525 + }, + { + "epoch": 1.21, + "grad_norm": 11.099390859872644, + "learning_rate": 7.433851995664176e-06, + "loss": 0.9099, + "step": 21530 + }, + { + "epoch": 1.21, + "grad_norm": 8.537055772326045, + "learning_rate": 7.432420138844983e-06, + "loss": 0.9575, + "step": 21535 + }, + { + "epoch": 1.21, + "grad_norm": 12.920949571431883, + "learning_rate": 7.4309880206515105e-06, + "loss": 0.9711, + "step": 21540 + }, + { + "epoch": 1.21, + "grad_norm": 12.757306521300025, + "learning_rate": 7.4295556412376464e-06, + "loss": 0.9453, + "step": 21545 + }, + { + "epoch": 1.21, + "grad_norm": 14.033007300876887, + "learning_rate": 7.4281230007573056e-06, + "loss": 0.9286, + "step": 21550 + }, + { + "epoch": 1.21, + "grad_norm": 7.266553157499411, + "learning_rate": 7.426690099364434e-06, + "loss": 0.9516, + "step": 21555 + }, + { + "epoch": 1.21, + "grad_norm": 6.135476926450496, + "learning_rate": 7.425256937213e-06, + "loss": 0.9494, + "step": 21560 + }, + { + "epoch": 1.22, + "grad_norm": 9.423487480789941, + "learning_rate": 7.423823514457005e-06, + "loss": 0.9238, + "step": 21565 + }, + { + "epoch": 1.22, + "grad_norm": 9.736953209377333, + "learning_rate": 7.4223898312504765e-06, + "loss": 0.9434, + "step": 21570 + }, + { + "epoch": 1.22, + "grad_norm": 16.3092760122703, + "learning_rate": 7.420955887747469e-06, + "loss": 0.9125, + "step": 21575 + }, + { + "epoch": 1.22, + "grad_norm": 7.908835585561126, + "learning_rate": 7.419521684102067e-06, + "loss": 0.9766, + "step": 21580 + }, + { + "epoch": 1.22, + "grad_norm": 5.8815841406236, + "learning_rate": 7.418087220468382e-06, + "loss": 0.879, + "step": 21585 + }, + { + "epoch": 1.22, + "grad_norm": 5.906136238731885, + "learning_rate": 7.4166524970005536e-06, + "loss": 0.9168, + "step": 21590 + }, + { + "epoch": 1.22, + "grad_norm": 7.799089867629139, + "learning_rate": 7.415217513852748e-06, + "loss": 0.9258, + "step": 21595 + }, + { + "epoch": 1.22, + "grad_norm": 5.953161289922542, + "learning_rate": 7.413782271179162e-06, + "loss": 0.9014, + "step": 21600 + }, + { + "epoch": 1.22, + "grad_norm": 6.923640837621932, + "learning_rate": 7.412346769134019e-06, + "loss": 0.9164, + "step": 21605 + }, + { + "epoch": 1.22, + "grad_norm": 14.616238957462768, + "learning_rate": 7.410911007871568e-06, + "loss": 0.9133, + "step": 21610 + }, + { + "epoch": 1.22, + "grad_norm": 8.579105032762657, + "learning_rate": 7.4094749875460906e-06, + "loss": 0.9495, + "step": 21615 + }, + { + "epoch": 1.22, + "grad_norm": 19.97138268909543, + "learning_rate": 7.408038708311891e-06, + "loss": 0.9299, + "step": 21620 + }, + { + "epoch": 1.22, + "grad_norm": 15.012023377241391, + "learning_rate": 7.406602170323303e-06, + "loss": 0.9391, + "step": 21625 + }, + { + "epoch": 1.22, + "grad_norm": 13.149749929869508, + "learning_rate": 7.405165373734692e-06, + "loss": 0.9527, + "step": 21630 + }, + { + "epoch": 1.22, + "grad_norm": 10.553269110539492, + "learning_rate": 7.4037283187004475e-06, + "loss": 0.9511, + "step": 21635 + }, + { + "epoch": 1.22, + "grad_norm": 17.591005957525322, + "learning_rate": 7.402291005374986e-06, + "loss": 0.9664, + "step": 21640 + }, + { + "epoch": 1.22, + "grad_norm": 9.285002619746813, + "learning_rate": 7.400853433912754e-06, + "loss": 0.915, + "step": 21645 + }, + { + "epoch": 1.22, + "grad_norm": 9.896674721777744, + "learning_rate": 7.3994156044682255e-06, + "loss": 0.9036, + "step": 21650 + }, + { + "epoch": 1.22, + "grad_norm": 20.636618262462036, + "learning_rate": 7.3979775171959e-06, + "loss": 0.9748, + "step": 21655 + }, + { + "epoch": 1.22, + "grad_norm": 16.722986156127867, + "learning_rate": 7.396539172250306e-06, + "loss": 0.9033, + "step": 21660 + }, + { + "epoch": 1.22, + "grad_norm": 15.127526630379863, + "learning_rate": 7.395100569786004e-06, + "loss": 0.8972, + "step": 21665 + }, + { + "epoch": 1.22, + "grad_norm": 13.817841883447162, + "learning_rate": 7.393661709957573e-06, + "loss": 0.9164, + "step": 21670 + }, + { + "epoch": 1.22, + "grad_norm": 20.949975362196888, + "learning_rate": 7.392222592919628e-06, + "loss": 0.8962, + "step": 21675 + }, + { + "epoch": 1.22, + "grad_norm": 19.791113841050933, + "learning_rate": 7.390783218826808e-06, + "loss": 0.9042, + "step": 21680 + }, + { + "epoch": 1.22, + "grad_norm": 19.47892949124888, + "learning_rate": 7.389343587833779e-06, + "loss": 0.938, + "step": 21685 + }, + { + "epoch": 1.22, + "grad_norm": 27.056777358833994, + "learning_rate": 7.387903700095238e-06, + "loss": 0.9364, + "step": 21690 + }, + { + "epoch": 1.22, + "grad_norm": 6.166185386748777, + "learning_rate": 7.386463555765906e-06, + "loss": 0.8985, + "step": 21695 + }, + { + "epoch": 1.22, + "grad_norm": 6.714153460423674, + "learning_rate": 7.38502315500053e-06, + "loss": 0.911, + "step": 21700 + }, + { + "epoch": 1.22, + "grad_norm": 11.387898092159356, + "learning_rate": 7.383582497953893e-06, + "loss": 0.9294, + "step": 21705 + }, + { + "epoch": 1.22, + "grad_norm": 14.956411073682496, + "learning_rate": 7.382141584780795e-06, + "loss": 0.9063, + "step": 21710 + }, + { + "epoch": 1.22, + "grad_norm": 6.356845930989436, + "learning_rate": 7.380700415636071e-06, + "loss": 0.9089, + "step": 21715 + }, + { + "epoch": 1.22, + "grad_norm": 5.568343275081348, + "learning_rate": 7.379258990674581e-06, + "loss": 0.9163, + "step": 21720 + }, + { + "epoch": 1.22, + "grad_norm": 20.280649080030962, + "learning_rate": 7.377817310051212e-06, + "loss": 0.9707, + "step": 21725 + }, + { + "epoch": 1.22, + "grad_norm": 5.924771144652284, + "learning_rate": 7.376375373920879e-06, + "loss": 0.9232, + "step": 21730 + }, + { + "epoch": 1.22, + "grad_norm": 8.244365524235691, + "learning_rate": 7.374933182438524e-06, + "loss": 0.89, + "step": 21735 + }, + { + "epoch": 1.22, + "grad_norm": 10.816340246901364, + "learning_rate": 7.373490735759117e-06, + "loss": 0.9262, + "step": 21740 + }, + { + "epoch": 1.23, + "grad_norm": 4.861367749097658, + "learning_rate": 7.372048034037655e-06, + "loss": 0.9358, + "step": 21745 + }, + { + "epoch": 1.23, + "grad_norm": 12.850857046818518, + "learning_rate": 7.370605077429165e-06, + "loss": 0.9211, + "step": 21750 + }, + { + "epoch": 1.23, + "grad_norm": 6.384409376966101, + "learning_rate": 7.369161866088695e-06, + "loss": 0.959, + "step": 21755 + }, + { + "epoch": 1.23, + "grad_norm": 5.583561928352578, + "learning_rate": 7.367718400171326e-06, + "loss": 0.9876, + "step": 21760 + }, + { + "epoch": 1.23, + "grad_norm": 12.657757260203732, + "learning_rate": 7.366274679832167e-06, + "loss": 0.9265, + "step": 21765 + }, + { + "epoch": 1.23, + "grad_norm": 6.278416652005298, + "learning_rate": 7.364830705226351e-06, + "loss": 0.9088, + "step": 21770 + }, + { + "epoch": 1.23, + "grad_norm": 5.627095533447477, + "learning_rate": 7.363386476509038e-06, + "loss": 0.9117, + "step": 21775 + }, + { + "epoch": 1.23, + "grad_norm": 10.149637475153247, + "learning_rate": 7.361941993835418e-06, + "loss": 0.894, + "step": 21780 + }, + { + "epoch": 1.23, + "grad_norm": 7.040082047973201, + "learning_rate": 7.360497257360707e-06, + "loss": 0.9465, + "step": 21785 + }, + { + "epoch": 1.23, + "grad_norm": 15.855224616200738, + "learning_rate": 7.3590522672401475e-06, + "loss": 0.9296, + "step": 21790 + }, + { + "epoch": 1.23, + "grad_norm": 11.378131125789015, + "learning_rate": 7.357607023629011e-06, + "loss": 0.9305, + "step": 21795 + }, + { + "epoch": 1.23, + "grad_norm": 7.04528733289192, + "learning_rate": 7.356161526682595e-06, + "loss": 0.9826, + "step": 21800 + }, + { + "epoch": 1.23, + "grad_norm": 6.635493519426648, + "learning_rate": 7.354715776556224e-06, + "loss": 0.9241, + "step": 21805 + }, + { + "epoch": 1.23, + "grad_norm": 5.598379898325025, + "learning_rate": 7.35326977340525e-06, + "loss": 0.9403, + "step": 21810 + }, + { + "epoch": 1.23, + "grad_norm": 5.225927086509728, + "learning_rate": 7.351823517385054e-06, + "loss": 0.8884, + "step": 21815 + }, + { + "epoch": 1.23, + "grad_norm": 12.162349727177647, + "learning_rate": 7.350377008651042e-06, + "loss": 0.9734, + "step": 21820 + }, + { + "epoch": 1.23, + "grad_norm": 26.460339598551055, + "learning_rate": 7.348930247358646e-06, + "loss": 0.9369, + "step": 21825 + }, + { + "epoch": 1.23, + "grad_norm": 14.16657239966183, + "learning_rate": 7.347483233663329e-06, + "loss": 0.9436, + "step": 21830 + }, + { + "epoch": 1.23, + "grad_norm": 10.147835912037744, + "learning_rate": 7.346035967720579e-06, + "loss": 0.898, + "step": 21835 + }, + { + "epoch": 1.23, + "grad_norm": 25.073306251006585, + "learning_rate": 7.34458844968591e-06, + "loss": 0.9495, + "step": 21840 + }, + { + "epoch": 1.23, + "grad_norm": 9.828376469408942, + "learning_rate": 7.343140679714865e-06, + "loss": 0.8966, + "step": 21845 + }, + { + "epoch": 1.23, + "grad_norm": 6.967412542946501, + "learning_rate": 7.341692657963012e-06, + "loss": 0.9045, + "step": 21850 + }, + { + "epoch": 1.23, + "grad_norm": 15.15952437296701, + "learning_rate": 7.340244384585948e-06, + "loss": 0.9121, + "step": 21855 + }, + { + "epoch": 1.23, + "grad_norm": 7.247090804048587, + "learning_rate": 7.338795859739297e-06, + "loss": 0.9536, + "step": 21860 + }, + { + "epoch": 1.23, + "grad_norm": 5.935108018325918, + "learning_rate": 7.33734708357871e-06, + "loss": 0.9342, + "step": 21865 + }, + { + "epoch": 1.23, + "grad_norm": 10.413797889212892, + "learning_rate": 7.335898056259862e-06, + "loss": 0.9729, + "step": 21870 + }, + { + "epoch": 1.23, + "grad_norm": 5.6783359022760145, + "learning_rate": 7.33444877793846e-06, + "loss": 0.935, + "step": 21875 + }, + { + "epoch": 1.23, + "grad_norm": 24.12750081090191, + "learning_rate": 7.332999248770233e-06, + "loss": 0.9092, + "step": 21880 + }, + { + "epoch": 1.23, + "grad_norm": 13.755652262046134, + "learning_rate": 7.331549468910941e-06, + "loss": 0.9301, + "step": 21885 + }, + { + "epoch": 1.23, + "grad_norm": 7.591485909480699, + "learning_rate": 7.3300994385163684e-06, + "loss": 0.8947, + "step": 21890 + }, + { + "epoch": 1.23, + "grad_norm": 10.388719757046221, + "learning_rate": 7.328649157742328e-06, + "loss": 0.8905, + "step": 21895 + }, + { + "epoch": 1.23, + "grad_norm": 11.494169298269728, + "learning_rate": 7.327198626744658e-06, + "loss": 0.8947, + "step": 21900 + }, + { + "epoch": 1.23, + "grad_norm": 51.140332101414835, + "learning_rate": 7.325747845679225e-06, + "loss": 0.9236, + "step": 21905 + }, + { + "epoch": 1.23, + "grad_norm": 20.862028523405275, + "learning_rate": 7.324296814701921e-06, + "loss": 0.9417, + "step": 21910 + }, + { + "epoch": 1.23, + "grad_norm": 12.58923384462542, + "learning_rate": 7.322845533968668e-06, + "loss": 0.9265, + "step": 21915 + }, + { + "epoch": 1.24, + "grad_norm": 15.91100161901468, + "learning_rate": 7.321394003635409e-06, + "loss": 0.9087, + "step": 21920 + }, + { + "epoch": 1.24, + "grad_norm": 5.736212626380114, + "learning_rate": 7.319942223858121e-06, + "loss": 0.9458, + "step": 21925 + }, + { + "epoch": 1.24, + "grad_norm": 22.091160883422937, + "learning_rate": 7.318490194792803e-06, + "loss": 0.9176, + "step": 21930 + }, + { + "epoch": 1.24, + "grad_norm": 6.153667035667461, + "learning_rate": 7.317037916595479e-06, + "loss": 0.9271, + "step": 21935 + }, + { + "epoch": 1.24, + "grad_norm": 13.627680599228054, + "learning_rate": 7.315585389422205e-06, + "loss": 0.9498, + "step": 21940 + }, + { + "epoch": 1.24, + "grad_norm": 7.114959405659302, + "learning_rate": 7.3141326134290624e-06, + "loss": 0.9397, + "step": 21945 + }, + { + "epoch": 1.24, + "grad_norm": 20.562798526521103, + "learning_rate": 7.312679588772158e-06, + "loss": 0.9259, + "step": 21950 + }, + { + "epoch": 1.24, + "grad_norm": 9.942286974927281, + "learning_rate": 7.3112263156076236e-06, + "loss": 0.9455, + "step": 21955 + }, + { + "epoch": 1.24, + "grad_norm": 9.377704135118599, + "learning_rate": 7.309772794091623e-06, + "loss": 0.8948, + "step": 21960 + }, + { + "epoch": 1.24, + "grad_norm": 7.064833454227902, + "learning_rate": 7.30831902438034e-06, + "loss": 0.8722, + "step": 21965 + }, + { + "epoch": 1.24, + "grad_norm": 5.788425899394792, + "learning_rate": 7.3068650066299915e-06, + "loss": 0.9434, + "step": 21970 + }, + { + "epoch": 1.24, + "grad_norm": 6.477282345638749, + "learning_rate": 7.305410740996818e-06, + "loss": 0.8903, + "step": 21975 + }, + { + "epoch": 1.24, + "grad_norm": 11.534544233389502, + "learning_rate": 7.303956227637086e-06, + "loss": 0.9083, + "step": 21980 + }, + { + "epoch": 1.24, + "grad_norm": 18.62849064233974, + "learning_rate": 7.302501466707087e-06, + "loss": 0.8949, + "step": 21985 + }, + { + "epoch": 1.24, + "grad_norm": 5.411750978777702, + "learning_rate": 7.301046458363143e-06, + "loss": 0.9314, + "step": 21990 + }, + { + "epoch": 1.24, + "grad_norm": 9.274626813199493, + "learning_rate": 7.299591202761604e-06, + "loss": 0.9111, + "step": 21995 + }, + { + "epoch": 1.24, + "grad_norm": 13.468538903210115, + "learning_rate": 7.298135700058842e-06, + "loss": 0.9599, + "step": 22000 + }, + { + "epoch": 1.24, + "grad_norm": 14.67525963744242, + "learning_rate": 7.296679950411255e-06, + "loss": 0.9702, + "step": 22005 + }, + { + "epoch": 1.24, + "grad_norm": 52.370173525848614, + "learning_rate": 7.295223953975272e-06, + "loss": 0.9188, + "step": 22010 + }, + { + "epoch": 1.24, + "grad_norm": 27.721089084373194, + "learning_rate": 7.293767710907347e-06, + "loss": 0.936, + "step": 22015 + }, + { + "epoch": 1.24, + "grad_norm": 13.564599315104779, + "learning_rate": 7.292311221363956e-06, + "loss": 0.9012, + "step": 22020 + }, + { + "epoch": 1.24, + "grad_norm": 6.409185560916445, + "learning_rate": 7.2908544855016086e-06, + "loss": 0.9962, + "step": 22025 + }, + { + "epoch": 1.24, + "grad_norm": 9.684631487459809, + "learning_rate": 7.289397503476836e-06, + "loss": 0.9282, + "step": 22030 + }, + { + "epoch": 1.24, + "grad_norm": 20.845526471512144, + "learning_rate": 7.287940275446197e-06, + "loss": 0.9362, + "step": 22035 + }, + { + "epoch": 1.24, + "grad_norm": 7.776968972980009, + "learning_rate": 7.286482801566279e-06, + "loss": 0.9185, + "step": 22040 + }, + { + "epoch": 1.24, + "grad_norm": 12.349950934724852, + "learning_rate": 7.285025081993693e-06, + "loss": 0.9342, + "step": 22045 + }, + { + "epoch": 1.24, + "grad_norm": 6.598907784480035, + "learning_rate": 7.283567116885076e-06, + "loss": 0.9633, + "step": 22050 + }, + { + "epoch": 1.24, + "grad_norm": 10.723900511725269, + "learning_rate": 7.282108906397096e-06, + "loss": 0.9109, + "step": 22055 + }, + { + "epoch": 1.24, + "grad_norm": 7.507360648128066, + "learning_rate": 7.2806504506864415e-06, + "loss": 0.9383, + "step": 22060 + }, + { + "epoch": 1.24, + "grad_norm": 10.991026950664851, + "learning_rate": 7.2791917499098304e-06, + "loss": 0.9638, + "step": 22065 + }, + { + "epoch": 1.24, + "grad_norm": 6.61456900114781, + "learning_rate": 7.277732804224008e-06, + "loss": 0.9348, + "step": 22070 + }, + { + "epoch": 1.24, + "grad_norm": 13.136769181931975, + "learning_rate": 7.2762736137857425e-06, + "loss": 0.9043, + "step": 22075 + }, + { + "epoch": 1.24, + "grad_norm": 7.535947743066793, + "learning_rate": 7.274814178751832e-06, + "loss": 0.9371, + "step": 22080 + }, + { + "epoch": 1.24, + "grad_norm": 6.149668446261899, + "learning_rate": 7.273354499279097e-06, + "loss": 0.9109, + "step": 22085 + }, + { + "epoch": 1.24, + "grad_norm": 11.259569954106862, + "learning_rate": 7.271894575524389e-06, + "loss": 0.9372, + "step": 22090 + }, + { + "epoch": 1.24, + "grad_norm": 6.520044363602685, + "learning_rate": 7.270434407644582e-06, + "loss": 0.9342, + "step": 22095 + }, + { + "epoch": 1.25, + "grad_norm": 7.69510894085174, + "learning_rate": 7.2689739957965786e-06, + "loss": 0.9227, + "step": 22100 + }, + { + "epoch": 1.25, + "grad_norm": 6.798343423538765, + "learning_rate": 7.267513340137304e-06, + "loss": 0.9684, + "step": 22105 + }, + { + "epoch": 1.25, + "grad_norm": 17.145059397665772, + "learning_rate": 7.266052440823715e-06, + "loss": 0.9256, + "step": 22110 + }, + { + "epoch": 1.25, + "grad_norm": 12.415302947740033, + "learning_rate": 7.26459129801279e-06, + "loss": 0.9039, + "step": 22115 + }, + { + "epoch": 1.25, + "grad_norm": 6.714704079424068, + "learning_rate": 7.263129911861535e-06, + "loss": 0.9035, + "step": 22120 + }, + { + "epoch": 1.25, + "grad_norm": 12.01531219978725, + "learning_rate": 7.2616682825269835e-06, + "loss": 0.9721, + "step": 22125 + }, + { + "epoch": 1.25, + "grad_norm": 9.33725618236222, + "learning_rate": 7.260206410166194e-06, + "loss": 0.8885, + "step": 22130 + }, + { + "epoch": 1.25, + "grad_norm": 12.944860265352075, + "learning_rate": 7.2587442949362505e-06, + "loss": 0.9016, + "step": 22135 + }, + { + "epoch": 1.25, + "grad_norm": 5.607350250519633, + "learning_rate": 7.257281936994264e-06, + "loss": 0.9153, + "step": 22140 + }, + { + "epoch": 1.25, + "grad_norm": 5.831291594951576, + "learning_rate": 7.255819336497371e-06, + "loss": 0.9105, + "step": 22145 + }, + { + "epoch": 1.25, + "grad_norm": 10.073914295879671, + "learning_rate": 7.254356493602734e-06, + "loss": 0.9161, + "step": 22150 + }, + { + "epoch": 1.25, + "grad_norm": 12.058757566653028, + "learning_rate": 7.252893408467544e-06, + "loss": 0.9368, + "step": 22155 + }, + { + "epoch": 1.25, + "grad_norm": 10.284533039182037, + "learning_rate": 7.251430081249013e-06, + "loss": 0.9216, + "step": 22160 + }, + { + "epoch": 1.25, + "grad_norm": 17.434267380137317, + "learning_rate": 7.249966512104384e-06, + "loss": 0.9177, + "step": 22165 + }, + { + "epoch": 1.25, + "grad_norm": 6.465102217859059, + "learning_rate": 7.2485027011909234e-06, + "loss": 1.0202, + "step": 22170 + }, + { + "epoch": 1.25, + "grad_norm": 6.6467907833389726, + "learning_rate": 7.247038648665924e-06, + "loss": 0.8691, + "step": 22175 + }, + { + "epoch": 1.25, + "grad_norm": 6.7271965196415815, + "learning_rate": 7.245574354686705e-06, + "loss": 0.9186, + "step": 22180 + }, + { + "epoch": 1.25, + "grad_norm": 5.83492658218162, + "learning_rate": 7.244109819410612e-06, + "loss": 0.8958, + "step": 22185 + }, + { + "epoch": 1.25, + "grad_norm": 14.424020275522027, + "learning_rate": 7.2426450429950134e-06, + "loss": 0.9232, + "step": 22190 + }, + { + "epoch": 1.25, + "grad_norm": 26.204666743484392, + "learning_rate": 7.241180025597307e-06, + "loss": 0.9251, + "step": 22195 + }, + { + "epoch": 1.25, + "grad_norm": 8.926425544551613, + "learning_rate": 7.2397147673749165e-06, + "loss": 0.947, + "step": 22200 + }, + { + "epoch": 1.25, + "grad_norm": 6.067536442785645, + "learning_rate": 7.2382492684852914e-06, + "loss": 0.9836, + "step": 22205 + }, + { + "epoch": 1.25, + "grad_norm": 6.695775362583674, + "learning_rate": 7.236783529085903e-06, + "loss": 0.9223, + "step": 22210 + }, + { + "epoch": 1.25, + "grad_norm": 11.453321780226416, + "learning_rate": 7.235317549334253e-06, + "loss": 0.9266, + "step": 22215 + }, + { + "epoch": 1.25, + "grad_norm": 5.275123053036545, + "learning_rate": 7.233851329387865e-06, + "loss": 0.9022, + "step": 22220 + }, + { + "epoch": 1.25, + "grad_norm": 6.7271395119769375, + "learning_rate": 7.232384869404294e-06, + "loss": 0.9248, + "step": 22225 + }, + { + "epoch": 1.25, + "grad_norm": 7.428463664840372, + "learning_rate": 7.230918169541117e-06, + "loss": 0.8752, + "step": 22230 + }, + { + "epoch": 1.25, + "grad_norm": 6.946358985156184, + "learning_rate": 7.229451229955937e-06, + "loss": 0.8365, + "step": 22235 + }, + { + "epoch": 1.25, + "grad_norm": 42.88190560959908, + "learning_rate": 7.227984050806385e-06, + "loss": 0.9866, + "step": 22240 + }, + { + "epoch": 1.25, + "grad_norm": 21.488661025559217, + "learning_rate": 7.226516632250111e-06, + "loss": 0.9286, + "step": 22245 + }, + { + "epoch": 1.25, + "grad_norm": 10.548351512433426, + "learning_rate": 7.225048974444799e-06, + "loss": 0.9127, + "step": 22250 + }, + { + "epoch": 1.25, + "grad_norm": 5.534061759613512, + "learning_rate": 7.223581077548155e-06, + "loss": 0.8978, + "step": 22255 + }, + { + "epoch": 1.25, + "grad_norm": 12.266938124903895, + "learning_rate": 7.222112941717911e-06, + "loss": 0.955, + "step": 22260 + }, + { + "epoch": 1.25, + "grad_norm": 10.432951922879372, + "learning_rate": 7.220644567111826e-06, + "loss": 0.929, + "step": 22265 + }, + { + "epoch": 1.25, + "grad_norm": 11.41367274447374, + "learning_rate": 7.2191759538876784e-06, + "loss": 0.9076, + "step": 22270 + }, + { + "epoch": 1.26, + "grad_norm": 14.087754794968259, + "learning_rate": 7.217707102203283e-06, + "loss": 0.935, + "step": 22275 + }, + { + "epoch": 1.26, + "grad_norm": 18.714156003685, + "learning_rate": 7.216238012216471e-06, + "loss": 0.9085, + "step": 22280 + }, + { + "epoch": 1.26, + "grad_norm": 44.57032979041386, + "learning_rate": 7.214768684085104e-06, + "loss": 0.9322, + "step": 22285 + }, + { + "epoch": 1.26, + "grad_norm": 16.515869012725076, + "learning_rate": 7.213299117967068e-06, + "loss": 0.949, + "step": 22290 + }, + { + "epoch": 1.26, + "grad_norm": 14.425743156187252, + "learning_rate": 7.211829314020273e-06, + "loss": 0.9036, + "step": 22295 + }, + { + "epoch": 1.26, + "grad_norm": 27.190723723781804, + "learning_rate": 7.2103592724026575e-06, + "loss": 0.8982, + "step": 22300 + }, + { + "epoch": 1.26, + "grad_norm": 5.355329808517583, + "learning_rate": 7.208888993272182e-06, + "loss": 0.9073, + "step": 22305 + }, + { + "epoch": 1.26, + "grad_norm": 8.013564120365336, + "learning_rate": 7.207418476786836e-06, + "loss": 0.9493, + "step": 22310 + }, + { + "epoch": 1.26, + "grad_norm": 7.906256703208929, + "learning_rate": 7.205947723104632e-06, + "loss": 0.9332, + "step": 22315 + }, + { + "epoch": 1.26, + "grad_norm": 24.562258183054485, + "learning_rate": 7.2044767323836095e-06, + "loss": 0.9374, + "step": 22320 + }, + { + "epoch": 1.26, + "grad_norm": 18.557564362257907, + "learning_rate": 7.203005504781835e-06, + "loss": 0.9434, + "step": 22325 + }, + { + "epoch": 1.26, + "grad_norm": 11.335231646004091, + "learning_rate": 7.201534040457394e-06, + "loss": 0.8839, + "step": 22330 + }, + { + "epoch": 1.26, + "grad_norm": 8.212476331170947, + "learning_rate": 7.2000623395684055e-06, + "loss": 0.9261, + "step": 22335 + }, + { + "epoch": 1.26, + "grad_norm": 6.275979280902316, + "learning_rate": 7.198590402273009e-06, + "loss": 0.9194, + "step": 22340 + }, + { + "epoch": 1.26, + "grad_norm": 5.3114264949476, + "learning_rate": 7.1971182287293705e-06, + "loss": 0.894, + "step": 22345 + }, + { + "epoch": 1.26, + "grad_norm": 6.174337369339983, + "learning_rate": 7.195645819095682e-06, + "loss": 0.9127, + "step": 22350 + }, + { + "epoch": 1.26, + "grad_norm": 5.262321560933757, + "learning_rate": 7.194173173530159e-06, + "loss": 0.9321, + "step": 22355 + }, + { + "epoch": 1.26, + "grad_norm": 8.37327235806034, + "learning_rate": 7.192700292191046e-06, + "loss": 0.9063, + "step": 22360 + }, + { + "epoch": 1.26, + "grad_norm": 5.575711988987514, + "learning_rate": 7.191227175236609e-06, + "loss": 0.891, + "step": 22365 + }, + { + "epoch": 1.26, + "grad_norm": 5.208225346556525, + "learning_rate": 7.189753822825142e-06, + "loss": 0.8968, + "step": 22370 + }, + { + "epoch": 1.26, + "grad_norm": 5.264383245038347, + "learning_rate": 7.188280235114962e-06, + "loss": 0.9163, + "step": 22375 + }, + { + "epoch": 1.26, + "grad_norm": 7.749216252748451, + "learning_rate": 7.1868064122644135e-06, + "loss": 0.9285, + "step": 22380 + }, + { + "epoch": 1.26, + "grad_norm": 6.920590199400951, + "learning_rate": 7.185332354431864e-06, + "loss": 0.9474, + "step": 22385 + }, + { + "epoch": 1.26, + "grad_norm": 6.82044116577742, + "learning_rate": 7.18385806177571e-06, + "loss": 0.9428, + "step": 22390 + }, + { + "epoch": 1.26, + "grad_norm": 11.50893858938991, + "learning_rate": 7.182383534454368e-06, + "loss": 0.9068, + "step": 22395 + }, + { + "epoch": 1.26, + "grad_norm": 5.595875930238892, + "learning_rate": 7.180908772626284e-06, + "loss": 0.9012, + "step": 22400 + }, + { + "epoch": 1.26, + "grad_norm": 13.60964831714443, + "learning_rate": 7.179433776449927e-06, + "loss": 0.9271, + "step": 22405 + }, + { + "epoch": 1.26, + "grad_norm": 6.662641977688752, + "learning_rate": 7.177958546083792e-06, + "loss": 0.9508, + "step": 22410 + }, + { + "epoch": 1.26, + "grad_norm": 8.434300757203431, + "learning_rate": 7.1764830816864e-06, + "loss": 0.9888, + "step": 22415 + }, + { + "epoch": 1.26, + "grad_norm": 11.975524181675334, + "learning_rate": 7.175007383416295e-06, + "loss": 1.0058, + "step": 22420 + }, + { + "epoch": 1.26, + "grad_norm": 17.26329533226023, + "learning_rate": 7.173531451432047e-06, + "loss": 0.9366, + "step": 22425 + }, + { + "epoch": 1.26, + "grad_norm": 6.718118615736259, + "learning_rate": 7.1720552858922525e-06, + "loss": 0.8634, + "step": 22430 + }, + { + "epoch": 1.26, + "grad_norm": 8.28238513878075, + "learning_rate": 7.170578886955531e-06, + "loss": 0.9186, + "step": 22435 + }, + { + "epoch": 1.26, + "grad_norm": 15.053497009922532, + "learning_rate": 7.16910225478053e-06, + "loss": 0.9129, + "step": 22440 + }, + { + "epoch": 1.26, + "grad_norm": 14.21849780760886, + "learning_rate": 7.167625389525917e-06, + "loss": 0.9374, + "step": 22445 + }, + { + "epoch": 1.26, + "grad_norm": 30.521970668345666, + "learning_rate": 7.16614829135039e-06, + "loss": 0.8727, + "step": 22450 + }, + { + "epoch": 1.27, + "grad_norm": 4.898277938310586, + "learning_rate": 7.164670960412668e-06, + "loss": 0.9136, + "step": 22455 + }, + { + "epoch": 1.27, + "grad_norm": 20.87439964665969, + "learning_rate": 7.163193396871499e-06, + "loss": 0.8712, + "step": 22460 + }, + { + "epoch": 1.27, + "grad_norm": 35.99702937196072, + "learning_rate": 7.1617156008856545e-06, + "loss": 0.9381, + "step": 22465 + }, + { + "epoch": 1.27, + "grad_norm": 7.976611763911491, + "learning_rate": 7.160237572613927e-06, + "loss": 0.9553, + "step": 22470 + }, + { + "epoch": 1.27, + "grad_norm": 21.086741200334796, + "learning_rate": 7.158759312215138e-06, + "loss": 0.9335, + "step": 22475 + }, + { + "epoch": 1.27, + "grad_norm": 49.651339677398546, + "learning_rate": 7.157280819848133e-06, + "loss": 0.9425, + "step": 22480 + }, + { + "epoch": 1.27, + "grad_norm": 11.191181905501226, + "learning_rate": 7.155802095671784e-06, + "loss": 0.9171, + "step": 22485 + }, + { + "epoch": 1.27, + "grad_norm": 55.63138015363943, + "learning_rate": 7.154323139844985e-06, + "loss": 0.9206, + "step": 22490 + }, + { + "epoch": 1.27, + "grad_norm": 8.696177285242326, + "learning_rate": 7.152843952526658e-06, + "loss": 0.9754, + "step": 22495 + }, + { + "epoch": 1.27, + "grad_norm": 24.476771225750387, + "learning_rate": 7.151364533875746e-06, + "loss": 0.8878, + "step": 22500 + }, + { + "epoch": 1.27, + "grad_norm": 9.538184329075, + "learning_rate": 7.149884884051221e-06, + "loss": 0.9269, + "step": 22505 + }, + { + "epoch": 1.27, + "grad_norm": 43.2767332850137, + "learning_rate": 7.1484050032120755e-06, + "loss": 0.9051, + "step": 22510 + }, + { + "epoch": 1.27, + "grad_norm": 14.320841922432187, + "learning_rate": 7.146924891517332e-06, + "loss": 0.9053, + "step": 22515 + }, + { + "epoch": 1.27, + "grad_norm": 10.649112120286738, + "learning_rate": 7.1454445491260325e-06, + "loss": 0.9733, + "step": 22520 + }, + { + "epoch": 1.27, + "grad_norm": 6.267161596705898, + "learning_rate": 7.14396397619725e-06, + "loss": 0.9231, + "step": 22525 + }, + { + "epoch": 1.27, + "grad_norm": 15.930588218952579, + "learning_rate": 7.142483172890076e-06, + "loss": 0.9248, + "step": 22530 + }, + { + "epoch": 1.27, + "grad_norm": 6.6200028199088905, + "learning_rate": 7.141002139363627e-06, + "loss": 0.8843, + "step": 22535 + }, + { + "epoch": 1.27, + "grad_norm": 9.556827139306353, + "learning_rate": 7.139520875777051e-06, + "loss": 0.9483, + "step": 22540 + }, + { + "epoch": 1.27, + "grad_norm": 6.165119676902172, + "learning_rate": 7.138039382289513e-06, + "loss": 0.8952, + "step": 22545 + }, + { + "epoch": 1.27, + "grad_norm": 7.303111649125197, + "learning_rate": 7.136557659060211e-06, + "loss": 0.9237, + "step": 22550 + }, + { + "epoch": 1.27, + "grad_norm": 5.704846927534164, + "learning_rate": 7.135075706248355e-06, + "loss": 0.9604, + "step": 22555 + }, + { + "epoch": 1.27, + "grad_norm": 10.77677861571753, + "learning_rate": 7.133593524013194e-06, + "loss": 0.9121, + "step": 22560 + }, + { + "epoch": 1.27, + "grad_norm": 7.5412843873595525, + "learning_rate": 7.132111112513992e-06, + "loss": 0.912, + "step": 22565 + }, + { + "epoch": 1.27, + "grad_norm": 9.670663112543473, + "learning_rate": 7.130628471910041e-06, + "loss": 0.9618, + "step": 22570 + }, + { + "epoch": 1.27, + "grad_norm": 5.585732050084596, + "learning_rate": 7.129145602360657e-06, + "loss": 0.9062, + "step": 22575 + }, + { + "epoch": 1.27, + "grad_norm": 5.103107598510329, + "learning_rate": 7.127662504025183e-06, + "loss": 0.9101, + "step": 22580 + }, + { + "epoch": 1.27, + "grad_norm": 5.01943233725791, + "learning_rate": 7.126179177062981e-06, + "loss": 0.9261, + "step": 22585 + }, + { + "epoch": 1.27, + "grad_norm": 5.997821653017817, + "learning_rate": 7.124695621633444e-06, + "loss": 0.9328, + "step": 22590 + }, + { + "epoch": 1.27, + "grad_norm": 5.141342936339158, + "learning_rate": 7.123211837895985e-06, + "loss": 0.9232, + "step": 22595 + }, + { + "epoch": 1.27, + "grad_norm": 5.72750955764383, + "learning_rate": 7.121727826010043e-06, + "loss": 0.9172, + "step": 22600 + }, + { + "epoch": 1.27, + "grad_norm": 7.058597414414063, + "learning_rate": 7.1202435861350835e-06, + "loss": 0.9407, + "step": 22605 + }, + { + "epoch": 1.27, + "grad_norm": 5.845098908601672, + "learning_rate": 7.118759118430594e-06, + "loss": 0.9316, + "step": 22610 + }, + { + "epoch": 1.27, + "grad_norm": 11.768034101918666, + "learning_rate": 7.117274423056085e-06, + "loss": 0.9312, + "step": 22615 + }, + { + "epoch": 1.27, + "grad_norm": 6.769306039901317, + "learning_rate": 7.115789500171095e-06, + "loss": 0.9556, + "step": 22620 + }, + { + "epoch": 1.27, + "grad_norm": 7.0519093999596185, + "learning_rate": 7.1143043499351864e-06, + "loss": 0.8963, + "step": 22625 + }, + { + "epoch": 1.28, + "grad_norm": 5.907905854675806, + "learning_rate": 7.112818972507943e-06, + "loss": 0.9511, + "step": 22630 + }, + { + "epoch": 1.28, + "grad_norm": 14.598332059571938, + "learning_rate": 7.111333368048977e-06, + "loss": 0.9201, + "step": 22635 + }, + { + "epoch": 1.28, + "grad_norm": 5.369370519283578, + "learning_rate": 7.109847536717922e-06, + "loss": 0.881, + "step": 22640 + }, + { + "epoch": 1.28, + "grad_norm": 8.71384453100486, + "learning_rate": 7.10836147867444e-06, + "loss": 0.869, + "step": 22645 + }, + { + "epoch": 1.28, + "grad_norm": 8.675248917546769, + "learning_rate": 7.106875194078209e-06, + "loss": 0.9442, + "step": 22650 + }, + { + "epoch": 1.28, + "grad_norm": 15.930285510478638, + "learning_rate": 7.105388683088942e-06, + "loss": 0.9294, + "step": 22655 + }, + { + "epoch": 1.28, + "grad_norm": 15.24951190329833, + "learning_rate": 7.1039019458663695e-06, + "loss": 0.9477, + "step": 22660 + }, + { + "epoch": 1.28, + "grad_norm": 9.348142857270698, + "learning_rate": 7.1024149825702475e-06, + "loss": 0.8899, + "step": 22665 + }, + { + "epoch": 1.28, + "grad_norm": 15.67447699654083, + "learning_rate": 7.100927793360356e-06, + "loss": 0.9256, + "step": 22670 + }, + { + "epoch": 1.28, + "grad_norm": 14.115533987088446, + "learning_rate": 7.099440378396502e-06, + "loss": 0.9057, + "step": 22675 + }, + { + "epoch": 1.28, + "grad_norm": 14.366326363607728, + "learning_rate": 7.097952737838511e-06, + "loss": 0.9024, + "step": 22680 + }, + { + "epoch": 1.28, + "grad_norm": 9.404456823283693, + "learning_rate": 7.0964648718462426e-06, + "loss": 0.9132, + "step": 22685 + }, + { + "epoch": 1.28, + "grad_norm": 6.792306925925805, + "learning_rate": 7.09497678057957e-06, + "loss": 0.8965, + "step": 22690 + }, + { + "epoch": 1.28, + "grad_norm": 9.340332273780737, + "learning_rate": 7.093488464198397e-06, + "loss": 0.9729, + "step": 22695 + }, + { + "epoch": 1.28, + "grad_norm": 7.601090418333475, + "learning_rate": 7.091999922862649e-06, + "loss": 0.9433, + "step": 22700 + }, + { + "epoch": 1.28, + "grad_norm": 18.122373273003937, + "learning_rate": 7.090511156732277e-06, + "loss": 0.9546, + "step": 22705 + }, + { + "epoch": 1.28, + "grad_norm": 6.473023619409803, + "learning_rate": 7.089022165967254e-06, + "loss": 0.906, + "step": 22710 + }, + { + "epoch": 1.28, + "grad_norm": 18.409357024355955, + "learning_rate": 7.087532950727582e-06, + "loss": 0.8626, + "step": 22715 + }, + { + "epoch": 1.28, + "grad_norm": 25.322002531201004, + "learning_rate": 7.08604351117328e-06, + "loss": 0.9287, + "step": 22720 + }, + { + "epoch": 1.28, + "grad_norm": 23.513069694962137, + "learning_rate": 7.084553847464395e-06, + "loss": 0.885, + "step": 22725 + }, + { + "epoch": 1.28, + "grad_norm": 24.602574775502212, + "learning_rate": 7.083063959761001e-06, + "loss": 0.9411, + "step": 22730 + }, + { + "epoch": 1.28, + "grad_norm": 38.43463450946847, + "learning_rate": 7.08157384822319e-06, + "loss": 0.9622, + "step": 22735 + }, + { + "epoch": 1.28, + "grad_norm": 7.837521503763455, + "learning_rate": 7.0800835130110834e-06, + "loss": 0.9552, + "step": 22740 + }, + { + "epoch": 1.28, + "grad_norm": 19.27927981728115, + "learning_rate": 7.0785929542848215e-06, + "loss": 0.9136, + "step": 22745 + }, + { + "epoch": 1.28, + "grad_norm": 9.121110550348236, + "learning_rate": 7.0771021722045765e-06, + "loss": 0.8846, + "step": 22750 + }, + { + "epoch": 1.28, + "grad_norm": 37.53688475301775, + "learning_rate": 7.075611166930534e-06, + "loss": 0.9124, + "step": 22755 + }, + { + "epoch": 1.28, + "grad_norm": 18.175021177370883, + "learning_rate": 7.074119938622912e-06, + "loss": 0.9364, + "step": 22760 + }, + { + "epoch": 1.28, + "grad_norm": 20.262302868378036, + "learning_rate": 7.072628487441948e-06, + "loss": 0.9637, + "step": 22765 + }, + { + "epoch": 1.28, + "grad_norm": 14.775707461774358, + "learning_rate": 7.071136813547905e-06, + "loss": 0.9108, + "step": 22770 + }, + { + "epoch": 1.28, + "grad_norm": 9.753441719972129, + "learning_rate": 7.069644917101071e-06, + "loss": 0.8669, + "step": 22775 + }, + { + "epoch": 1.28, + "grad_norm": 8.98038321878649, + "learning_rate": 7.068152798261758e-06, + "loss": 0.943, + "step": 22780 + }, + { + "epoch": 1.28, + "grad_norm": 8.688091548978663, + "learning_rate": 7.0666604571903e-06, + "loss": 0.9335, + "step": 22785 + }, + { + "epoch": 1.28, + "grad_norm": 5.089918928021166, + "learning_rate": 7.065167894047054e-06, + "loss": 0.9484, + "step": 22790 + }, + { + "epoch": 1.28, + "grad_norm": 10.118865353599768, + "learning_rate": 7.063675108992403e-06, + "loss": 0.9033, + "step": 22795 + }, + { + "epoch": 1.28, + "grad_norm": 16.963601352405306, + "learning_rate": 7.062182102186753e-06, + "loss": 0.9264, + "step": 22800 + }, + { + "epoch": 1.28, + "grad_norm": 7.1220859970529915, + "learning_rate": 7.0606888737905365e-06, + "loss": 0.8986, + "step": 22805 + }, + { + "epoch": 1.29, + "grad_norm": 5.487584795061593, + "learning_rate": 7.0591954239642065e-06, + "loss": 0.9296, + "step": 22810 + }, + { + "epoch": 1.29, + "grad_norm": 9.99782311670588, + "learning_rate": 7.057701752868238e-06, + "loss": 0.8505, + "step": 22815 + }, + { + "epoch": 1.29, + "grad_norm": 11.579620852800494, + "learning_rate": 7.056207860663136e-06, + "loss": 0.8783, + "step": 22820 + }, + { + "epoch": 1.29, + "grad_norm": 10.296017321736796, + "learning_rate": 7.0547137475094255e-06, + "loss": 0.9371, + "step": 22825 + }, + { + "epoch": 1.29, + "grad_norm": 5.960629404843116, + "learning_rate": 7.053219413567653e-06, + "loss": 0.9411, + "step": 22830 + }, + { + "epoch": 1.29, + "grad_norm": 18.384737208117784, + "learning_rate": 7.051724858998395e-06, + "loss": 0.9514, + "step": 22835 + }, + { + "epoch": 1.29, + "grad_norm": 12.022686875452175, + "learning_rate": 7.050230083962245e-06, + "loss": 0.879, + "step": 22840 + }, + { + "epoch": 1.29, + "grad_norm": 5.3519032214357996, + "learning_rate": 7.048735088619825e-06, + "loss": 0.8871, + "step": 22845 + }, + { + "epoch": 1.29, + "grad_norm": 21.28589690201938, + "learning_rate": 7.047239873131777e-06, + "loss": 0.8918, + "step": 22850 + }, + { + "epoch": 1.29, + "grad_norm": 11.823643019813286, + "learning_rate": 7.045744437658769e-06, + "loss": 0.9036, + "step": 22855 + }, + { + "epoch": 1.29, + "grad_norm": 8.923494877479909, + "learning_rate": 7.044248782361494e-06, + "loss": 0.8931, + "step": 22860 + }, + { + "epoch": 1.29, + "grad_norm": 4.833656736894615, + "learning_rate": 7.042752907400663e-06, + "loss": 0.8937, + "step": 22865 + }, + { + "epoch": 1.29, + "grad_norm": 5.326339595268569, + "learning_rate": 7.041256812937019e-06, + "loss": 0.9264, + "step": 22870 + }, + { + "epoch": 1.29, + "grad_norm": 8.170266178527214, + "learning_rate": 7.0397604991313205e-06, + "loss": 0.8944, + "step": 22875 + }, + { + "epoch": 1.29, + "grad_norm": 6.544711546545316, + "learning_rate": 7.038263966144354e-06, + "loss": 0.8782, + "step": 22880 + }, + { + "epoch": 1.29, + "grad_norm": 5.8314676607732245, + "learning_rate": 7.0367672141369305e-06, + "loss": 0.9347, + "step": 22885 + }, + { + "epoch": 1.29, + "grad_norm": 8.92429482550247, + "learning_rate": 7.035270243269879e-06, + "loss": 0.9339, + "step": 22890 + }, + { + "epoch": 1.29, + "grad_norm": 9.94715772583652, + "learning_rate": 7.03377305370406e-06, + "loss": 0.9287, + "step": 22895 + }, + { + "epoch": 1.29, + "grad_norm": 6.271521713882415, + "learning_rate": 7.032275645600348e-06, + "loss": 0.8781, + "step": 22900 + }, + { + "epoch": 1.29, + "grad_norm": 11.115409480836378, + "learning_rate": 7.030778019119651e-06, + "loss": 0.8808, + "step": 22905 + }, + { + "epoch": 1.29, + "grad_norm": 5.607515014167464, + "learning_rate": 7.02928017442289e-06, + "loss": 0.9299, + "step": 22910 + }, + { + "epoch": 1.29, + "grad_norm": 5.112423378133695, + "learning_rate": 7.027782111671021e-06, + "loss": 0.9115, + "step": 22915 + }, + { + "epoch": 1.29, + "grad_norm": 10.803563609203383, + "learning_rate": 7.026283831025014e-06, + "loss": 0.931, + "step": 22920 + }, + { + "epoch": 1.29, + "grad_norm": 5.6766259783063875, + "learning_rate": 7.024785332645868e-06, + "loss": 0.9264, + "step": 22925 + }, + { + "epoch": 1.29, + "grad_norm": 6.894090944961063, + "learning_rate": 7.023286616694601e-06, + "loss": 0.8574, + "step": 22930 + }, + { + "epoch": 1.29, + "grad_norm": 7.4690923806329605, + "learning_rate": 7.021787683332258e-06, + "loss": 0.8889, + "step": 22935 + }, + { + "epoch": 1.29, + "grad_norm": 6.252728078940986, + "learning_rate": 7.020288532719904e-06, + "loss": 0.8472, + "step": 22940 + }, + { + "epoch": 1.29, + "grad_norm": 26.9965026541785, + "learning_rate": 7.018789165018632e-06, + "loss": 0.9592, + "step": 22945 + }, + { + "epoch": 1.29, + "grad_norm": 11.968393944045484, + "learning_rate": 7.017289580389556e-06, + "loss": 0.9045, + "step": 22950 + }, + { + "epoch": 1.29, + "grad_norm": 26.179560119848876, + "learning_rate": 7.015789778993809e-06, + "loss": 0.9276, + "step": 22955 + }, + { + "epoch": 1.29, + "grad_norm": 4.937481508419991, + "learning_rate": 7.014289760992557e-06, + "loss": 0.9017, + "step": 22960 + }, + { + "epoch": 1.29, + "grad_norm": 10.451652369480728, + "learning_rate": 7.012789526546977e-06, + "loss": 0.9887, + "step": 22965 + }, + { + "epoch": 1.29, + "grad_norm": 15.803117245864987, + "learning_rate": 7.011289075818282e-06, + "loss": 0.9478, + "step": 22970 + }, + { + "epoch": 1.29, + "grad_norm": 6.2883053599456105, + "learning_rate": 7.009788408967697e-06, + "loss": 0.9335, + "step": 22975 + }, + { + "epoch": 1.29, + "grad_norm": 23.497441395526586, + "learning_rate": 7.00828752615648e-06, + "loss": 0.8702, + "step": 22980 + }, + { + "epoch": 1.3, + "grad_norm": 17.855532617397643, + "learning_rate": 7.006786427545904e-06, + "loss": 0.9082, + "step": 22985 + }, + { + "epoch": 1.3, + "grad_norm": 5.048931952110115, + "learning_rate": 7.00528511329727e-06, + "loss": 0.9059, + "step": 22990 + }, + { + "epoch": 1.3, + "grad_norm": 8.66372780157301, + "learning_rate": 7.003783583571899e-06, + "loss": 0.9761, + "step": 22995 + }, + { + "epoch": 1.3, + "grad_norm": 8.552353020347688, + "learning_rate": 7.00228183853114e-06, + "loss": 0.9035, + "step": 23000 + }, + { + "epoch": 1.3, + "grad_norm": 24.534899451912995, + "learning_rate": 7.00077987833636e-06, + "loss": 0.9455, + "step": 23005 + }, + { + "epoch": 1.3, + "grad_norm": 6.467100650492112, + "learning_rate": 6.9992777031489535e-06, + "loss": 0.9418, + "step": 23010 + }, + { + "epoch": 1.3, + "grad_norm": 16.698976620601943, + "learning_rate": 6.997775313130333e-06, + "loss": 0.8872, + "step": 23015 + }, + { + "epoch": 1.3, + "grad_norm": 12.832633983636459, + "learning_rate": 6.996272708441938e-06, + "loss": 0.9282, + "step": 23020 + }, + { + "epoch": 1.3, + "grad_norm": 12.352439488233575, + "learning_rate": 6.994769889245231e-06, + "loss": 0.8995, + "step": 23025 + }, + { + "epoch": 1.3, + "grad_norm": 16.48147182731211, + "learning_rate": 6.993266855701695e-06, + "loss": 0.892, + "step": 23030 + }, + { + "epoch": 1.3, + "grad_norm": 18.849754403821247, + "learning_rate": 6.9917636079728406e-06, + "loss": 0.9528, + "step": 23035 + }, + { + "epoch": 1.3, + "grad_norm": 8.651359787938244, + "learning_rate": 6.990260146220194e-06, + "loss": 0.8351, + "step": 23040 + }, + { + "epoch": 1.3, + "grad_norm": 6.370217410537741, + "learning_rate": 6.9887564706053115e-06, + "loss": 0.9426, + "step": 23045 + }, + { + "epoch": 1.3, + "grad_norm": 6.70394133087629, + "learning_rate": 6.987252581289768e-06, + "loss": 0.9055, + "step": 23050 + }, + { + "epoch": 1.3, + "grad_norm": 27.32885678822093, + "learning_rate": 6.9857484784351655e-06, + "loss": 0.8945, + "step": 23055 + }, + { + "epoch": 1.3, + "grad_norm": 17.621572619727342, + "learning_rate": 6.984244162203125e-06, + "loss": 0.8696, + "step": 23060 + }, + { + "epoch": 1.3, + "grad_norm": 14.562334679762717, + "learning_rate": 6.982739632755293e-06, + "loss": 0.938, + "step": 23065 + }, + { + "epoch": 1.3, + "grad_norm": 6.061149977987767, + "learning_rate": 6.981234890253337e-06, + "loss": 0.9163, + "step": 23070 + }, + { + "epoch": 1.3, + "grad_norm": 7.873704539780729, + "learning_rate": 6.979729934858948e-06, + "loss": 0.8843, + "step": 23075 + }, + { + "epoch": 1.3, + "grad_norm": 5.7028884439878444, + "learning_rate": 6.978224766733839e-06, + "loss": 0.8695, + "step": 23080 + }, + { + "epoch": 1.3, + "grad_norm": 5.88299343636864, + "learning_rate": 6.976719386039749e-06, + "loss": 0.9143, + "step": 23085 + }, + { + "epoch": 1.3, + "grad_norm": 19.677385200318884, + "learning_rate": 6.975213792938437e-06, + "loss": 0.9459, + "step": 23090 + }, + { + "epoch": 1.3, + "grad_norm": 15.283570981343427, + "learning_rate": 6.973707987591685e-06, + "loss": 0.8779, + "step": 23095 + }, + { + "epoch": 1.3, + "grad_norm": 5.7902067354862, + "learning_rate": 6.9722019701613e-06, + "loss": 0.9108, + "step": 23100 + }, + { + "epoch": 1.3, + "grad_norm": 10.374454764423398, + "learning_rate": 6.970695740809109e-06, + "loss": 0.918, + "step": 23105 + }, + { + "epoch": 1.3, + "grad_norm": 40.44961824942354, + "learning_rate": 6.969189299696964e-06, + "loss": 0.939, + "step": 23110 + }, + { + "epoch": 1.3, + "grad_norm": 25.486534692469263, + "learning_rate": 6.967682646986737e-06, + "loss": 0.8774, + "step": 23115 + }, + { + "epoch": 1.3, + "grad_norm": 6.439612198161807, + "learning_rate": 6.966175782840326e-06, + "loss": 0.8995, + "step": 23120 + }, + { + "epoch": 1.3, + "grad_norm": 9.447008413712133, + "learning_rate": 6.96466870741965e-06, + "loss": 0.9588, + "step": 23125 + }, + { + "epoch": 1.3, + "grad_norm": 7.018192043437732, + "learning_rate": 6.96316142088665e-06, + "loss": 0.8905, + "step": 23130 + }, + { + "epoch": 1.3, + "grad_norm": 15.739310783834362, + "learning_rate": 6.961653923403292e-06, + "loss": 0.8888, + "step": 23135 + }, + { + "epoch": 1.3, + "grad_norm": 6.560477693119511, + "learning_rate": 6.960146215131561e-06, + "loss": 0.9398, + "step": 23140 + }, + { + "epoch": 1.3, + "grad_norm": 7.404104210076437, + "learning_rate": 6.95863829623347e-06, + "loss": 0.9036, + "step": 23145 + }, + { + "epoch": 1.3, + "grad_norm": 7.404842343876951, + "learning_rate": 6.95713016687105e-06, + "loss": 0.9098, + "step": 23150 + }, + { + "epoch": 1.3, + "grad_norm": 9.287855837477224, + "learning_rate": 6.9556218272063555e-06, + "loss": 0.8953, + "step": 23155 + }, + { + "epoch": 1.3, + "grad_norm": 14.665724258424726, + "learning_rate": 6.9541132774014655e-06, + "loss": 0.939, + "step": 23160 + }, + { + "epoch": 1.31, + "grad_norm": 15.411905802724334, + "learning_rate": 6.952604517618478e-06, + "loss": 0.8758, + "step": 23165 + }, + { + "epoch": 1.31, + "grad_norm": 10.087428296554743, + "learning_rate": 6.95109554801952e-06, + "loss": 0.8978, + "step": 23170 + }, + { + "epoch": 1.31, + "grad_norm": 17.981740311835065, + "learning_rate": 6.949586368766732e-06, + "loss": 0.9031, + "step": 23175 + }, + { + "epoch": 1.31, + "grad_norm": 8.693184545791876, + "learning_rate": 6.948076980022286e-06, + "loss": 0.8938, + "step": 23180 + }, + { + "epoch": 1.31, + "grad_norm": 11.721662877574692, + "learning_rate": 6.94656738194837e-06, + "loss": 0.882, + "step": 23185 + }, + { + "epoch": 1.31, + "grad_norm": 19.52912853443083, + "learning_rate": 6.945057574707199e-06, + "loss": 0.9026, + "step": 23190 + }, + { + "epoch": 1.31, + "grad_norm": 41.80736345299809, + "learning_rate": 6.943547558461007e-06, + "loss": 0.8934, + "step": 23195 + }, + { + "epoch": 1.31, + "grad_norm": 8.204201458994314, + "learning_rate": 6.942037333372051e-06, + "loss": 0.9153, + "step": 23200 + }, + { + "epoch": 1.31, + "grad_norm": 12.818834373576317, + "learning_rate": 6.940526899602616e-06, + "loss": 0.9235, + "step": 23205 + }, + { + "epoch": 1.31, + "grad_norm": 13.72989382073166, + "learning_rate": 6.939016257314999e-06, + "loss": 0.8852, + "step": 23210 + }, + { + "epoch": 1.31, + "grad_norm": 5.944495771991438, + "learning_rate": 6.937505406671529e-06, + "loss": 0.932, + "step": 23215 + }, + { + "epoch": 1.31, + "grad_norm": 12.85102067979384, + "learning_rate": 6.93599434783455e-06, + "loss": 0.9483, + "step": 23220 + }, + { + "epoch": 1.31, + "grad_norm": 11.551513722779058, + "learning_rate": 6.934483080966436e-06, + "loss": 0.8869, + "step": 23225 + }, + { + "epoch": 1.31, + "grad_norm": 8.127174041000742, + "learning_rate": 6.932971606229577e-06, + "loss": 0.9309, + "step": 23230 + }, + { + "epoch": 1.31, + "grad_norm": 8.26645524750772, + "learning_rate": 6.931459923786388e-06, + "loss": 0.9358, + "step": 23235 + }, + { + "epoch": 1.31, + "grad_norm": 11.405720641063699, + "learning_rate": 6.929948033799306e-06, + "loss": 0.9149, + "step": 23240 + }, + { + "epoch": 1.31, + "grad_norm": 11.41260620788048, + "learning_rate": 6.928435936430789e-06, + "loss": 0.9381, + "step": 23245 + }, + { + "epoch": 1.31, + "grad_norm": 8.027273959987731, + "learning_rate": 6.926923631843321e-06, + "loss": 0.9639, + "step": 23250 + }, + { + "epoch": 1.31, + "grad_norm": 6.816868218385879, + "learning_rate": 6.925411120199405e-06, + "loss": 0.8901, + "step": 23255 + }, + { + "epoch": 1.31, + "grad_norm": 6.012856734903804, + "learning_rate": 6.923898401661565e-06, + "loss": 0.8995, + "step": 23260 + }, + { + "epoch": 1.31, + "grad_norm": 5.684332794187576, + "learning_rate": 6.9223854763923535e-06, + "loss": 0.895, + "step": 23265 + }, + { + "epoch": 1.31, + "grad_norm": 9.935038240368268, + "learning_rate": 6.920872344554337e-06, + "loss": 0.8969, + "step": 23270 + }, + { + "epoch": 1.31, + "grad_norm": 11.931894701975166, + "learning_rate": 6.919359006310109e-06, + "loss": 0.9714, + "step": 23275 + }, + { + "epoch": 1.31, + "grad_norm": 8.630957657303016, + "learning_rate": 6.917845461822286e-06, + "loss": 0.9567, + "step": 23280 + }, + { + "epoch": 1.31, + "grad_norm": 9.424482875536688, + "learning_rate": 6.916331711253502e-06, + "loss": 0.9343, + "step": 23285 + }, + { + "epoch": 1.31, + "grad_norm": 6.557035393427002, + "learning_rate": 6.914817754766421e-06, + "loss": 0.915, + "step": 23290 + }, + { + "epoch": 1.31, + "grad_norm": 6.670466170417317, + "learning_rate": 6.913303592523721e-06, + "loss": 0.8685, + "step": 23295 + }, + { + "epoch": 1.31, + "grad_norm": 8.83666483915853, + "learning_rate": 6.9117892246881045e-06, + "loss": 0.8855, + "step": 23300 + }, + { + "epoch": 1.31, + "grad_norm": 4.782540122639701, + "learning_rate": 6.9102746514222995e-06, + "loss": 0.9422, + "step": 23305 + }, + { + "epoch": 1.31, + "grad_norm": 5.961461722029677, + "learning_rate": 6.9087598728890525e-06, + "loss": 0.9122, + "step": 23310 + }, + { + "epoch": 1.31, + "grad_norm": 5.241198898764172, + "learning_rate": 6.907244889251132e-06, + "loss": 0.9071, + "step": 23315 + }, + { + "epoch": 1.31, + "grad_norm": 8.891236683405234, + "learning_rate": 6.905729700671332e-06, + "loss": 0.8945, + "step": 23320 + }, + { + "epoch": 1.31, + "grad_norm": 17.243215176915836, + "learning_rate": 6.904214307312465e-06, + "loss": 0.9009, + "step": 23325 + }, + { + "epoch": 1.31, + "grad_norm": 12.069419232486801, + "learning_rate": 6.902698709337367e-06, + "loss": 0.93, + "step": 23330 + }, + { + "epoch": 1.31, + "grad_norm": 7.738426661549367, + "learning_rate": 6.901182906908894e-06, + "loss": 0.896, + "step": 23335 + }, + { + "epoch": 1.32, + "grad_norm": 5.792474467053207, + "learning_rate": 6.8996669001899285e-06, + "loss": 0.8695, + "step": 23340 + }, + { + "epoch": 1.32, + "grad_norm": 6.660481984924338, + "learning_rate": 6.898150689343371e-06, + "loss": 0.9335, + "step": 23345 + }, + { + "epoch": 1.32, + "grad_norm": 5.224931352082289, + "learning_rate": 6.896634274532143e-06, + "loss": 0.9077, + "step": 23350 + }, + { + "epoch": 1.32, + "grad_norm": 10.88243518031466, + "learning_rate": 6.895117655919194e-06, + "loss": 0.9098, + "step": 23355 + }, + { + "epoch": 1.32, + "grad_norm": 5.746063824332506, + "learning_rate": 6.893600833667488e-06, + "loss": 0.8758, + "step": 23360 + }, + { + "epoch": 1.32, + "grad_norm": 5.598247819320011, + "learning_rate": 6.892083807940014e-06, + "loss": 0.9587, + "step": 23365 + }, + { + "epoch": 1.32, + "grad_norm": 6.9956490645739855, + "learning_rate": 6.890566578899785e-06, + "loss": 0.9347, + "step": 23370 + }, + { + "epoch": 1.32, + "grad_norm": 14.814640034413959, + "learning_rate": 6.8890491467098346e-06, + "loss": 0.8782, + "step": 23375 + }, + { + "epoch": 1.32, + "grad_norm": 5.075841787547497, + "learning_rate": 6.887531511533217e-06, + "loss": 0.9052, + "step": 23380 + }, + { + "epoch": 1.32, + "grad_norm": 5.795366425007409, + "learning_rate": 6.886013673533005e-06, + "loss": 0.8573, + "step": 23385 + }, + { + "epoch": 1.32, + "grad_norm": 8.231542374109674, + "learning_rate": 6.8844956328723035e-06, + "loss": 0.9334, + "step": 23390 + }, + { + "epoch": 1.32, + "grad_norm": 23.192404416044255, + "learning_rate": 6.8829773897142275e-06, + "loss": 0.907, + "step": 23395 + }, + { + "epoch": 1.32, + "grad_norm": 17.393423054625753, + "learning_rate": 6.88145894422192e-06, + "loss": 0.9124, + "step": 23400 + }, + { + "epoch": 1.32, + "grad_norm": 6.43558683335424, + "learning_rate": 6.879940296558547e-06, + "loss": 0.8925, + "step": 23405 + }, + { + "epoch": 1.32, + "grad_norm": 8.044044488230632, + "learning_rate": 6.878421446887293e-06, + "loss": 0.9474, + "step": 23410 + }, + { + "epoch": 1.32, + "grad_norm": 6.177135488545063, + "learning_rate": 6.8769023953713635e-06, + "loss": 0.9493, + "step": 23415 + }, + { + "epoch": 1.32, + "grad_norm": 5.354104539621598, + "learning_rate": 6.875383142173987e-06, + "loss": 0.9003, + "step": 23420 + }, + { + "epoch": 1.32, + "grad_norm": 18.2051141430026, + "learning_rate": 6.873863687458416e-06, + "loss": 0.8885, + "step": 23425 + }, + { + "epoch": 1.32, + "grad_norm": 7.121902676775598, + "learning_rate": 6.872344031387922e-06, + "loss": 0.9174, + "step": 23430 + }, + { + "epoch": 1.32, + "grad_norm": 5.5423910468005415, + "learning_rate": 6.8708241741258e-06, + "loss": 0.865, + "step": 23435 + }, + { + "epoch": 1.32, + "grad_norm": 6.002572637031899, + "learning_rate": 6.869304115835363e-06, + "loss": 0.907, + "step": 23440 + }, + { + "epoch": 1.32, + "grad_norm": 25.44987537229217, + "learning_rate": 6.867783856679947e-06, + "loss": 0.9445, + "step": 23445 + }, + { + "epoch": 1.32, + "grad_norm": 16.73885257888077, + "learning_rate": 6.866263396822914e-06, + "loss": 0.9328, + "step": 23450 + }, + { + "epoch": 1.32, + "grad_norm": 14.086310910129086, + "learning_rate": 6.864742736427643e-06, + "loss": 0.8993, + "step": 23455 + }, + { + "epoch": 1.32, + "grad_norm": 14.437841497147433, + "learning_rate": 6.863221875657534e-06, + "loss": 0.9174, + "step": 23460 + }, + { + "epoch": 1.32, + "grad_norm": 13.703303964336197, + "learning_rate": 6.861700814676012e-06, + "loss": 0.8807, + "step": 23465 + }, + { + "epoch": 1.32, + "grad_norm": 14.506898203069278, + "learning_rate": 6.860179553646521e-06, + "loss": 0.9135, + "step": 23470 + }, + { + "epoch": 1.32, + "grad_norm": 28.302665131505723, + "learning_rate": 6.858658092732528e-06, + "loss": 0.892, + "step": 23475 + }, + { + "epoch": 1.32, + "grad_norm": 10.875087356710619, + "learning_rate": 6.85713643209752e-06, + "loss": 0.9282, + "step": 23480 + }, + { + "epoch": 1.32, + "grad_norm": 8.691872392237835, + "learning_rate": 6.855614571905005e-06, + "loss": 0.9049, + "step": 23485 + }, + { + "epoch": 1.32, + "grad_norm": 9.514972680015523, + "learning_rate": 6.854092512318515e-06, + "loss": 0.8664, + "step": 23490 + }, + { + "epoch": 1.32, + "grad_norm": 15.619277007034693, + "learning_rate": 6.852570253501603e-06, + "loss": 0.9035, + "step": 23495 + }, + { + "epoch": 1.32, + "grad_norm": 14.62043661242379, + "learning_rate": 6.851047795617841e-06, + "loss": 0.8775, + "step": 23500 + }, + { + "epoch": 1.32, + "grad_norm": 6.446867241351837, + "learning_rate": 6.849525138830824e-06, + "loss": 0.865, + "step": 23505 + }, + { + "epoch": 1.32, + "grad_norm": 9.438148316920962, + "learning_rate": 6.848002283304167e-06, + "loss": 0.9548, + "step": 23510 + }, + { + "epoch": 1.32, + "grad_norm": 11.359072627366345, + "learning_rate": 6.84647922920151e-06, + "loss": 0.9274, + "step": 23515 + }, + { + "epoch": 1.33, + "grad_norm": 13.989361218067277, + "learning_rate": 6.844955976686512e-06, + "loss": 0.9021, + "step": 23520 + }, + { + "epoch": 1.33, + "grad_norm": 13.230364599716836, + "learning_rate": 6.84343252592285e-06, + "loss": 0.8948, + "step": 23525 + }, + { + "epoch": 1.33, + "grad_norm": 15.98758940404333, + "learning_rate": 6.841908877074227e-06, + "loss": 0.9016, + "step": 23530 + }, + { + "epoch": 1.33, + "grad_norm": 27.255757499231205, + "learning_rate": 6.840385030304368e-06, + "loss": 0.8917, + "step": 23535 + }, + { + "epoch": 1.33, + "grad_norm": 16.143384001163476, + "learning_rate": 6.838860985777014e-06, + "loss": 0.9043, + "step": 23540 + }, + { + "epoch": 1.33, + "grad_norm": 12.476389444237618, + "learning_rate": 6.837336743655931e-06, + "loss": 0.9477, + "step": 23545 + }, + { + "epoch": 1.33, + "grad_norm": 9.800654232457841, + "learning_rate": 6.835812304104907e-06, + "loss": 0.8902, + "step": 23550 + }, + { + "epoch": 1.33, + "grad_norm": 5.544692458361583, + "learning_rate": 6.834287667287749e-06, + "loss": 0.9201, + "step": 23555 + }, + { + "epoch": 1.33, + "grad_norm": 5.319286052191164, + "learning_rate": 6.832762833368285e-06, + "loss": 0.928, + "step": 23560 + }, + { + "epoch": 1.33, + "grad_norm": 7.0828580472573535, + "learning_rate": 6.831237802510367e-06, + "loss": 0.9598, + "step": 23565 + }, + { + "epoch": 1.33, + "grad_norm": 10.409833697299415, + "learning_rate": 6.829712574877865e-06, + "loss": 0.9373, + "step": 23570 + }, + { + "epoch": 1.33, + "grad_norm": 11.706241323475199, + "learning_rate": 6.828187150634671e-06, + "loss": 0.9079, + "step": 23575 + }, + { + "epoch": 1.33, + "grad_norm": 8.403233187821114, + "learning_rate": 6.8266615299447e-06, + "loss": 0.8927, + "step": 23580 + }, + { + "epoch": 1.33, + "grad_norm": 4.851444257744309, + "learning_rate": 6.8251357129718865e-06, + "loss": 0.8721, + "step": 23585 + }, + { + "epoch": 1.33, + "grad_norm": 16.0422070223834, + "learning_rate": 6.823609699880185e-06, + "loss": 0.9054, + "step": 23590 + }, + { + "epoch": 1.33, + "grad_norm": 6.247800976842763, + "learning_rate": 6.822083490833572e-06, + "loss": 0.8728, + "step": 23595 + }, + { + "epoch": 1.33, + "grad_norm": 55.45705994144435, + "learning_rate": 6.820557085996047e-06, + "loss": 0.9218, + "step": 23600 + }, + { + "epoch": 1.33, + "grad_norm": 17.18753143594485, + "learning_rate": 6.819030485531629e-06, + "loss": 0.9119, + "step": 23605 + }, + { + "epoch": 1.33, + "grad_norm": 9.866445006459287, + "learning_rate": 6.817503689604358e-06, + "loss": 0.882, + "step": 23610 + }, + { + "epoch": 1.33, + "grad_norm": 25.334152410579883, + "learning_rate": 6.815976698378293e-06, + "loss": 0.8768, + "step": 23615 + }, + { + "epoch": 1.33, + "grad_norm": 7.220031067608439, + "learning_rate": 6.814449512017517e-06, + "loss": 0.8812, + "step": 23620 + }, + { + "epoch": 1.33, + "grad_norm": 14.619646587220572, + "learning_rate": 6.812922130686134e-06, + "loss": 0.9145, + "step": 23625 + }, + { + "epoch": 1.33, + "grad_norm": 5.900802967511703, + "learning_rate": 6.8113945545482675e-06, + "loss": 0.8857, + "step": 23630 + }, + { + "epoch": 1.33, + "grad_norm": 5.076090197548098, + "learning_rate": 6.809866783768061e-06, + "loss": 0.9291, + "step": 23635 + }, + { + "epoch": 1.33, + "grad_norm": 7.410453301923219, + "learning_rate": 6.808338818509681e-06, + "loss": 0.9378, + "step": 23640 + }, + { + "epoch": 1.33, + "grad_norm": 4.998593463902603, + "learning_rate": 6.806810658937314e-06, + "loss": 0.9021, + "step": 23645 + }, + { + "epoch": 1.33, + "grad_norm": 4.890474846801583, + "learning_rate": 6.805282305215167e-06, + "loss": 0.9017, + "step": 23650 + }, + { + "epoch": 1.33, + "grad_norm": 12.055329875681373, + "learning_rate": 6.803753757507471e-06, + "loss": 0.8884, + "step": 23655 + }, + { + "epoch": 1.33, + "grad_norm": 15.75440217956922, + "learning_rate": 6.802225015978472e-06, + "loss": 0.8596, + "step": 23660 + }, + { + "epoch": 1.33, + "grad_norm": 12.427630934113202, + "learning_rate": 6.800696080792442e-06, + "loss": 0.8856, + "step": 23665 + }, + { + "epoch": 1.33, + "grad_norm": 5.886267425253317, + "learning_rate": 6.79916695211367e-06, + "loss": 0.9021, + "step": 23670 + }, + { + "epoch": 1.33, + "grad_norm": 5.015285032522612, + "learning_rate": 6.797637630106468e-06, + "loss": 0.937, + "step": 23675 + }, + { + "epoch": 1.33, + "grad_norm": 7.529029572300798, + "learning_rate": 6.796108114935171e-06, + "loss": 0.9225, + "step": 23680 + }, + { + "epoch": 1.33, + "grad_norm": 5.32900836607524, + "learning_rate": 6.7945784067641295e-06, + "loss": 0.8797, + "step": 23685 + }, + { + "epoch": 1.33, + "grad_norm": 5.687494127214395, + "learning_rate": 6.793048505757718e-06, + "loss": 0.9021, + "step": 23690 + }, + { + "epoch": 1.34, + "grad_norm": 7.758744075524426, + "learning_rate": 6.791518412080331e-06, + "loss": 0.9482, + "step": 23695 + }, + { + "epoch": 1.34, + "grad_norm": 13.61880923103712, + "learning_rate": 6.789988125896384e-06, + "loss": 0.9058, + "step": 23700 + }, + { + "epoch": 1.34, + "grad_norm": 14.022768199244858, + "learning_rate": 6.788457647370313e-06, + "loss": 0.8848, + "step": 23705 + }, + { + "epoch": 1.34, + "grad_norm": 7.060705404701003, + "learning_rate": 6.786926976666575e-06, + "loss": 0.8892, + "step": 23710 + }, + { + "epoch": 1.34, + "grad_norm": 18.479713213199258, + "learning_rate": 6.785396113949647e-06, + "loss": 0.9402, + "step": 23715 + }, + { + "epoch": 1.34, + "grad_norm": 5.928263873223764, + "learning_rate": 6.783865059384029e-06, + "loss": 0.8923, + "step": 23720 + }, + { + "epoch": 1.34, + "grad_norm": 9.566092359449536, + "learning_rate": 6.782333813134234e-06, + "loss": 0.9144, + "step": 23725 + }, + { + "epoch": 1.34, + "grad_norm": 18.25578266659643, + "learning_rate": 6.780802375364806e-06, + "loss": 0.9016, + "step": 23730 + }, + { + "epoch": 1.34, + "grad_norm": 9.726431794649871, + "learning_rate": 6.779270746240303e-06, + "loss": 0.9566, + "step": 23735 + }, + { + "epoch": 1.34, + "grad_norm": 5.915722491732196, + "learning_rate": 6.777738925925306e-06, + "loss": 0.8913, + "step": 23740 + }, + { + "epoch": 1.34, + "grad_norm": 15.003518216733056, + "learning_rate": 6.776206914584418e-06, + "loss": 0.8772, + "step": 23745 + }, + { + "epoch": 1.34, + "grad_norm": 6.632009317832925, + "learning_rate": 6.774674712382256e-06, + "loss": 0.9134, + "step": 23750 + }, + { + "epoch": 1.34, + "grad_norm": 6.188689870427255, + "learning_rate": 6.773142319483465e-06, + "loss": 0.9276, + "step": 23755 + }, + { + "epoch": 1.34, + "grad_norm": 6.402133030778085, + "learning_rate": 6.771609736052704e-06, + "loss": 0.9102, + "step": 23760 + }, + { + "epoch": 1.34, + "grad_norm": 5.9249351411386435, + "learning_rate": 6.770076962254661e-06, + "loss": 0.9158, + "step": 23765 + }, + { + "epoch": 1.34, + "grad_norm": 5.750023153531884, + "learning_rate": 6.768543998254035e-06, + "loss": 0.8807, + "step": 23770 + }, + { + "epoch": 1.34, + "grad_norm": 4.909869868765495, + "learning_rate": 6.767010844215551e-06, + "loss": 0.884, + "step": 23775 + }, + { + "epoch": 1.34, + "grad_norm": 5.326120300976482, + "learning_rate": 6.765477500303954e-06, + "loss": 0.8567, + "step": 23780 + }, + { + "epoch": 1.34, + "grad_norm": 11.578031638349115, + "learning_rate": 6.763943966684008e-06, + "loss": 0.8706, + "step": 23785 + }, + { + "epoch": 1.34, + "grad_norm": 8.261813353698697, + "learning_rate": 6.7624102435204965e-06, + "loss": 0.8838, + "step": 23790 + }, + { + "epoch": 1.34, + "grad_norm": 29.741810587824805, + "learning_rate": 6.760876330978228e-06, + "loss": 0.8639, + "step": 23795 + }, + { + "epoch": 1.34, + "grad_norm": 16.270226875512073, + "learning_rate": 6.7593422292220255e-06, + "loss": 0.9144, + "step": 23800 + }, + { + "epoch": 1.34, + "grad_norm": 13.448881661424943, + "learning_rate": 6.7578079384167365e-06, + "loss": 0.896, + "step": 23805 + }, + { + "epoch": 1.34, + "grad_norm": 21.202110394989795, + "learning_rate": 6.756273458727226e-06, + "loss": 0.916, + "step": 23810 + }, + { + "epoch": 1.34, + "grad_norm": 12.903833267320922, + "learning_rate": 6.754738790318383e-06, + "loss": 0.8663, + "step": 23815 + }, + { + "epoch": 1.34, + "grad_norm": 6.391739423170614, + "learning_rate": 6.75320393335511e-06, + "loss": 0.8905, + "step": 23820 + }, + { + "epoch": 1.34, + "grad_norm": 6.077050977707632, + "learning_rate": 6.7516688880023385e-06, + "loss": 0.9332, + "step": 23825 + }, + { + "epoch": 1.34, + "grad_norm": 23.743628173305318, + "learning_rate": 6.750133654425013e-06, + "loss": 0.9161, + "step": 23830 + }, + { + "epoch": 1.34, + "grad_norm": 16.745657377829414, + "learning_rate": 6.748598232788104e-06, + "loss": 0.8957, + "step": 23835 + }, + { + "epoch": 1.34, + "grad_norm": 8.446130902764844, + "learning_rate": 6.747062623256597e-06, + "loss": 0.8879, + "step": 23840 + }, + { + "epoch": 1.34, + "grad_norm": 23.66458250447321, + "learning_rate": 6.745526825995501e-06, + "loss": 0.9368, + "step": 23845 + }, + { + "epoch": 1.34, + "grad_norm": 7.361233466436759, + "learning_rate": 6.743990841169844e-06, + "loss": 0.8773, + "step": 23850 + }, + { + "epoch": 1.34, + "grad_norm": 7.162267733768624, + "learning_rate": 6.7424546689446735e-06, + "loss": 0.9123, + "step": 23855 + }, + { + "epoch": 1.34, + "grad_norm": 6.6656794383814075, + "learning_rate": 6.7409183094850606e-06, + "loss": 0.8381, + "step": 23860 + }, + { + "epoch": 1.34, + "grad_norm": 6.420367992270999, + "learning_rate": 6.739381762956091e-06, + "loss": 0.8862, + "step": 23865 + }, + { + "epoch": 1.34, + "grad_norm": 6.007625253825814, + "learning_rate": 6.737845029522873e-06, + "loss": 0.9083, + "step": 23870 + }, + { + "epoch": 1.35, + "grad_norm": 10.152991556675145, + "learning_rate": 6.7363081093505385e-06, + "loss": 0.948, + "step": 23875 + }, + { + "epoch": 1.35, + "grad_norm": 5.763319672444526, + "learning_rate": 6.734771002604234e-06, + "loss": 0.9215, + "step": 23880 + }, + { + "epoch": 1.35, + "grad_norm": 6.940980897857443, + "learning_rate": 6.73323370944913e-06, + "loss": 0.8981, + "step": 23885 + }, + { + "epoch": 1.35, + "grad_norm": 15.008914292982135, + "learning_rate": 6.731696230050415e-06, + "loss": 0.9224, + "step": 23890 + }, + { + "epoch": 1.35, + "grad_norm": 23.381809517188863, + "learning_rate": 6.730158564573297e-06, + "loss": 0.9244, + "step": 23895 + }, + { + "epoch": 1.35, + "grad_norm": 8.774575802257527, + "learning_rate": 6.728620713183005e-06, + "loss": 0.9301, + "step": 23900 + }, + { + "epoch": 1.35, + "grad_norm": 12.993235258233463, + "learning_rate": 6.727082676044789e-06, + "loss": 0.8784, + "step": 23905 + }, + { + "epoch": 1.35, + "grad_norm": 19.364974308253316, + "learning_rate": 6.725544453323917e-06, + "loss": 0.8971, + "step": 23910 + }, + { + "epoch": 1.35, + "grad_norm": 13.348752077592053, + "learning_rate": 6.724006045185678e-06, + "loss": 0.8947, + "step": 23915 + }, + { + "epoch": 1.35, + "grad_norm": 6.945222014315064, + "learning_rate": 6.72246745179538e-06, + "loss": 0.9303, + "step": 23920 + }, + { + "epoch": 1.35, + "grad_norm": 6.256884730233514, + "learning_rate": 6.720928673318354e-06, + "loss": 0.908, + "step": 23925 + }, + { + "epoch": 1.35, + "grad_norm": 5.2846890661713175, + "learning_rate": 6.719389709919946e-06, + "loss": 0.8654, + "step": 23930 + }, + { + "epoch": 1.35, + "grad_norm": 15.431304333951894, + "learning_rate": 6.717850561765526e-06, + "loss": 0.8463, + "step": 23935 + }, + { + "epoch": 1.35, + "grad_norm": 11.647786409774287, + "learning_rate": 6.716311229020482e-06, + "loss": 0.9063, + "step": 23940 + }, + { + "epoch": 1.35, + "grad_norm": 11.275085440626508, + "learning_rate": 6.7147717118502195e-06, + "loss": 0.8848, + "step": 23945 + }, + { + "epoch": 1.35, + "grad_norm": 6.451789215212424, + "learning_rate": 6.713232010420172e-06, + "loss": 0.9511, + "step": 23950 + }, + { + "epoch": 1.35, + "grad_norm": 6.311045463061686, + "learning_rate": 6.7116921248957825e-06, + "loss": 0.9176, + "step": 23955 + }, + { + "epoch": 1.35, + "grad_norm": 6.385579122070995, + "learning_rate": 6.71015205544252e-06, + "loss": 0.9255, + "step": 23960 + }, + { + "epoch": 1.35, + "grad_norm": 7.113695843819294, + "learning_rate": 6.70861180222587e-06, + "loss": 0.9171, + "step": 23965 + }, + { + "epoch": 1.35, + "grad_norm": 5.7170783559669385, + "learning_rate": 6.707071365411342e-06, + "loss": 0.861, + "step": 23970 + }, + { + "epoch": 1.35, + "grad_norm": 6.540916617313485, + "learning_rate": 6.705530745164462e-06, + "loss": 0.9204, + "step": 23975 + }, + { + "epoch": 1.35, + "grad_norm": 5.6984347449119905, + "learning_rate": 6.703989941650778e-06, + "loss": 0.923, + "step": 23980 + }, + { + "epoch": 1.35, + "grad_norm": 10.954047441243636, + "learning_rate": 6.702448955035854e-06, + "loss": 0.9062, + "step": 23985 + }, + { + "epoch": 1.35, + "grad_norm": 7.854859766330289, + "learning_rate": 6.700907785485276e-06, + "loss": 0.9246, + "step": 23990 + }, + { + "epoch": 1.35, + "grad_norm": 5.4202023328370075, + "learning_rate": 6.699366433164648e-06, + "loss": 0.8483, + "step": 23995 + }, + { + "epoch": 1.35, + "grad_norm": 7.144621933045308, + "learning_rate": 6.6978248982395985e-06, + "loss": 0.9012, + "step": 24000 + }, + { + "epoch": 1.35, + "grad_norm": 6.063579652154793, + "learning_rate": 6.69628318087577e-06, + "loss": 0.8871, + "step": 24005 + }, + { + "epoch": 1.35, + "grad_norm": 5.171947794665156, + "learning_rate": 6.694741281238828e-06, + "loss": 0.8746, + "step": 24010 + }, + { + "epoch": 1.35, + "grad_norm": 6.753600198946728, + "learning_rate": 6.693199199494455e-06, + "loss": 0.8552, + "step": 24015 + }, + { + "epoch": 1.35, + "grad_norm": 6.824389911709043, + "learning_rate": 6.691656935808356e-06, + "loss": 0.9, + "step": 24020 + }, + { + "epoch": 1.35, + "grad_norm": 5.070175980848593, + "learning_rate": 6.690114490346253e-06, + "loss": 0.9461, + "step": 24025 + }, + { + "epoch": 1.35, + "grad_norm": 4.3289723176474615, + "learning_rate": 6.6885718632738905e-06, + "loss": 0.8524, + "step": 24030 + }, + { + "epoch": 1.35, + "grad_norm": 5.3145039574700155, + "learning_rate": 6.687029054757028e-06, + "loss": 0.9205, + "step": 24035 + }, + { + "epoch": 1.35, + "grad_norm": 4.822699493157175, + "learning_rate": 6.6854860649614485e-06, + "loss": 0.9234, + "step": 24040 + }, + { + "epoch": 1.35, + "grad_norm": 4.996966757584824, + "learning_rate": 6.683942894052953e-06, + "loss": 0.9018, + "step": 24045 + }, + { + "epoch": 1.36, + "grad_norm": 7.993873202858208, + "learning_rate": 6.6823995421973595e-06, + "loss": 0.875, + "step": 24050 + }, + { + "epoch": 1.36, + "grad_norm": 18.882760470552306, + "learning_rate": 6.680856009560512e-06, + "loss": 0.9592, + "step": 24055 + }, + { + "epoch": 1.36, + "grad_norm": 9.374507719422299, + "learning_rate": 6.679312296308267e-06, + "loss": 0.8681, + "step": 24060 + }, + { + "epoch": 1.36, + "grad_norm": 15.972104971967129, + "learning_rate": 6.677768402606506e-06, + "loss": 0.8594, + "step": 24065 + }, + { + "epoch": 1.36, + "grad_norm": 5.961894260257484, + "learning_rate": 6.676224328621125e-06, + "loss": 0.9086, + "step": 24070 + }, + { + "epoch": 1.36, + "grad_norm": 15.207913770094201, + "learning_rate": 6.674680074518042e-06, + "loss": 0.9557, + "step": 24075 + }, + { + "epoch": 1.36, + "grad_norm": 8.48564843157255, + "learning_rate": 6.673135640463193e-06, + "loss": 0.9001, + "step": 24080 + }, + { + "epoch": 1.36, + "grad_norm": 18.833650036343723, + "learning_rate": 6.671591026622537e-06, + "loss": 0.9063, + "step": 24085 + }, + { + "epoch": 1.36, + "grad_norm": 5.149536944530058, + "learning_rate": 6.670046233162047e-06, + "loss": 0.8651, + "step": 24090 + }, + { + "epoch": 1.36, + "grad_norm": 34.02702911959087, + "learning_rate": 6.6685012602477195e-06, + "loss": 0.8629, + "step": 24095 + }, + { + "epoch": 1.36, + "grad_norm": 12.067272681334696, + "learning_rate": 6.666956108045567e-06, + "loss": 0.9668, + "step": 24100 + }, + { + "epoch": 1.36, + "grad_norm": 4.837599270410714, + "learning_rate": 6.665410776721624e-06, + "loss": 0.8588, + "step": 24105 + }, + { + "epoch": 1.36, + "grad_norm": 6.400501710036545, + "learning_rate": 6.6638652664419435e-06, + "loss": 0.8645, + "step": 24110 + }, + { + "epoch": 1.36, + "grad_norm": 11.535531093312578, + "learning_rate": 6.662319577372595e-06, + "loss": 0.8897, + "step": 24115 + }, + { + "epoch": 1.36, + "grad_norm": 5.668187540853762, + "learning_rate": 6.660773709679675e-06, + "loss": 0.9188, + "step": 24120 + }, + { + "epoch": 1.36, + "grad_norm": 5.22530715349334, + "learning_rate": 6.659227663529289e-06, + "loss": 0.9295, + "step": 24125 + }, + { + "epoch": 1.36, + "grad_norm": 5.4353611915676865, + "learning_rate": 6.657681439087568e-06, + "loss": 0.9431, + "step": 24130 + }, + { + "epoch": 1.36, + "grad_norm": 5.163732371466336, + "learning_rate": 6.656135036520659e-06, + "loss": 0.9214, + "step": 24135 + }, + { + "epoch": 1.36, + "grad_norm": 6.221908914196966, + "learning_rate": 6.6545884559947326e-06, + "loss": 0.9004, + "step": 24140 + }, + { + "epoch": 1.36, + "grad_norm": 16.785413527142993, + "learning_rate": 6.6530416976759726e-06, + "loss": 0.903, + "step": 24145 + }, + { + "epoch": 1.36, + "grad_norm": 14.236098941288745, + "learning_rate": 6.651494761730588e-06, + "loss": 0.8998, + "step": 24150 + }, + { + "epoch": 1.36, + "grad_norm": 7.807280501531137, + "learning_rate": 6.649947648324804e-06, + "loss": 0.8643, + "step": 24155 + }, + { + "epoch": 1.36, + "grad_norm": 7.15111923004061, + "learning_rate": 6.648400357624861e-06, + "loss": 0.9288, + "step": 24160 + }, + { + "epoch": 1.36, + "grad_norm": 13.499821742039364, + "learning_rate": 6.646852889797026e-06, + "loss": 0.8844, + "step": 24165 + }, + { + "epoch": 1.36, + "grad_norm": 5.0951989170665595, + "learning_rate": 6.645305245007579e-06, + "loss": 0.877, + "step": 24170 + }, + { + "epoch": 1.36, + "grad_norm": 12.205097681339211, + "learning_rate": 6.643757423422823e-06, + "loss": 0.9341, + "step": 24175 + }, + { + "epoch": 1.36, + "grad_norm": 10.533925349318908, + "learning_rate": 6.6422094252090785e-06, + "loss": 0.8331, + "step": 24180 + }, + { + "epoch": 1.36, + "grad_norm": 5.514997555381436, + "learning_rate": 6.640661250532682e-06, + "loss": 0.8939, + "step": 24185 + }, + { + "epoch": 1.36, + "grad_norm": 5.497389223312976, + "learning_rate": 6.639112899559993e-06, + "loss": 0.9044, + "step": 24190 + }, + { + "epoch": 1.36, + "grad_norm": 6.689805575524457, + "learning_rate": 6.637564372457389e-06, + "loss": 0.8848, + "step": 24195 + }, + { + "epoch": 1.36, + "grad_norm": 8.161155418796742, + "learning_rate": 6.636015669391267e-06, + "loss": 0.8491, + "step": 24200 + }, + { + "epoch": 1.36, + "grad_norm": 7.322713922986848, + "learning_rate": 6.634466790528041e-06, + "loss": 0.9094, + "step": 24205 + }, + { + "epoch": 1.36, + "grad_norm": 7.73281249455527, + "learning_rate": 6.632917736034144e-06, + "loss": 0.8755, + "step": 24210 + }, + { + "epoch": 1.36, + "grad_norm": 19.709596528897627, + "learning_rate": 6.63136850607603e-06, + "loss": 0.8604, + "step": 24215 + }, + { + "epoch": 1.36, + "grad_norm": 7.4034785590760555, + "learning_rate": 6.629819100820171e-06, + "loss": 0.8708, + "step": 24220 + }, + { + "epoch": 1.36, + "grad_norm": 5.1557035973551235, + "learning_rate": 6.628269520433056e-06, + "loss": 0.9246, + "step": 24225 + }, + { + "epoch": 1.37, + "grad_norm": 7.652696342299771, + "learning_rate": 6.626719765081196e-06, + "loss": 0.8771, + "step": 24230 + }, + { + "epoch": 1.37, + "grad_norm": 8.037612278057251, + "learning_rate": 6.6251698349311155e-06, + "loss": 0.9395, + "step": 24235 + }, + { + "epoch": 1.37, + "grad_norm": 7.725450029748061, + "learning_rate": 6.623619730149366e-06, + "loss": 0.9113, + "step": 24240 + }, + { + "epoch": 1.37, + "grad_norm": 11.9854179888099, + "learning_rate": 6.622069450902511e-06, + "loss": 0.976, + "step": 24245 + }, + { + "epoch": 1.37, + "grad_norm": 13.286664327455405, + "learning_rate": 6.6205189973571326e-06, + "loss": 0.8414, + "step": 24250 + }, + { + "epoch": 1.37, + "grad_norm": 5.455908245073016, + "learning_rate": 6.6189683696798375e-06, + "loss": 0.8579, + "step": 24255 + }, + { + "epoch": 1.37, + "grad_norm": 26.273909501813236, + "learning_rate": 6.617417568037246e-06, + "loss": 0.9301, + "step": 24260 + }, + { + "epoch": 1.37, + "grad_norm": 6.677031404711934, + "learning_rate": 6.615866592596e-06, + "loss": 0.8775, + "step": 24265 + }, + { + "epoch": 1.37, + "grad_norm": 8.576371235769397, + "learning_rate": 6.614315443522756e-06, + "loss": 0.8928, + "step": 24270 + }, + { + "epoch": 1.37, + "grad_norm": 5.643525558880776, + "learning_rate": 6.612764120984192e-06, + "loss": 0.8931, + "step": 24275 + }, + { + "epoch": 1.37, + "grad_norm": 6.599883259262377, + "learning_rate": 6.6112126251470075e-06, + "loss": 0.8728, + "step": 24280 + }, + { + "epoch": 1.37, + "grad_norm": 5.8797725768296605, + "learning_rate": 6.609660956177915e-06, + "loss": 0.877, + "step": 24285 + }, + { + "epoch": 1.37, + "grad_norm": 8.531567409636851, + "learning_rate": 6.6081091142436475e-06, + "loss": 0.8984, + "step": 24290 + }, + { + "epoch": 1.37, + "grad_norm": 7.849773411092357, + "learning_rate": 6.60655709951096e-06, + "loss": 0.8277, + "step": 24295 + }, + { + "epoch": 1.37, + "grad_norm": 5.6100089507358035, + "learning_rate": 6.60500491214662e-06, + "loss": 0.8839, + "step": 24300 + }, + { + "epoch": 1.37, + "grad_norm": 6.848983897832184, + "learning_rate": 6.60345255231742e-06, + "loss": 0.9342, + "step": 24305 + }, + { + "epoch": 1.37, + "grad_norm": 6.013188110826276, + "learning_rate": 6.601900020190167e-06, + "loss": 0.8687, + "step": 24310 + }, + { + "epoch": 1.37, + "grad_norm": 7.455077724905622, + "learning_rate": 6.600347315931687e-06, + "loss": 0.8886, + "step": 24315 + }, + { + "epoch": 1.37, + "grad_norm": 5.825880326441571, + "learning_rate": 6.598794439708825e-06, + "loss": 0.9145, + "step": 24320 + }, + { + "epoch": 1.37, + "grad_norm": 11.456776446291894, + "learning_rate": 6.597241391688445e-06, + "loss": 0.9086, + "step": 24325 + }, + { + "epoch": 1.37, + "grad_norm": 5.479615448910597, + "learning_rate": 6.595688172037426e-06, + "loss": 0.8931, + "step": 24330 + }, + { + "epoch": 1.37, + "grad_norm": 6.65624270643566, + "learning_rate": 6.594134780922672e-06, + "loss": 0.8784, + "step": 24335 + }, + { + "epoch": 1.37, + "grad_norm": 8.385421035507843, + "learning_rate": 6.5925812185111e-06, + "loss": 0.854, + "step": 24340 + }, + { + "epoch": 1.37, + "grad_norm": 8.20468556438624, + "learning_rate": 6.591027484969649e-06, + "loss": 0.9047, + "step": 24345 + }, + { + "epoch": 1.37, + "grad_norm": 6.081358205395726, + "learning_rate": 6.589473580465273e-06, + "loss": 0.8766, + "step": 24350 + }, + { + "epoch": 1.37, + "grad_norm": 18.60755075072945, + "learning_rate": 6.587919505164944e-06, + "loss": 0.8444, + "step": 24355 + }, + { + "epoch": 1.37, + "grad_norm": 6.975445326040262, + "learning_rate": 6.586365259235658e-06, + "loss": 0.9368, + "step": 24360 + }, + { + "epoch": 1.37, + "grad_norm": 11.440889850561119, + "learning_rate": 6.584810842844423e-06, + "loss": 0.8926, + "step": 24365 + }, + { + "epoch": 1.37, + "grad_norm": 7.97226779666037, + "learning_rate": 6.583256256158268e-06, + "loss": 0.9052, + "step": 24370 + }, + { + "epoch": 1.37, + "grad_norm": 12.614871942084818, + "learning_rate": 6.581701499344241e-06, + "loss": 0.891, + "step": 24375 + }, + { + "epoch": 1.37, + "grad_norm": 18.98225230714363, + "learning_rate": 6.5801465725694084e-06, + "loss": 0.8997, + "step": 24380 + }, + { + "epoch": 1.37, + "grad_norm": 12.141527249054406, + "learning_rate": 6.57859147600085e-06, + "loss": 0.9336, + "step": 24385 + }, + { + "epoch": 1.37, + "grad_norm": 26.978978481650447, + "learning_rate": 6.577036209805674e-06, + "loss": 0.9623, + "step": 24390 + }, + { + "epoch": 1.37, + "grad_norm": 5.0085285157054145, + "learning_rate": 6.575480774150995e-06, + "loss": 0.883, + "step": 24395 + }, + { + "epoch": 1.37, + "grad_norm": 9.973297400687372, + "learning_rate": 6.573925169203955e-06, + "loss": 0.8907, + "step": 24400 + }, + { + "epoch": 1.38, + "grad_norm": 10.12874098597241, + "learning_rate": 6.57236939513171e-06, + "loss": 0.8681, + "step": 24405 + }, + { + "epoch": 1.38, + "grad_norm": 20.489405514091995, + "learning_rate": 6.570813452101433e-06, + "loss": 0.9485, + "step": 24410 + }, + { + "epoch": 1.38, + "grad_norm": 5.825511689833132, + "learning_rate": 6.5692573402803185e-06, + "loss": 0.9311, + "step": 24415 + }, + { + "epoch": 1.38, + "grad_norm": 11.207204710047804, + "learning_rate": 6.567701059835576e-06, + "loss": 0.9101, + "step": 24420 + }, + { + "epoch": 1.38, + "grad_norm": 12.565623632795624, + "learning_rate": 6.566144610934435e-06, + "loss": 0.8604, + "step": 24425 + }, + { + "epoch": 1.38, + "grad_norm": 10.191331791124046, + "learning_rate": 6.564587993744146e-06, + "loss": 0.9112, + "step": 24430 + }, + { + "epoch": 1.38, + "grad_norm": 6.719323186471554, + "learning_rate": 6.563031208431972e-06, + "loss": 0.8985, + "step": 24435 + }, + { + "epoch": 1.38, + "grad_norm": 6.472543929164538, + "learning_rate": 6.561474255165195e-06, + "loss": 0.8866, + "step": 24440 + }, + { + "epoch": 1.38, + "grad_norm": 13.278997033725792, + "learning_rate": 6.559917134111119e-06, + "loss": 0.8752, + "step": 24445 + }, + { + "epoch": 1.38, + "grad_norm": 4.931142163657672, + "learning_rate": 6.558359845437062e-06, + "loss": 0.8647, + "step": 24450 + }, + { + "epoch": 1.38, + "grad_norm": 8.655121288301975, + "learning_rate": 6.556802389310363e-06, + "loss": 0.8521, + "step": 24455 + }, + { + "epoch": 1.38, + "grad_norm": 15.483630269448815, + "learning_rate": 6.555244765898375e-06, + "loss": 0.9128, + "step": 24460 + }, + { + "epoch": 1.38, + "grad_norm": 11.528404552133885, + "learning_rate": 6.553686975368474e-06, + "loss": 0.9096, + "step": 24465 + }, + { + "epoch": 1.38, + "grad_norm": 5.20347045163404, + "learning_rate": 6.5521290178880506e-06, + "loss": 0.8642, + "step": 24470 + }, + { + "epoch": 1.38, + "grad_norm": 7.485085670272495, + "learning_rate": 6.550570893624514e-06, + "loss": 0.9377, + "step": 24475 + }, + { + "epoch": 1.38, + "grad_norm": 6.887696281800459, + "learning_rate": 6.549012602745291e-06, + "loss": 0.8747, + "step": 24480 + }, + { + "epoch": 1.38, + "grad_norm": 4.994828999328062, + "learning_rate": 6.547454145417829e-06, + "loss": 0.9385, + "step": 24485 + }, + { + "epoch": 1.38, + "grad_norm": 7.334231853516249, + "learning_rate": 6.545895521809589e-06, + "loss": 0.8794, + "step": 24490 + }, + { + "epoch": 1.38, + "grad_norm": 7.268624783916795, + "learning_rate": 6.5443367320880525e-06, + "loss": 0.8619, + "step": 24495 + }, + { + "epoch": 1.38, + "grad_norm": 6.783650959083377, + "learning_rate": 6.542777776420718e-06, + "loss": 0.8789, + "step": 24500 + }, + { + "epoch": 1.38, + "grad_norm": 7.9292372237481, + "learning_rate": 6.541218654975104e-06, + "loss": 0.8944, + "step": 24505 + }, + { + "epoch": 1.38, + "grad_norm": 7.230987610753485, + "learning_rate": 6.539659367918742e-06, + "loss": 0.8336, + "step": 24510 + }, + { + "epoch": 1.38, + "grad_norm": 5.402426235708, + "learning_rate": 6.538099915419186e-06, + "loss": 0.9086, + "step": 24515 + }, + { + "epoch": 1.38, + "grad_norm": 5.521747168511222, + "learning_rate": 6.536540297644006e-06, + "loss": 0.907, + "step": 24520 + }, + { + "epoch": 1.38, + "grad_norm": 11.288832044319982, + "learning_rate": 6.53498051476079e-06, + "loss": 0.8607, + "step": 24525 + }, + { + "epoch": 1.38, + "grad_norm": 11.257944223613798, + "learning_rate": 6.533420566937144e-06, + "loss": 0.8738, + "step": 24530 + }, + { + "epoch": 1.38, + "grad_norm": 4.697763669074525, + "learning_rate": 6.53186045434069e-06, + "loss": 0.8742, + "step": 24535 + }, + { + "epoch": 1.38, + "grad_norm": 6.057755241535447, + "learning_rate": 6.530300177139069e-06, + "loss": 0.9117, + "step": 24540 + }, + { + "epoch": 1.38, + "grad_norm": 5.195785411490776, + "learning_rate": 6.528739735499941e-06, + "loss": 0.9315, + "step": 24545 + }, + { + "epoch": 1.38, + "grad_norm": 8.077626964672405, + "learning_rate": 6.5271791295909825e-06, + "loss": 0.8673, + "step": 24550 + }, + { + "epoch": 1.38, + "grad_norm": 5.225630531251211, + "learning_rate": 6.525618359579886e-06, + "loss": 0.9096, + "step": 24555 + }, + { + "epoch": 1.38, + "grad_norm": 5.130510366921712, + "learning_rate": 6.5240574256343616e-06, + "loss": 0.8678, + "step": 24560 + }, + { + "epoch": 1.38, + "grad_norm": 7.621493811126335, + "learning_rate": 6.5224963279221425e-06, + "loss": 0.9051, + "step": 24565 + }, + { + "epoch": 1.38, + "grad_norm": 5.722220389242948, + "learning_rate": 6.520935066610974e-06, + "loss": 0.8903, + "step": 24570 + }, + { + "epoch": 1.38, + "grad_norm": 7.155998115487919, + "learning_rate": 6.519373641868618e-06, + "loss": 0.8764, + "step": 24575 + }, + { + "epoch": 1.38, + "grad_norm": 12.048569421418783, + "learning_rate": 6.517812053862862e-06, + "loss": 0.9247, + "step": 24580 + }, + { + "epoch": 1.39, + "grad_norm": 5.756624372537695, + "learning_rate": 6.5162503027615e-06, + "loss": 0.9014, + "step": 24585 + }, + { + "epoch": 1.39, + "grad_norm": 6.737979349085914, + "learning_rate": 6.514688388732352e-06, + "loss": 0.8604, + "step": 24590 + }, + { + "epoch": 1.39, + "grad_norm": 14.919735464378718, + "learning_rate": 6.513126311943252e-06, + "loss": 0.9148, + "step": 24595 + }, + { + "epoch": 1.39, + "grad_norm": 16.890362198732355, + "learning_rate": 6.511564072562051e-06, + "loss": 0.8921, + "step": 24600 + }, + { + "epoch": 1.39, + "grad_norm": 5.487482611124358, + "learning_rate": 6.51000167075662e-06, + "loss": 0.8393, + "step": 24605 + }, + { + "epoch": 1.39, + "grad_norm": 6.096843095711902, + "learning_rate": 6.508439106694845e-06, + "loss": 0.8853, + "step": 24610 + }, + { + "epoch": 1.39, + "grad_norm": 5.893865245494559, + "learning_rate": 6.50687638054463e-06, + "loss": 0.9308, + "step": 24615 + }, + { + "epoch": 1.39, + "grad_norm": 5.374757157768002, + "learning_rate": 6.505313492473899e-06, + "loss": 0.8687, + "step": 24620 + }, + { + "epoch": 1.39, + "grad_norm": 5.920382017173019, + "learning_rate": 6.503750442650589e-06, + "loss": 0.8968, + "step": 24625 + }, + { + "epoch": 1.39, + "grad_norm": 4.83421800476811, + "learning_rate": 6.502187231242657e-06, + "loss": 0.8701, + "step": 24630 + }, + { + "epoch": 1.39, + "grad_norm": 8.689279051469565, + "learning_rate": 6.500623858418079e-06, + "loss": 0.9134, + "step": 24635 + }, + { + "epoch": 1.39, + "grad_norm": 5.225474214139165, + "learning_rate": 6.499060324344844e-06, + "loss": 0.8801, + "step": 24640 + }, + { + "epoch": 1.39, + "grad_norm": 6.285216527163399, + "learning_rate": 6.497496629190961e-06, + "loss": 0.9135, + "step": 24645 + }, + { + "epoch": 1.39, + "grad_norm": 19.869406471728855, + "learning_rate": 6.4959327731244555e-06, + "loss": 0.8699, + "step": 24650 + }, + { + "epoch": 1.39, + "grad_norm": 5.684139317173707, + "learning_rate": 6.494368756313371e-06, + "loss": 0.8906, + "step": 24655 + }, + { + "epoch": 1.39, + "grad_norm": 13.71579311627586, + "learning_rate": 6.492804578925769e-06, + "loss": 0.893, + "step": 24660 + }, + { + "epoch": 1.39, + "grad_norm": 4.936805310285867, + "learning_rate": 6.4912402411297286e-06, + "loss": 0.8631, + "step": 24665 + }, + { + "epoch": 1.39, + "grad_norm": 19.05884857433253, + "learning_rate": 6.489675743093341e-06, + "loss": 0.9146, + "step": 24670 + }, + { + "epoch": 1.39, + "grad_norm": 6.3067112291437715, + "learning_rate": 6.4881110849847204e-06, + "loss": 0.8826, + "step": 24675 + }, + { + "epoch": 1.39, + "grad_norm": 8.880415774068949, + "learning_rate": 6.486546266971998e-06, + "loss": 0.7971, + "step": 24680 + }, + { + "epoch": 1.39, + "grad_norm": 10.686128962403247, + "learning_rate": 6.484981289223317e-06, + "loss": 0.842, + "step": 24685 + }, + { + "epoch": 1.39, + "grad_norm": 15.92598098721158, + "learning_rate": 6.483416151906844e-06, + "loss": 0.883, + "step": 24690 + }, + { + "epoch": 1.39, + "grad_norm": 5.630521797861853, + "learning_rate": 6.481850855190757e-06, + "loss": 0.9244, + "step": 24695 + }, + { + "epoch": 1.39, + "grad_norm": 27.8139426452066, + "learning_rate": 6.480285399243256e-06, + "loss": 0.8625, + "step": 24700 + }, + { + "epoch": 1.39, + "grad_norm": 6.430948094362927, + "learning_rate": 6.478719784232556e-06, + "loss": 0.8972, + "step": 24705 + }, + { + "epoch": 1.39, + "grad_norm": 12.999048720723598, + "learning_rate": 6.477154010326889e-06, + "loss": 0.9009, + "step": 24710 + }, + { + "epoch": 1.39, + "grad_norm": 6.965381645274808, + "learning_rate": 6.475588077694505e-06, + "loss": 0.8493, + "step": 24715 + }, + { + "epoch": 1.39, + "grad_norm": 15.18368555625259, + "learning_rate": 6.474021986503671e-06, + "loss": 0.8346, + "step": 24720 + }, + { + "epoch": 1.39, + "grad_norm": 6.818410445079008, + "learning_rate": 6.472455736922667e-06, + "loss": 0.8637, + "step": 24725 + }, + { + "epoch": 1.39, + "grad_norm": 9.831499519101065, + "learning_rate": 6.4708893291197975e-06, + "loss": 0.9107, + "step": 24730 + }, + { + "epoch": 1.39, + "grad_norm": 11.697058660581076, + "learning_rate": 6.469322763263376e-06, + "loss": 0.9177, + "step": 24735 + }, + { + "epoch": 1.39, + "grad_norm": 8.019008254570577, + "learning_rate": 6.46775603952174e-06, + "loss": 0.9133, + "step": 24740 + }, + { + "epoch": 1.39, + "grad_norm": 7.298237758721369, + "learning_rate": 6.466189158063239e-06, + "loss": 0.9255, + "step": 24745 + }, + { + "epoch": 1.39, + "grad_norm": 5.8914488766991715, + "learning_rate": 6.464622119056244e-06, + "loss": 0.9161, + "step": 24750 + }, + { + "epoch": 1.39, + "grad_norm": 5.43374631683808, + "learning_rate": 6.463054922669137e-06, + "loss": 0.9035, + "step": 24755 + }, + { + "epoch": 1.4, + "grad_norm": 6.132517055749973, + "learning_rate": 6.4614875690703215e-06, + "loss": 0.8667, + "step": 24760 + }, + { + "epoch": 1.4, + "grad_norm": 7.49444207450362, + "learning_rate": 6.459920058428217e-06, + "loss": 0.8921, + "step": 24765 + }, + { + "epoch": 1.4, + "grad_norm": 8.045145716078492, + "learning_rate": 6.45835239091126e-06, + "loss": 0.8992, + "step": 24770 + }, + { + "epoch": 1.4, + "grad_norm": 6.961172024303564, + "learning_rate": 6.456784566687902e-06, + "loss": 0.8938, + "step": 24775 + }, + { + "epoch": 1.4, + "grad_norm": 10.233519988942753, + "learning_rate": 6.455216585926613e-06, + "loss": 0.8617, + "step": 24780 + }, + { + "epoch": 1.4, + "grad_norm": 8.23949624083405, + "learning_rate": 6.45364844879588e-06, + "loss": 0.8431, + "step": 24785 + }, + { + "epoch": 1.4, + "grad_norm": 9.396077134941041, + "learning_rate": 6.452080155464203e-06, + "loss": 0.8919, + "step": 24790 + }, + { + "epoch": 1.4, + "grad_norm": 6.396749198929252, + "learning_rate": 6.450511706100107e-06, + "loss": 0.8904, + "step": 24795 + }, + { + "epoch": 1.4, + "grad_norm": 8.877715855336982, + "learning_rate": 6.448943100872126e-06, + "loss": 0.8361, + "step": 24800 + }, + { + "epoch": 1.4, + "grad_norm": 5.822409634067037, + "learning_rate": 6.4473743399488155e-06, + "loss": 0.9148, + "step": 24805 + }, + { + "epoch": 1.4, + "grad_norm": 7.7102776655145, + "learning_rate": 6.445805423498743e-06, + "loss": 0.9055, + "step": 24810 + }, + { + "epoch": 1.4, + "grad_norm": 5.858371842544252, + "learning_rate": 6.444236351690497e-06, + "loss": 0.914, + "step": 24815 + }, + { + "epoch": 1.4, + "grad_norm": 4.962813924582823, + "learning_rate": 6.442667124692682e-06, + "loss": 0.8621, + "step": 24820 + }, + { + "epoch": 1.4, + "grad_norm": 6.891505022894111, + "learning_rate": 6.441097742673916e-06, + "loss": 0.8471, + "step": 24825 + }, + { + "epoch": 1.4, + "grad_norm": 5.47898487299598, + "learning_rate": 6.439528205802839e-06, + "loss": 0.8807, + "step": 24830 + }, + { + "epoch": 1.4, + "grad_norm": 5.065318273903309, + "learning_rate": 6.437958514248103e-06, + "loss": 0.8413, + "step": 24835 + }, + { + "epoch": 1.4, + "grad_norm": 5.29291027692286, + "learning_rate": 6.436388668178377e-06, + "loss": 0.9162, + "step": 24840 + }, + { + "epoch": 1.4, + "grad_norm": 8.91946410906538, + "learning_rate": 6.434818667762351e-06, + "loss": 0.8399, + "step": 24845 + }, + { + "epoch": 1.4, + "grad_norm": 13.824563437322766, + "learning_rate": 6.433248513168726e-06, + "loss": 0.9025, + "step": 24850 + }, + { + "epoch": 1.4, + "grad_norm": 6.914602587605255, + "learning_rate": 6.431678204566223e-06, + "loss": 0.8224, + "step": 24855 + }, + { + "epoch": 1.4, + "grad_norm": 19.121135709261793, + "learning_rate": 6.430107742123578e-06, + "loss": 0.8606, + "step": 24860 + }, + { + "epoch": 1.4, + "grad_norm": 5.500416970879522, + "learning_rate": 6.428537126009547e-06, + "loss": 0.8898, + "step": 24865 + }, + { + "epoch": 1.4, + "grad_norm": 8.45941980315844, + "learning_rate": 6.4269663563928945e-06, + "loss": 0.8737, + "step": 24870 + }, + { + "epoch": 1.4, + "grad_norm": 7.411222889141725, + "learning_rate": 6.42539543344241e-06, + "loss": 0.8824, + "step": 24875 + }, + { + "epoch": 1.4, + "grad_norm": 5.010538313701555, + "learning_rate": 6.423824357326894e-06, + "loss": 0.912, + "step": 24880 + }, + { + "epoch": 1.4, + "grad_norm": 6.049221208392482, + "learning_rate": 6.422253128215168e-06, + "loss": 0.874, + "step": 24885 + }, + { + "epoch": 1.4, + "grad_norm": 8.948141376148575, + "learning_rate": 6.420681746276067e-06, + "loss": 0.8519, + "step": 24890 + }, + { + "epoch": 1.4, + "grad_norm": 8.051185069525754, + "learning_rate": 6.419110211678443e-06, + "loss": 0.8837, + "step": 24895 + }, + { + "epoch": 1.4, + "grad_norm": 5.6009875393177175, + "learning_rate": 6.4175385245911625e-06, + "loss": 0.8951, + "step": 24900 + }, + { + "epoch": 1.4, + "grad_norm": 7.436140089614032, + "learning_rate": 6.415966685183113e-06, + "loss": 0.9, + "step": 24905 + }, + { + "epoch": 1.4, + "grad_norm": 8.148240214992114, + "learning_rate": 6.414394693623193e-06, + "loss": 0.8465, + "step": 24910 + }, + { + "epoch": 1.4, + "grad_norm": 8.605172598122554, + "learning_rate": 6.4128225500803196e-06, + "loss": 0.8897, + "step": 24915 + }, + { + "epoch": 1.4, + "grad_norm": 9.289992920383536, + "learning_rate": 6.4112502547234315e-06, + "loss": 0.8595, + "step": 24920 + }, + { + "epoch": 1.4, + "grad_norm": 9.744894114120008, + "learning_rate": 6.409677807721472e-06, + "loss": 0.9081, + "step": 24925 + }, + { + "epoch": 1.4, + "grad_norm": 14.459954436924413, + "learning_rate": 6.408105209243411e-06, + "loss": 0.898, + "step": 24930 + }, + { + "epoch": 1.4, + "grad_norm": 8.015891739297883, + "learning_rate": 6.406532459458233e-06, + "loss": 0.8679, + "step": 24935 + }, + { + "epoch": 1.41, + "grad_norm": 14.921865963329948, + "learning_rate": 6.404959558534932e-06, + "loss": 0.8989, + "step": 24940 + }, + { + "epoch": 1.41, + "grad_norm": 7.06684515241933, + "learning_rate": 6.403386506642527e-06, + "loss": 0.8739, + "step": 24945 + }, + { + "epoch": 1.41, + "grad_norm": 5.934994574831989, + "learning_rate": 6.40181330395005e-06, + "loss": 0.8825, + "step": 24950 + }, + { + "epoch": 1.41, + "grad_norm": 5.9766846337708746, + "learning_rate": 6.400239950626545e-06, + "loss": 0.8815, + "step": 24955 + }, + { + "epoch": 1.41, + "grad_norm": 5.282948521741755, + "learning_rate": 6.398666446841077e-06, + "loss": 0.8781, + "step": 24960 + }, + { + "epoch": 1.41, + "grad_norm": 16.211274590780548, + "learning_rate": 6.397092792762728e-06, + "loss": 0.8597, + "step": 24965 + }, + { + "epoch": 1.41, + "grad_norm": 16.644971253752303, + "learning_rate": 6.395518988560592e-06, + "loss": 0.8741, + "step": 24970 + }, + { + "epoch": 1.41, + "grad_norm": 5.434603215972421, + "learning_rate": 6.393945034403781e-06, + "loss": 0.874, + "step": 24975 + }, + { + "epoch": 1.41, + "grad_norm": 6.6981290913609834, + "learning_rate": 6.392370930461425e-06, + "loss": 0.8454, + "step": 24980 + }, + { + "epoch": 1.41, + "grad_norm": 6.097345226606922, + "learning_rate": 6.390796676902668e-06, + "loss": 0.8954, + "step": 24985 + }, + { + "epoch": 1.41, + "grad_norm": 4.765114001394074, + "learning_rate": 6.3892222738966695e-06, + "loss": 0.8748, + "step": 24990 + }, + { + "epoch": 1.41, + "grad_norm": 5.81137370437781, + "learning_rate": 6.387647721612605e-06, + "loss": 0.8775, + "step": 24995 + }, + { + "epoch": 1.41, + "grad_norm": 5.5015110343649205, + "learning_rate": 6.38607302021967e-06, + "loss": 0.8673, + "step": 25000 + }, + { + "epoch": 1.41, + "grad_norm": 5.9944624280897525, + "learning_rate": 6.384498169887073e-06, + "loss": 0.8924, + "step": 25005 + }, + { + "epoch": 1.41, + "grad_norm": 5.728359410450134, + "learning_rate": 6.382923170784037e-06, + "loss": 0.8606, + "step": 25010 + }, + { + "epoch": 1.41, + "grad_norm": 16.327083871082298, + "learning_rate": 6.381348023079801e-06, + "loss": 0.8697, + "step": 25015 + }, + { + "epoch": 1.41, + "grad_norm": 7.699955445201512, + "learning_rate": 6.379772726943624e-06, + "loss": 0.904, + "step": 25020 + }, + { + "epoch": 1.41, + "grad_norm": 36.29050066769943, + "learning_rate": 6.378197282544779e-06, + "loss": 0.8883, + "step": 25025 + }, + { + "epoch": 1.41, + "grad_norm": 6.3506843223871, + "learning_rate": 6.376621690052553e-06, + "loss": 0.9365, + "step": 25030 + }, + { + "epoch": 1.41, + "grad_norm": 13.72574921360742, + "learning_rate": 6.375045949636253e-06, + "loss": 0.8563, + "step": 25035 + }, + { + "epoch": 1.41, + "grad_norm": 8.781050704121032, + "learning_rate": 6.3734700614651945e-06, + "loss": 0.9129, + "step": 25040 + }, + { + "epoch": 1.41, + "grad_norm": 8.732855766505102, + "learning_rate": 6.371894025708718e-06, + "loss": 0.8889, + "step": 25045 + }, + { + "epoch": 1.41, + "grad_norm": 5.536775001681233, + "learning_rate": 6.370317842536173e-06, + "loss": 0.9323, + "step": 25050 + }, + { + "epoch": 1.41, + "grad_norm": 5.937566040867825, + "learning_rate": 6.368741512116927e-06, + "loss": 0.915, + "step": 25055 + }, + { + "epoch": 1.41, + "grad_norm": 6.695035820102787, + "learning_rate": 6.367165034620366e-06, + "loss": 0.8801, + "step": 25060 + }, + { + "epoch": 1.41, + "grad_norm": 5.506692219149493, + "learning_rate": 6.365588410215888e-06, + "loss": 0.8918, + "step": 25065 + }, + { + "epoch": 1.41, + "grad_norm": 17.933956555174458, + "learning_rate": 6.364011639072909e-06, + "loss": 0.8454, + "step": 25070 + }, + { + "epoch": 1.41, + "grad_norm": 11.953535565751432, + "learning_rate": 6.362434721360859e-06, + "loss": 0.9239, + "step": 25075 + }, + { + "epoch": 1.41, + "grad_norm": 37.87355257476908, + "learning_rate": 6.3608576572491865e-06, + "loss": 0.9462, + "step": 25080 + }, + { + "epoch": 1.41, + "grad_norm": 20.470777727115546, + "learning_rate": 6.3592804469073525e-06, + "loss": 0.9016, + "step": 25085 + }, + { + "epoch": 1.41, + "grad_norm": 9.541677099796816, + "learning_rate": 6.357703090504835e-06, + "loss": 1.0141, + "step": 25090 + }, + { + "epoch": 1.41, + "grad_norm": 7.172382287645066, + "learning_rate": 6.356125588211128e-06, + "loss": 0.8746, + "step": 25095 + }, + { + "epoch": 1.41, + "grad_norm": 18.655241483180802, + "learning_rate": 6.354547940195743e-06, + "loss": 0.9102, + "step": 25100 + }, + { + "epoch": 1.41, + "grad_norm": 21.650089145484905, + "learning_rate": 6.352970146628203e-06, + "loss": 0.8752, + "step": 25105 + }, + { + "epoch": 1.41, + "grad_norm": 38.9052328367733, + "learning_rate": 6.351392207678049e-06, + "loss": 0.9005, + "step": 25110 + }, + { + "epoch": 1.42, + "grad_norm": 25.1361538119838, + "learning_rate": 6.349814123514837e-06, + "loss": 0.9217, + "step": 25115 + }, + { + "epoch": 1.42, + "grad_norm": 8.728783466528613, + "learning_rate": 6.3482358943081426e-06, + "loss": 0.9287, + "step": 25120 + }, + { + "epoch": 1.42, + "grad_norm": 25.17885962726306, + "learning_rate": 6.34665752022755e-06, + "loss": 0.8961, + "step": 25125 + }, + { + "epoch": 1.42, + "grad_norm": 28.414541659143694, + "learning_rate": 6.3450790014426645e-06, + "loss": 0.9085, + "step": 25130 + }, + { + "epoch": 1.42, + "grad_norm": 8.030625309670791, + "learning_rate": 6.343500338123103e-06, + "loss": 0.8608, + "step": 25135 + }, + { + "epoch": 1.42, + "grad_norm": 25.042553651334984, + "learning_rate": 6.341921530438501e-06, + "loss": 0.8593, + "step": 25140 + }, + { + "epoch": 1.42, + "grad_norm": 50.62621748569941, + "learning_rate": 6.340342578558508e-06, + "loss": 0.9097, + "step": 25145 + }, + { + "epoch": 1.42, + "grad_norm": 27.80881585341356, + "learning_rate": 6.338763482652791e-06, + "loss": 0.9368, + "step": 25150 + }, + { + "epoch": 1.42, + "grad_norm": 61.609395311695614, + "learning_rate": 6.3371842428910264e-06, + "loss": 0.8344, + "step": 25155 + }, + { + "epoch": 1.42, + "grad_norm": 19.816544428104855, + "learning_rate": 6.335604859442917e-06, + "loss": 0.8538, + "step": 25160 + }, + { + "epoch": 1.42, + "grad_norm": 19.890584371153086, + "learning_rate": 6.3340253324781686e-06, + "loss": 0.9208, + "step": 25165 + }, + { + "epoch": 1.42, + "grad_norm": 7.0517005574234455, + "learning_rate": 6.332445662166511e-06, + "loss": 0.8755, + "step": 25170 + }, + { + "epoch": 1.42, + "grad_norm": 8.851264428215037, + "learning_rate": 6.330865848677688e-06, + "loss": 0.8904, + "step": 25175 + }, + { + "epoch": 1.42, + "grad_norm": 5.452579228286874, + "learning_rate": 6.3292858921814545e-06, + "loss": 0.8392, + "step": 25180 + }, + { + "epoch": 1.42, + "grad_norm": 12.108128960824086, + "learning_rate": 6.327705792847586e-06, + "loss": 0.8849, + "step": 25185 + }, + { + "epoch": 1.42, + "grad_norm": 6.0810901540929905, + "learning_rate": 6.32612555084587e-06, + "loss": 0.9223, + "step": 25190 + }, + { + "epoch": 1.42, + "grad_norm": 8.215293047065598, + "learning_rate": 6.324545166346111e-06, + "loss": 0.9109, + "step": 25195 + }, + { + "epoch": 1.42, + "grad_norm": 14.738112141873573, + "learning_rate": 6.322964639518129e-06, + "loss": 0.85, + "step": 25200 + }, + { + "epoch": 1.42, + "grad_norm": 11.278897458302731, + "learning_rate": 6.321383970531757e-06, + "loss": 0.9094, + "step": 25205 + }, + { + "epoch": 1.42, + "grad_norm": 6.796606996809201, + "learning_rate": 6.3198031595568456e-06, + "loss": 0.8618, + "step": 25210 + }, + { + "epoch": 1.42, + "grad_norm": 5.086265816029416, + "learning_rate": 6.318222206763261e-06, + "loss": 0.8958, + "step": 25215 + }, + { + "epoch": 1.42, + "grad_norm": 5.688934732495671, + "learning_rate": 6.3166411123208805e-06, + "loss": 0.886, + "step": 25220 + }, + { + "epoch": 1.42, + "grad_norm": 7.798233746830581, + "learning_rate": 6.315059876399603e-06, + "loss": 0.8873, + "step": 25225 + }, + { + "epoch": 1.42, + "grad_norm": 5.26875537498929, + "learning_rate": 6.313478499169338e-06, + "loss": 0.8936, + "step": 25230 + }, + { + "epoch": 1.42, + "grad_norm": 6.271522126023936, + "learning_rate": 6.311896980800012e-06, + "loss": 0.8911, + "step": 25235 + }, + { + "epoch": 1.42, + "grad_norm": 10.11105347007813, + "learning_rate": 6.310315321461564e-06, + "loss": 0.8793, + "step": 25240 + }, + { + "epoch": 1.42, + "grad_norm": 5.537558613075709, + "learning_rate": 6.308733521323953e-06, + "loss": 0.8907, + "step": 25245 + }, + { + "epoch": 1.42, + "grad_norm": 6.966864864219315, + "learning_rate": 6.307151580557148e-06, + "loss": 0.8813, + "step": 25250 + }, + { + "epoch": 1.42, + "grad_norm": 5.0511101707481165, + "learning_rate": 6.305569499331138e-06, + "loss": 0.8534, + "step": 25255 + }, + { + "epoch": 1.42, + "grad_norm": 13.625865022087279, + "learning_rate": 6.303987277815923e-06, + "loss": 0.8478, + "step": 25260 + }, + { + "epoch": 1.42, + "grad_norm": 8.251346315040928, + "learning_rate": 6.302404916181521e-06, + "loss": 0.8918, + "step": 25265 + }, + { + "epoch": 1.42, + "grad_norm": 8.968406670522626, + "learning_rate": 6.300822414597962e-06, + "loss": 0.9049, + "step": 25270 + }, + { + "epoch": 1.42, + "grad_norm": 5.945161424095243, + "learning_rate": 6.299239773235293e-06, + "loss": 0.9005, + "step": 25275 + }, + { + "epoch": 1.42, + "grad_norm": 4.92724483468996, + "learning_rate": 6.297656992263577e-06, + "loss": 0.8903, + "step": 25280 + }, + { + "epoch": 1.42, + "grad_norm": 8.52748808648185, + "learning_rate": 6.29607407185289e-06, + "loss": 0.8991, + "step": 25285 + }, + { + "epoch": 1.42, + "grad_norm": 10.440892827504172, + "learning_rate": 6.294491012173325e-06, + "loss": 0.8745, + "step": 25290 + }, + { + "epoch": 1.43, + "grad_norm": 6.138088117123807, + "learning_rate": 6.2929078133949865e-06, + "loss": 0.8244, + "step": 25295 + }, + { + "epoch": 1.43, + "grad_norm": 11.072088311293955, + "learning_rate": 6.291324475687997e-06, + "loss": 0.8408, + "step": 25300 + }, + { + "epoch": 1.43, + "grad_norm": 5.80878787504207, + "learning_rate": 6.289740999222495e-06, + "loss": 0.8961, + "step": 25305 + }, + { + "epoch": 1.43, + "grad_norm": 13.100257753467226, + "learning_rate": 6.28815738416863e-06, + "loss": 0.8426, + "step": 25310 + }, + { + "epoch": 1.43, + "grad_norm": 7.961299535659332, + "learning_rate": 6.286573630696569e-06, + "loss": 0.8381, + "step": 25315 + }, + { + "epoch": 1.43, + "grad_norm": 4.865925335649451, + "learning_rate": 6.284989738976492e-06, + "loss": 0.8848, + "step": 25320 + }, + { + "epoch": 1.43, + "grad_norm": 22.30800108665002, + "learning_rate": 6.283405709178598e-06, + "loss": 0.8981, + "step": 25325 + }, + { + "epoch": 1.43, + "grad_norm": 5.862355353675248, + "learning_rate": 6.2818215414730965e-06, + "loss": 0.8677, + "step": 25330 + }, + { + "epoch": 1.43, + "grad_norm": 6.434766157425761, + "learning_rate": 6.280237236030213e-06, + "loss": 0.8778, + "step": 25335 + }, + { + "epoch": 1.43, + "grad_norm": 4.9510276868521865, + "learning_rate": 6.278652793020187e-06, + "loss": 0.8422, + "step": 25340 + }, + { + "epoch": 1.43, + "grad_norm": 6.428288294984716, + "learning_rate": 6.277068212613276e-06, + "loss": 0.8578, + "step": 25345 + }, + { + "epoch": 1.43, + "grad_norm": 11.324861746825624, + "learning_rate": 6.27548349497975e-06, + "loss": 0.8391, + "step": 25350 + }, + { + "epoch": 1.43, + "grad_norm": 5.275352353676312, + "learning_rate": 6.273898640289894e-06, + "loss": 0.8343, + "step": 25355 + }, + { + "epoch": 1.43, + "grad_norm": 5.770407383546127, + "learning_rate": 6.272313648714006e-06, + "loss": 0.8451, + "step": 25360 + }, + { + "epoch": 1.43, + "grad_norm": 7.403999619241585, + "learning_rate": 6.270728520422402e-06, + "loss": 0.8631, + "step": 25365 + }, + { + "epoch": 1.43, + "grad_norm": 17.52725588838729, + "learning_rate": 6.26914325558541e-06, + "loss": 0.8681, + "step": 25370 + }, + { + "epoch": 1.43, + "grad_norm": 22.16613945415085, + "learning_rate": 6.2675578543733735e-06, + "loss": 0.8837, + "step": 25375 + }, + { + "epoch": 1.43, + "grad_norm": 11.09014252592451, + "learning_rate": 6.2659723169566515e-06, + "loss": 0.9024, + "step": 25380 + }, + { + "epoch": 1.43, + "grad_norm": 5.956239606675524, + "learning_rate": 6.2643866435056165e-06, + "loss": 0.9503, + "step": 25385 + }, + { + "epoch": 1.43, + "grad_norm": 4.7753885835698435, + "learning_rate": 6.262800834190657e-06, + "loss": 0.8486, + "step": 25390 + }, + { + "epoch": 1.43, + "grad_norm": 5.343432226257339, + "learning_rate": 6.261214889182174e-06, + "loss": 0.8774, + "step": 25395 + }, + { + "epoch": 1.43, + "grad_norm": 12.291199180118161, + "learning_rate": 6.259628808650586e-06, + "loss": 0.9142, + "step": 25400 + }, + { + "epoch": 1.43, + "grad_norm": 5.1642766099559285, + "learning_rate": 6.258042592766325e-06, + "loss": 0.8584, + "step": 25405 + }, + { + "epoch": 1.43, + "grad_norm": 8.878353611957847, + "learning_rate": 6.256456241699833e-06, + "loss": 0.8818, + "step": 25410 + }, + { + "epoch": 1.43, + "grad_norm": 12.138105161693613, + "learning_rate": 6.254869755621573e-06, + "loss": 0.8619, + "step": 25415 + }, + { + "epoch": 1.43, + "grad_norm": 5.734859138661207, + "learning_rate": 6.253283134702019e-06, + "loss": 0.8822, + "step": 25420 + }, + { + "epoch": 1.43, + "grad_norm": 18.454157811035056, + "learning_rate": 6.251696379111662e-06, + "loss": 0.8738, + "step": 25425 + }, + { + "epoch": 1.43, + "grad_norm": 7.731280489534711, + "learning_rate": 6.250109489021004e-06, + "loss": 0.8902, + "step": 25430 + }, + { + "epoch": 1.43, + "grad_norm": 6.863324415171886, + "learning_rate": 6.248522464600565e-06, + "loss": 0.9407, + "step": 25435 + }, + { + "epoch": 1.43, + "grad_norm": 5.242334946554647, + "learning_rate": 6.246935306020877e-06, + "loss": 0.8491, + "step": 25440 + }, + { + "epoch": 1.43, + "grad_norm": 18.69778428285142, + "learning_rate": 6.245348013452487e-06, + "loss": 0.8757, + "step": 25445 + }, + { + "epoch": 1.43, + "grad_norm": 8.971461688685569, + "learning_rate": 6.2437605870659565e-06, + "loss": 0.9059, + "step": 25450 + }, + { + "epoch": 1.43, + "grad_norm": 9.889903360165224, + "learning_rate": 6.242173027031862e-06, + "loss": 0.8762, + "step": 25455 + }, + { + "epoch": 1.43, + "grad_norm": 15.757845037341863, + "learning_rate": 6.240585333520793e-06, + "loss": 0.8612, + "step": 25460 + }, + { + "epoch": 1.43, + "grad_norm": 5.4613410407274925, + "learning_rate": 6.238997506703357e-06, + "loss": 0.8858, + "step": 25465 + }, + { + "epoch": 1.44, + "grad_norm": 6.133444905801013, + "learning_rate": 6.237409546750169e-06, + "loss": 0.8537, + "step": 25470 + }, + { + "epoch": 1.44, + "grad_norm": 25.661710255151302, + "learning_rate": 6.235821453831864e-06, + "loss": 0.8836, + "step": 25475 + }, + { + "epoch": 1.44, + "grad_norm": 10.957654960348783, + "learning_rate": 6.23423322811909e-06, + "loss": 0.895, + "step": 25480 + }, + { + "epoch": 1.44, + "grad_norm": 5.80550971294075, + "learning_rate": 6.232644869782508e-06, + "loss": 0.86, + "step": 25485 + }, + { + "epoch": 1.44, + "grad_norm": 6.606331000684494, + "learning_rate": 6.231056378992797e-06, + "loss": 0.9075, + "step": 25490 + }, + { + "epoch": 1.44, + "grad_norm": 8.300816701891632, + "learning_rate": 6.229467755920643e-06, + "loss": 0.8679, + "step": 25495 + }, + { + "epoch": 1.44, + "grad_norm": 6.333988336117034, + "learning_rate": 6.227879000736753e-06, + "loss": 0.9036, + "step": 25500 + }, + { + "epoch": 1.44, + "grad_norm": 5.192784693661378, + "learning_rate": 6.226290113611845e-06, + "loss": 0.9143, + "step": 25505 + }, + { + "epoch": 1.44, + "grad_norm": 5.473003754287832, + "learning_rate": 6.224701094716652e-06, + "loss": 0.8714, + "step": 25510 + }, + { + "epoch": 1.44, + "grad_norm": 7.326087216702297, + "learning_rate": 6.2231119442219225e-06, + "loss": 0.8601, + "step": 25515 + }, + { + "epoch": 1.44, + "grad_norm": 10.333098860162012, + "learning_rate": 6.2215226622984155e-06, + "loss": 0.8855, + "step": 25520 + }, + { + "epoch": 1.44, + "grad_norm": 11.376095325289636, + "learning_rate": 6.219933249116908e-06, + "loss": 0.8789, + "step": 25525 + }, + { + "epoch": 1.44, + "grad_norm": 7.533668475310641, + "learning_rate": 6.218343704848189e-06, + "loss": 0.9266, + "step": 25530 + }, + { + "epoch": 1.44, + "grad_norm": 5.932136890663288, + "learning_rate": 6.216754029663061e-06, + "loss": 0.8689, + "step": 25535 + }, + { + "epoch": 1.44, + "grad_norm": 5.184289943538942, + "learning_rate": 6.2151642237323436e-06, + "loss": 0.9176, + "step": 25540 + }, + { + "epoch": 1.44, + "grad_norm": 5.054911218312706, + "learning_rate": 6.2135742872268665e-06, + "loss": 0.8163, + "step": 25545 + }, + { + "epoch": 1.44, + "grad_norm": 5.531494243211934, + "learning_rate": 6.211984220317479e-06, + "loss": 0.8368, + "step": 25550 + }, + { + "epoch": 1.44, + "grad_norm": 6.439367060763998, + "learning_rate": 6.210394023175036e-06, + "loss": 0.895, + "step": 25555 + }, + { + "epoch": 1.44, + "grad_norm": 7.1587734485685965, + "learning_rate": 6.208803695970414e-06, + "loss": 0.8906, + "step": 25560 + }, + { + "epoch": 1.44, + "grad_norm": 5.0237307675450555, + "learning_rate": 6.207213238874498e-06, + "loss": 0.8784, + "step": 25565 + }, + { + "epoch": 1.44, + "grad_norm": 11.224329296527005, + "learning_rate": 6.205622652058192e-06, + "loss": 0.8613, + "step": 25570 + }, + { + "epoch": 1.44, + "grad_norm": 15.720986066416545, + "learning_rate": 6.2040319356924136e-06, + "loss": 0.856, + "step": 25575 + }, + { + "epoch": 1.44, + "grad_norm": 8.276578756297132, + "learning_rate": 6.202441089948088e-06, + "loss": 0.8646, + "step": 25580 + }, + { + "epoch": 1.44, + "grad_norm": 12.281021723958332, + "learning_rate": 6.200850114996162e-06, + "loss": 0.8785, + "step": 25585 + }, + { + "epoch": 1.44, + "grad_norm": 6.392830282437864, + "learning_rate": 6.19925901100759e-06, + "loss": 0.8752, + "step": 25590 + }, + { + "epoch": 1.44, + "grad_norm": 6.762018802190973, + "learning_rate": 6.197667778153345e-06, + "loss": 0.8433, + "step": 25595 + }, + { + "epoch": 1.44, + "grad_norm": 7.028238494354232, + "learning_rate": 6.196076416604412e-06, + "loss": 0.8203, + "step": 25600 + }, + { + "epoch": 1.44, + "grad_norm": 4.853377348294338, + "learning_rate": 6.19448492653179e-06, + "loss": 0.8125, + "step": 25605 + }, + { + "epoch": 1.44, + "grad_norm": 6.415364521392845, + "learning_rate": 6.192893308106488e-06, + "loss": 0.8794, + "step": 25610 + }, + { + "epoch": 1.44, + "grad_norm": 5.918289053201691, + "learning_rate": 6.191301561499537e-06, + "loss": 0.9117, + "step": 25615 + }, + { + "epoch": 1.44, + "grad_norm": 17.442732371636694, + "learning_rate": 6.1897096868819764e-06, + "loss": 0.9287, + "step": 25620 + }, + { + "epoch": 1.44, + "grad_norm": 7.7851664084420396, + "learning_rate": 6.188117684424859e-06, + "loss": 0.847, + "step": 25625 + }, + { + "epoch": 1.44, + "grad_norm": 6.601353429277399, + "learning_rate": 6.186525554299252e-06, + "loss": 0.8426, + "step": 25630 + }, + { + "epoch": 1.44, + "grad_norm": 4.7269618683439685, + "learning_rate": 6.18493329667624e-06, + "loss": 0.8715, + "step": 25635 + }, + { + "epoch": 1.44, + "grad_norm": 5.231706552876182, + "learning_rate": 6.183340911726914e-06, + "loss": 0.8436, + "step": 25640 + }, + { + "epoch": 1.44, + "grad_norm": 5.4517877216033455, + "learning_rate": 6.181748399622385e-06, + "loss": 0.9128, + "step": 25645 + }, + { + "epoch": 1.45, + "grad_norm": 4.578331933065757, + "learning_rate": 6.180155760533773e-06, + "loss": 0.814, + "step": 25650 + }, + { + "epoch": 1.45, + "grad_norm": 19.07324712853925, + "learning_rate": 6.178562994632219e-06, + "loss": 0.8264, + "step": 25655 + }, + { + "epoch": 1.45, + "grad_norm": 17.345564566748667, + "learning_rate": 6.176970102088869e-06, + "loss": 0.8393, + "step": 25660 + }, + { + "epoch": 1.45, + "grad_norm": 7.14160681854394, + "learning_rate": 6.175377083074886e-06, + "loss": 0.9265, + "step": 25665 + }, + { + "epoch": 1.45, + "grad_norm": 19.381696915187014, + "learning_rate": 6.1737839377614485e-06, + "loss": 0.8998, + "step": 25670 + }, + { + "epoch": 1.45, + "grad_norm": 5.339583608213842, + "learning_rate": 6.172190666319747e-06, + "loss": 0.8295, + "step": 25675 + }, + { + "epoch": 1.45, + "grad_norm": 5.249527397611181, + "learning_rate": 6.170597268920986e-06, + "loss": 0.8858, + "step": 25680 + }, + { + "epoch": 1.45, + "grad_norm": 7.136525711987327, + "learning_rate": 6.169003745736381e-06, + "loss": 0.8748, + "step": 25685 + }, + { + "epoch": 1.45, + "grad_norm": 4.6186409880445645, + "learning_rate": 6.1674100969371655e-06, + "loss": 0.8481, + "step": 25690 + }, + { + "epoch": 1.45, + "grad_norm": 5.968110246671394, + "learning_rate": 6.165816322694582e-06, + "loss": 0.8588, + "step": 25695 + }, + { + "epoch": 1.45, + "grad_norm": 4.981580135910331, + "learning_rate": 6.1642224231798885e-06, + "loss": 0.8405, + "step": 25700 + }, + { + "epoch": 1.45, + "grad_norm": 6.7395863009387424, + "learning_rate": 6.162628398564358e-06, + "loss": 0.9008, + "step": 25705 + }, + { + "epoch": 1.45, + "grad_norm": 13.99837936229072, + "learning_rate": 6.161034249019275e-06, + "loss": 0.8874, + "step": 25710 + }, + { + "epoch": 1.45, + "grad_norm": 10.393926376479731, + "learning_rate": 6.1594399747159396e-06, + "loss": 0.8948, + "step": 25715 + }, + { + "epoch": 1.45, + "grad_norm": 20.96398760200713, + "learning_rate": 6.15784557582566e-06, + "loss": 0.8327, + "step": 25720 + }, + { + "epoch": 1.45, + "grad_norm": 5.733698169791028, + "learning_rate": 6.1562510525197645e-06, + "loss": 0.8984, + "step": 25725 + }, + { + "epoch": 1.45, + "grad_norm": 6.229570441270813, + "learning_rate": 6.15465640496959e-06, + "loss": 0.8901, + "step": 25730 + }, + { + "epoch": 1.45, + "grad_norm": 9.39025469930247, + "learning_rate": 6.1530616333464895e-06, + "loss": 0.9036, + "step": 25735 + }, + { + "epoch": 1.45, + "grad_norm": 5.330411512117964, + "learning_rate": 6.151466737821827e-06, + "loss": 0.8556, + "step": 25740 + }, + { + "epoch": 1.45, + "grad_norm": 6.757175274270205, + "learning_rate": 6.149871718566982e-06, + "loss": 0.9078, + "step": 25745 + }, + { + "epoch": 1.45, + "grad_norm": 5.541428998012914, + "learning_rate": 6.1482765757533455e-06, + "loss": 0.8349, + "step": 25750 + }, + { + "epoch": 1.45, + "grad_norm": 5.20281935323804, + "learning_rate": 6.146681309552324e-06, + "loss": 0.8623, + "step": 25755 + }, + { + "epoch": 1.45, + "grad_norm": 5.684748046413062, + "learning_rate": 6.145085920135334e-06, + "loss": 0.8665, + "step": 25760 + }, + { + "epoch": 1.45, + "grad_norm": 4.782285328980653, + "learning_rate": 6.143490407673809e-06, + "loss": 0.8454, + "step": 25765 + }, + { + "epoch": 1.45, + "grad_norm": 8.793694387236938, + "learning_rate": 6.141894772339193e-06, + "loss": 0.8941, + "step": 25770 + }, + { + "epoch": 1.45, + "grad_norm": 8.392990744236515, + "learning_rate": 6.140299014302945e-06, + "loss": 0.8868, + "step": 25775 + }, + { + "epoch": 1.45, + "grad_norm": 5.538279445165557, + "learning_rate": 6.1387031337365346e-06, + "loss": 0.9097, + "step": 25780 + }, + { + "epoch": 1.45, + "grad_norm": 37.88139639419802, + "learning_rate": 6.137107130811447e-06, + "loss": 0.8684, + "step": 25785 + }, + { + "epoch": 1.45, + "grad_norm": 6.763698477540671, + "learning_rate": 6.13551100569918e-06, + "loss": 0.8093, + "step": 25790 + }, + { + "epoch": 1.45, + "grad_norm": 10.76037090425412, + "learning_rate": 6.1339147585712435e-06, + "loss": 0.878, + "step": 25795 + }, + { + "epoch": 1.45, + "grad_norm": 11.012779298640561, + "learning_rate": 6.132318389599161e-06, + "loss": 0.8586, + "step": 25800 + }, + { + "epoch": 1.45, + "grad_norm": 8.446305715107773, + "learning_rate": 6.130721898954472e-06, + "loss": 0.8847, + "step": 25805 + }, + { + "epoch": 1.45, + "grad_norm": 15.20015550247254, + "learning_rate": 6.129125286808723e-06, + "loss": 0.9221, + "step": 25810 + }, + { + "epoch": 1.45, + "grad_norm": 7.865817013173404, + "learning_rate": 6.12752855333348e-06, + "loss": 0.8899, + "step": 25815 + }, + { + "epoch": 1.45, + "grad_norm": 6.086374757712554, + "learning_rate": 6.125931698700318e-06, + "loss": 0.8559, + "step": 25820 + }, + { + "epoch": 1.46, + "grad_norm": 7.124075536029831, + "learning_rate": 6.124334723080826e-06, + "loss": 0.8383, + "step": 25825 + }, + { + "epoch": 1.46, + "grad_norm": 5.621989313643415, + "learning_rate": 6.122737626646605e-06, + "loss": 0.826, + "step": 25830 + }, + { + "epoch": 1.46, + "grad_norm": 9.507963313798742, + "learning_rate": 6.121140409569273e-06, + "loss": 0.8725, + "step": 25835 + }, + { + "epoch": 1.46, + "grad_norm": 5.085110510275198, + "learning_rate": 6.119543072020453e-06, + "loss": 0.8643, + "step": 25840 + }, + { + "epoch": 1.46, + "grad_norm": 7.8194383645461665, + "learning_rate": 6.117945614171789e-06, + "loss": 0.9087, + "step": 25845 + }, + { + "epoch": 1.46, + "grad_norm": 5.125330358039872, + "learning_rate": 6.1163480361949366e-06, + "loss": 0.8626, + "step": 25850 + }, + { + "epoch": 1.46, + "grad_norm": 9.824130877205306, + "learning_rate": 6.114750338261562e-06, + "loss": 0.8789, + "step": 25855 + }, + { + "epoch": 1.46, + "grad_norm": 6.466364782295777, + "learning_rate": 6.113152520543342e-06, + "loss": 0.8464, + "step": 25860 + }, + { + "epoch": 1.46, + "grad_norm": 27.772788330067467, + "learning_rate": 6.111554583211972e-06, + "loss": 0.8352, + "step": 25865 + }, + { + "epoch": 1.46, + "grad_norm": 5.724817065075063, + "learning_rate": 6.109956526439154e-06, + "loss": 0.894, + "step": 25870 + }, + { + "epoch": 1.46, + "grad_norm": 8.741257206597169, + "learning_rate": 6.108358350396612e-06, + "loss": 0.8662, + "step": 25875 + }, + { + "epoch": 1.46, + "grad_norm": 4.590538278157433, + "learning_rate": 6.106760055256071e-06, + "loss": 0.848, + "step": 25880 + }, + { + "epoch": 1.46, + "grad_norm": 6.634961584039799, + "learning_rate": 6.1051616411892785e-06, + "loss": 0.8735, + "step": 25885 + }, + { + "epoch": 1.46, + "grad_norm": 16.136378425841045, + "learning_rate": 6.10356310836799e-06, + "loss": 0.8728, + "step": 25890 + }, + { + "epoch": 1.46, + "grad_norm": 7.0965211451313905, + "learning_rate": 6.101964456963976e-06, + "loss": 0.9167, + "step": 25895 + }, + { + "epoch": 1.46, + "grad_norm": 19.23678514989524, + "learning_rate": 6.100365687149017e-06, + "loss": 0.8554, + "step": 25900 + }, + { + "epoch": 1.46, + "grad_norm": 18.77477074084548, + "learning_rate": 6.098766799094909e-06, + "loss": 0.8947, + "step": 25905 + }, + { + "epoch": 1.46, + "grad_norm": 6.899352467156238, + "learning_rate": 6.097167792973458e-06, + "loss": 0.8689, + "step": 25910 + }, + { + "epoch": 1.46, + "grad_norm": 17.032953600140125, + "learning_rate": 6.0955686689564865e-06, + "loss": 0.8986, + "step": 25915 + }, + { + "epoch": 1.46, + "grad_norm": 8.290130421515645, + "learning_rate": 6.093969427215827e-06, + "loss": 0.8044, + "step": 25920 + }, + { + "epoch": 1.46, + "grad_norm": 53.66729131657105, + "learning_rate": 6.092370067923323e-06, + "loss": 0.8699, + "step": 25925 + }, + { + "epoch": 1.46, + "grad_norm": 32.50415566669549, + "learning_rate": 6.090770591250835e-06, + "loss": 0.8465, + "step": 25930 + }, + { + "epoch": 1.46, + "grad_norm": 14.308652352161277, + "learning_rate": 6.089170997370233e-06, + "loss": 0.8476, + "step": 25935 + }, + { + "epoch": 1.46, + "grad_norm": 15.15751096832169, + "learning_rate": 6.0875712864534e-06, + "loss": 0.8467, + "step": 25940 + }, + { + "epoch": 1.46, + "grad_norm": 7.654747038364158, + "learning_rate": 6.085971458672233e-06, + "loss": 0.8663, + "step": 25945 + }, + { + "epoch": 1.46, + "grad_norm": 20.02574199828502, + "learning_rate": 6.084371514198641e-06, + "loss": 0.9138, + "step": 25950 + }, + { + "epoch": 1.46, + "grad_norm": 22.236670726774232, + "learning_rate": 6.0827714532045425e-06, + "loss": 0.8634, + "step": 25955 + }, + { + "epoch": 1.46, + "grad_norm": 29.715118450946402, + "learning_rate": 6.081171275861873e-06, + "loss": 0.8551, + "step": 25960 + }, + { + "epoch": 1.46, + "grad_norm": 5.779398096974892, + "learning_rate": 6.0795709823425784e-06, + "loss": 0.8719, + "step": 25965 + }, + { + "epoch": 1.46, + "grad_norm": 20.369736599047194, + "learning_rate": 6.077970572818618e-06, + "loss": 0.9093, + "step": 25970 + }, + { + "epoch": 1.46, + "grad_norm": 8.121210471723412, + "learning_rate": 6.076370047461961e-06, + "loss": 0.8434, + "step": 25975 + }, + { + "epoch": 1.46, + "grad_norm": 9.907771106995737, + "learning_rate": 6.074769406444592e-06, + "loss": 0.8584, + "step": 25980 + }, + { + "epoch": 1.46, + "grad_norm": 13.36401161040538, + "learning_rate": 6.073168649938508e-06, + "loss": 0.8572, + "step": 25985 + }, + { + "epoch": 1.46, + "grad_norm": 9.44906106688398, + "learning_rate": 6.071567778115717e-06, + "loss": 0.8313, + "step": 25990 + }, + { + "epoch": 1.46, + "grad_norm": 7.738736637561908, + "learning_rate": 6.069966791148237e-06, + "loss": 0.8937, + "step": 25995 + }, + { + "epoch": 1.46, + "grad_norm": 24.246733754892638, + "learning_rate": 6.068365689208105e-06, + "loss": 0.9566, + "step": 26000 + }, + { + "epoch": 1.47, + "grad_norm": 7.41108631179515, + "learning_rate": 6.066764472467364e-06, + "loss": 0.8699, + "step": 26005 + }, + { + "epoch": 1.47, + "grad_norm": 9.306770724715992, + "learning_rate": 6.065163141098072e-06, + "loss": 0.8423, + "step": 26010 + }, + { + "epoch": 1.47, + "grad_norm": 9.272191908306718, + "learning_rate": 6.063561695272299e-06, + "loss": 0.8599, + "step": 26015 + }, + { + "epoch": 1.47, + "grad_norm": 5.055455827171254, + "learning_rate": 6.061960135162129e-06, + "loss": 0.8612, + "step": 26020 + }, + { + "epoch": 1.47, + "grad_norm": 6.359373771983753, + "learning_rate": 6.060358460939655e-06, + "loss": 0.8299, + "step": 26025 + }, + { + "epoch": 1.47, + "grad_norm": 5.059908628353393, + "learning_rate": 6.058756672776984e-06, + "loss": 0.8333, + "step": 26030 + }, + { + "epoch": 1.47, + "grad_norm": 13.1028732743827, + "learning_rate": 6.057154770846239e-06, + "loss": 0.8876, + "step": 26035 + }, + { + "epoch": 1.47, + "grad_norm": 6.869030784604681, + "learning_rate": 6.055552755319545e-06, + "loss": 0.8168, + "step": 26040 + }, + { + "epoch": 1.47, + "grad_norm": 6.2862331595314, + "learning_rate": 6.053950626369051e-06, + "loss": 0.8742, + "step": 26045 + }, + { + "epoch": 1.47, + "grad_norm": 5.788579628932328, + "learning_rate": 6.0523483841669085e-06, + "loss": 0.8712, + "step": 26050 + }, + { + "epoch": 1.47, + "grad_norm": 8.030578341239467, + "learning_rate": 6.050746028885289e-06, + "loss": 0.9296, + "step": 26055 + }, + { + "epoch": 1.47, + "grad_norm": 5.468570397588891, + "learning_rate": 6.049143560696374e-06, + "loss": 0.8109, + "step": 26060 + }, + { + "epoch": 1.47, + "grad_norm": 5.8571517956563435, + "learning_rate": 6.04754097977235e-06, + "loss": 0.8524, + "step": 26065 + }, + { + "epoch": 1.47, + "grad_norm": 7.333615100612435, + "learning_rate": 6.045938286285424e-06, + "loss": 0.9171, + "step": 26070 + }, + { + "epoch": 1.47, + "grad_norm": 6.157846780627792, + "learning_rate": 6.044335480407815e-06, + "loss": 0.8542, + "step": 26075 + }, + { + "epoch": 1.47, + "grad_norm": 9.195776752013943, + "learning_rate": 6.042732562311749e-06, + "loss": 0.8581, + "step": 26080 + }, + { + "epoch": 1.47, + "grad_norm": 7.325092536158999, + "learning_rate": 6.041129532169466e-06, + "loss": 0.8757, + "step": 26085 + }, + { + "epoch": 1.47, + "grad_norm": 8.254949243153465, + "learning_rate": 6.039526390153222e-06, + "loss": 0.8842, + "step": 26090 + }, + { + "epoch": 1.47, + "grad_norm": 8.11847454594186, + "learning_rate": 6.037923136435279e-06, + "loss": 0.8913, + "step": 26095 + }, + { + "epoch": 1.47, + "grad_norm": 18.89783607734687, + "learning_rate": 6.036319771187914e-06, + "loss": 0.8695, + "step": 26100 + }, + { + "epoch": 1.47, + "grad_norm": 19.36171291397209, + "learning_rate": 6.034716294583416e-06, + "loss": 0.916, + "step": 26105 + }, + { + "epoch": 1.47, + "grad_norm": 5.259638650383265, + "learning_rate": 6.033112706794085e-06, + "loss": 0.8317, + "step": 26110 + }, + { + "epoch": 1.47, + "grad_norm": 18.1173258846941, + "learning_rate": 6.031509007992235e-06, + "loss": 0.841, + "step": 26115 + }, + { + "epoch": 1.47, + "grad_norm": 7.863621515780087, + "learning_rate": 6.029905198350189e-06, + "loss": 0.8602, + "step": 26120 + }, + { + "epoch": 1.47, + "grad_norm": 7.7387268398581375, + "learning_rate": 6.028301278040285e-06, + "loss": 0.8569, + "step": 26125 + }, + { + "epoch": 1.47, + "grad_norm": 7.180831099641892, + "learning_rate": 6.02669724723487e-06, + "loss": 0.8721, + "step": 26130 + }, + { + "epoch": 1.47, + "grad_norm": 6.1811762275798, + "learning_rate": 6.025093106106305e-06, + "loss": 0.8745, + "step": 26135 + }, + { + "epoch": 1.47, + "grad_norm": 5.04758399463794, + "learning_rate": 6.0234888548269596e-06, + "loss": 0.8167, + "step": 26140 + }, + { + "epoch": 1.47, + "grad_norm": 7.843301870825521, + "learning_rate": 6.0218844935692225e-06, + "loss": 0.8116, + "step": 26145 + }, + { + "epoch": 1.47, + "grad_norm": 5.742408571969738, + "learning_rate": 6.020280022505487e-06, + "loss": 0.8639, + "step": 26150 + }, + { + "epoch": 1.47, + "grad_norm": 17.883430245060165, + "learning_rate": 6.018675441808158e-06, + "loss": 0.8507, + "step": 26155 + }, + { + "epoch": 1.47, + "grad_norm": 9.442113503186823, + "learning_rate": 6.017070751649657e-06, + "loss": 0.8663, + "step": 26160 + }, + { + "epoch": 1.47, + "grad_norm": 9.87351851293303, + "learning_rate": 6.015465952202415e-06, + "loss": 0.8642, + "step": 26165 + }, + { + "epoch": 1.47, + "grad_norm": 5.980669194673036, + "learning_rate": 6.013861043638877e-06, + "loss": 0.8512, + "step": 26170 + }, + { + "epoch": 1.47, + "grad_norm": 10.7478501276795, + "learning_rate": 6.012256026131495e-06, + "loss": 0.7938, + "step": 26175 + }, + { + "epoch": 1.48, + "grad_norm": 5.796242570720157, + "learning_rate": 6.010650899852733e-06, + "loss": 0.8971, + "step": 26180 + }, + { + "epoch": 1.48, + "grad_norm": 4.763599465635087, + "learning_rate": 6.009045664975074e-06, + "loss": 0.8306, + "step": 26185 + }, + { + "epoch": 1.48, + "grad_norm": 6.694555344096171, + "learning_rate": 6.007440321671005e-06, + "loss": 0.8465, + "step": 26190 + }, + { + "epoch": 1.48, + "grad_norm": 8.829992844317758, + "learning_rate": 6.005834870113025e-06, + "loss": 0.8198, + "step": 26195 + }, + { + "epoch": 1.48, + "grad_norm": 6.949152588181757, + "learning_rate": 6.0042293104736505e-06, + "loss": 0.8348, + "step": 26200 + }, + { + "epoch": 1.48, + "grad_norm": 8.863277357566641, + "learning_rate": 6.002623642925405e-06, + "loss": 0.8549, + "step": 26205 + }, + { + "epoch": 1.48, + "grad_norm": 5.242921882279838, + "learning_rate": 6.001017867640824e-06, + "loss": 0.8744, + "step": 26210 + }, + { + "epoch": 1.48, + "grad_norm": 11.58829460970268, + "learning_rate": 5.999411984792455e-06, + "loss": 0.8297, + "step": 26215 + }, + { + "epoch": 1.48, + "grad_norm": 15.809968365257946, + "learning_rate": 5.9978059945528575e-06, + "loss": 0.8587, + "step": 26220 + }, + { + "epoch": 1.48, + "grad_norm": 14.15059994130675, + "learning_rate": 5.996199897094602e-06, + "loss": 0.876, + "step": 26225 + }, + { + "epoch": 1.48, + "grad_norm": 9.983579632267533, + "learning_rate": 5.994593692590272e-06, + "loss": 0.8651, + "step": 26230 + }, + { + "epoch": 1.48, + "grad_norm": 23.86432844905842, + "learning_rate": 5.9929873812124595e-06, + "loss": 0.8433, + "step": 26235 + }, + { + "epoch": 1.48, + "grad_norm": 20.35663173936695, + "learning_rate": 5.991380963133771e-06, + "loss": 0.8552, + "step": 26240 + }, + { + "epoch": 1.48, + "grad_norm": 15.216820046992261, + "learning_rate": 5.989774438526822e-06, + "loss": 0.8551, + "step": 26245 + }, + { + "epoch": 1.48, + "grad_norm": 6.447418647454864, + "learning_rate": 5.988167807564241e-06, + "loss": 0.8571, + "step": 26250 + }, + { + "epoch": 1.48, + "grad_norm": 7.416570965490965, + "learning_rate": 5.98656107041867e-06, + "loss": 0.8286, + "step": 26255 + }, + { + "epoch": 1.48, + "grad_norm": 10.353861550378891, + "learning_rate": 5.984954227262756e-06, + "loss": 0.8744, + "step": 26260 + }, + { + "epoch": 1.48, + "grad_norm": 6.267075615125807, + "learning_rate": 5.983347278269164e-06, + "loss": 0.859, + "step": 26265 + }, + { + "epoch": 1.48, + "grad_norm": 17.84684410680608, + "learning_rate": 5.981740223610566e-06, + "loss": 0.8676, + "step": 26270 + }, + { + "epoch": 1.48, + "grad_norm": 5.334338786403093, + "learning_rate": 5.98013306345965e-06, + "loss": 0.8853, + "step": 26275 + }, + { + "epoch": 1.48, + "grad_norm": 11.6244967967702, + "learning_rate": 5.97852579798911e-06, + "loss": 0.8575, + "step": 26280 + }, + { + "epoch": 1.48, + "grad_norm": 11.397749346676092, + "learning_rate": 5.976918427371654e-06, + "loss": 0.843, + "step": 26285 + }, + { + "epoch": 1.48, + "grad_norm": 4.880693302654921, + "learning_rate": 5.975310951780002e-06, + "loss": 0.8969, + "step": 26290 + }, + { + "epoch": 1.48, + "grad_norm": 15.217466712483155, + "learning_rate": 5.9737033713868846e-06, + "loss": 0.8426, + "step": 26295 + }, + { + "epoch": 1.48, + "grad_norm": 5.225582960100946, + "learning_rate": 5.9720956863650405e-06, + "loss": 0.8841, + "step": 26300 + }, + { + "epoch": 1.48, + "grad_norm": 4.776909165116245, + "learning_rate": 5.970487896887226e-06, + "loss": 0.8247, + "step": 26305 + }, + { + "epoch": 1.48, + "grad_norm": 5.292319378014117, + "learning_rate": 5.968880003126203e-06, + "loss": 0.8048, + "step": 26310 + }, + { + "epoch": 1.48, + "grad_norm": 6.6203860612520415, + "learning_rate": 5.96727200525475e-06, + "loss": 0.8799, + "step": 26315 + }, + { + "epoch": 1.48, + "grad_norm": 4.8572292503413985, + "learning_rate": 5.96566390344565e-06, + "loss": 0.8938, + "step": 26320 + }, + { + "epoch": 1.48, + "grad_norm": 6.877072267245683, + "learning_rate": 5.964055697871701e-06, + "loss": 0.8483, + "step": 26325 + }, + { + "epoch": 1.48, + "grad_norm": 4.612855111178123, + "learning_rate": 5.962447388705713e-06, + "loss": 0.8452, + "step": 26330 + }, + { + "epoch": 1.48, + "grad_norm": 5.170553941416647, + "learning_rate": 5.960838976120505e-06, + "loss": 0.8697, + "step": 26335 + }, + { + "epoch": 1.48, + "grad_norm": 5.846951533533244, + "learning_rate": 5.95923046028891e-06, + "loss": 0.8521, + "step": 26340 + }, + { + "epoch": 1.48, + "grad_norm": 6.58267276833323, + "learning_rate": 5.957621841383767e-06, + "loss": 0.9076, + "step": 26345 + }, + { + "epoch": 1.48, + "grad_norm": 11.529739733706275, + "learning_rate": 5.9560131195779315e-06, + "loss": 0.8344, + "step": 26350 + }, + { + "epoch": 1.48, + "grad_norm": 5.242639917064167, + "learning_rate": 5.954404295044267e-06, + "loss": 0.8724, + "step": 26355 + }, + { + "epoch": 1.49, + "grad_norm": 4.338437814497742, + "learning_rate": 5.952795367955649e-06, + "loss": 0.822, + "step": 26360 + }, + { + "epoch": 1.49, + "grad_norm": 6.543310735724762, + "learning_rate": 5.951186338484964e-06, + "loss": 0.8386, + "step": 26365 + }, + { + "epoch": 1.49, + "grad_norm": 11.260121446580111, + "learning_rate": 5.9495772068051085e-06, + "loss": 0.9259, + "step": 26370 + }, + { + "epoch": 1.49, + "grad_norm": 8.784040658635956, + "learning_rate": 5.947967973088992e-06, + "loss": 0.8829, + "step": 26375 + }, + { + "epoch": 1.49, + "grad_norm": 9.65086281036451, + "learning_rate": 5.946358637509533e-06, + "loss": 0.882, + "step": 26380 + }, + { + "epoch": 1.49, + "grad_norm": 8.808652705047955, + "learning_rate": 5.944749200239661e-06, + "loss": 0.8612, + "step": 26385 + }, + { + "epoch": 1.49, + "grad_norm": 15.622312197747917, + "learning_rate": 5.943139661452319e-06, + "loss": 0.8575, + "step": 26390 + }, + { + "epoch": 1.49, + "grad_norm": 10.163361460036183, + "learning_rate": 5.941530021320456e-06, + "loss": 0.8847, + "step": 26395 + }, + { + "epoch": 1.49, + "grad_norm": 14.766205106282325, + "learning_rate": 5.939920280017038e-06, + "loss": 0.8476, + "step": 26400 + }, + { + "epoch": 1.49, + "grad_norm": 5.365436882565832, + "learning_rate": 5.938310437715038e-06, + "loss": 0.868, + "step": 26405 + }, + { + "epoch": 1.49, + "grad_norm": 8.429499550649396, + "learning_rate": 5.936700494587441e-06, + "loss": 0.8616, + "step": 26410 + }, + { + "epoch": 1.49, + "grad_norm": 8.52896133360857, + "learning_rate": 5.935090450807242e-06, + "loss": 0.8435, + "step": 26415 + }, + { + "epoch": 1.49, + "grad_norm": 7.2322821234426815, + "learning_rate": 5.933480306547445e-06, + "loss": 0.8653, + "step": 26420 + }, + { + "epoch": 1.49, + "grad_norm": 8.154628254643407, + "learning_rate": 5.931870061981071e-06, + "loss": 0.8732, + "step": 26425 + }, + { + "epoch": 1.49, + "grad_norm": 5.227823427154511, + "learning_rate": 5.930259717281146e-06, + "loss": 0.8819, + "step": 26430 + }, + { + "epoch": 1.49, + "grad_norm": 6.9315025295618895, + "learning_rate": 5.928649272620709e-06, + "loss": 0.8692, + "step": 26435 + }, + { + "epoch": 1.49, + "grad_norm": 5.54624631506354, + "learning_rate": 5.927038728172809e-06, + "loss": 0.8403, + "step": 26440 + }, + { + "epoch": 1.49, + "grad_norm": 8.683362399276405, + "learning_rate": 5.925428084110506e-06, + "loss": 0.8477, + "step": 26445 + }, + { + "epoch": 1.49, + "grad_norm": 6.459554397267144, + "learning_rate": 5.923817340606871e-06, + "loss": 0.8812, + "step": 26450 + }, + { + "epoch": 1.49, + "grad_norm": 4.7061135470042155, + "learning_rate": 5.922206497834986e-06, + "loss": 0.854, + "step": 26455 + }, + { + "epoch": 1.49, + "grad_norm": 5.2193204632723065, + "learning_rate": 5.920595555967944e-06, + "loss": 0.8174, + "step": 26460 + }, + { + "epoch": 1.49, + "grad_norm": 7.599737835293299, + "learning_rate": 5.918984515178845e-06, + "loss": 0.8598, + "step": 26465 + }, + { + "epoch": 1.49, + "grad_norm": 9.635118514467898, + "learning_rate": 5.917373375640805e-06, + "loss": 0.8705, + "step": 26470 + }, + { + "epoch": 1.49, + "grad_norm": 10.687349928354102, + "learning_rate": 5.915762137526947e-06, + "loss": 0.8505, + "step": 26475 + }, + { + "epoch": 1.49, + "grad_norm": 4.885049284565121, + "learning_rate": 5.914150801010406e-06, + "loss": 0.9069, + "step": 26480 + }, + { + "epoch": 1.49, + "grad_norm": 8.873364750571962, + "learning_rate": 5.912539366264325e-06, + "loss": 0.9065, + "step": 26485 + }, + { + "epoch": 1.49, + "grad_norm": 11.753382422178554, + "learning_rate": 5.9109278334618634e-06, + "loss": 0.8216, + "step": 26490 + }, + { + "epoch": 1.49, + "grad_norm": 4.780003970057351, + "learning_rate": 5.909316202776185e-06, + "loss": 0.8263, + "step": 26495 + }, + { + "epoch": 1.49, + "grad_norm": 9.34209097689926, + "learning_rate": 5.907704474380469e-06, + "loss": 0.8689, + "step": 26500 + }, + { + "epoch": 1.49, + "grad_norm": 9.97635300998426, + "learning_rate": 5.9060926484479e-06, + "loss": 0.8586, + "step": 26505 + }, + { + "epoch": 1.49, + "grad_norm": 6.604465766189382, + "learning_rate": 5.904480725151677e-06, + "loss": 0.8824, + "step": 26510 + }, + { + "epoch": 1.49, + "grad_norm": 11.891633120947663, + "learning_rate": 5.902868704665008e-06, + "loss": 0.8252, + "step": 26515 + }, + { + "epoch": 1.49, + "grad_norm": 6.571455838359508, + "learning_rate": 5.901256587161114e-06, + "loss": 0.8808, + "step": 26520 + }, + { + "epoch": 1.49, + "grad_norm": 9.274679742798048, + "learning_rate": 5.899644372813221e-06, + "loss": 0.8805, + "step": 26525 + }, + { + "epoch": 1.49, + "grad_norm": 16.93466942095143, + "learning_rate": 5.898032061794569e-06, + "loss": 0.8652, + "step": 26530 + }, + { + "epoch": 1.5, + "grad_norm": 5.48594329453608, + "learning_rate": 5.896419654278409e-06, + "loss": 0.9081, + "step": 26535 + }, + { + "epoch": 1.5, + "grad_norm": 5.709402291802655, + "learning_rate": 5.894807150438e-06, + "loss": 0.8297, + "step": 26540 + }, + { + "epoch": 1.5, + "grad_norm": 11.842407234736854, + "learning_rate": 5.893194550446616e-06, + "loss": 0.8525, + "step": 26545 + }, + { + "epoch": 1.5, + "grad_norm": 8.981793728229583, + "learning_rate": 5.891581854477535e-06, + "loss": 0.8435, + "step": 26550 + }, + { + "epoch": 1.5, + "grad_norm": 15.866464491317057, + "learning_rate": 5.889969062704047e-06, + "loss": 0.8778, + "step": 26555 + }, + { + "epoch": 1.5, + "grad_norm": 8.931442225747622, + "learning_rate": 5.888356175299457e-06, + "loss": 0.8712, + "step": 26560 + }, + { + "epoch": 1.5, + "grad_norm": 31.89315978023349, + "learning_rate": 5.886743192437076e-06, + "loss": 0.8997, + "step": 26565 + }, + { + "epoch": 1.5, + "grad_norm": 11.0658673776465, + "learning_rate": 5.885130114290224e-06, + "loss": 0.8914, + "step": 26570 + }, + { + "epoch": 1.5, + "grad_norm": 9.887782399973648, + "learning_rate": 5.883516941032235e-06, + "loss": 0.9017, + "step": 26575 + }, + { + "epoch": 1.5, + "grad_norm": 26.208381565763595, + "learning_rate": 5.881903672836452e-06, + "loss": 0.8956, + "step": 26580 + }, + { + "epoch": 1.5, + "grad_norm": 7.179136283971342, + "learning_rate": 5.8802903098762265e-06, + "loss": 0.8898, + "step": 26585 + }, + { + "epoch": 1.5, + "grad_norm": 10.364618616373024, + "learning_rate": 5.878676852324922e-06, + "loss": 0.8734, + "step": 26590 + }, + { + "epoch": 1.5, + "grad_norm": 9.833551058429732, + "learning_rate": 5.877063300355913e-06, + "loss": 0.8445, + "step": 26595 + }, + { + "epoch": 1.5, + "grad_norm": 5.814225852311926, + "learning_rate": 5.875449654142581e-06, + "loss": 0.831, + "step": 26600 + }, + { + "epoch": 1.5, + "grad_norm": 9.366540745200195, + "learning_rate": 5.873835913858321e-06, + "loss": 0.8436, + "step": 26605 + }, + { + "epoch": 1.5, + "grad_norm": 7.764322985703324, + "learning_rate": 5.872222079676535e-06, + "loss": 0.8883, + "step": 26610 + }, + { + "epoch": 1.5, + "grad_norm": 5.471545870336112, + "learning_rate": 5.870608151770637e-06, + "loss": 0.8157, + "step": 26615 + }, + { + "epoch": 1.5, + "grad_norm": 4.889854059013036, + "learning_rate": 5.86899413031405e-06, + "loss": 0.8686, + "step": 26620 + }, + { + "epoch": 1.5, + "grad_norm": 7.941368536110251, + "learning_rate": 5.867380015480208e-06, + "loss": 0.8934, + "step": 26625 + }, + { + "epoch": 1.5, + "grad_norm": 14.955809733667413, + "learning_rate": 5.865765807442556e-06, + "loss": 0.8227, + "step": 26630 + }, + { + "epoch": 1.5, + "grad_norm": 5.9779549529841205, + "learning_rate": 5.864151506374549e-06, + "loss": 0.8634, + "step": 26635 + }, + { + "epoch": 1.5, + "grad_norm": 5.1885930870661845, + "learning_rate": 5.862537112449647e-06, + "loss": 0.859, + "step": 26640 + }, + { + "epoch": 1.5, + "grad_norm": 5.758706652338646, + "learning_rate": 5.860922625841325e-06, + "loss": 0.9062, + "step": 26645 + }, + { + "epoch": 1.5, + "grad_norm": 5.423297224186076, + "learning_rate": 5.859308046723069e-06, + "loss": 0.8836, + "step": 26650 + }, + { + "epoch": 1.5, + "grad_norm": 20.812369359611463, + "learning_rate": 5.857693375268371e-06, + "loss": 0.8479, + "step": 26655 + }, + { + "epoch": 1.5, + "grad_norm": 4.979318288189456, + "learning_rate": 5.856078611650734e-06, + "loss": 0.8498, + "step": 26660 + }, + { + "epoch": 1.5, + "grad_norm": 11.343566488232046, + "learning_rate": 5.85446375604367e-06, + "loss": 0.8483, + "step": 26665 + }, + { + "epoch": 1.5, + "grad_norm": 4.815669932575958, + "learning_rate": 5.852848808620707e-06, + "loss": 0.8853, + "step": 26670 + }, + { + "epoch": 1.5, + "grad_norm": 7.294733499794159, + "learning_rate": 5.851233769555374e-06, + "loss": 0.8478, + "step": 26675 + }, + { + "epoch": 1.5, + "grad_norm": 4.788783041888735, + "learning_rate": 5.849618639021216e-06, + "loss": 0.8931, + "step": 26680 + }, + { + "epoch": 1.5, + "grad_norm": 6.2797218962180015, + "learning_rate": 5.848003417191787e-06, + "loss": 0.8236, + "step": 26685 + }, + { + "epoch": 1.5, + "grad_norm": 5.4128678818500875, + "learning_rate": 5.846388104240648e-06, + "loss": 0.8344, + "step": 26690 + }, + { + "epoch": 1.5, + "grad_norm": 5.388001643542782, + "learning_rate": 5.844772700341371e-06, + "loss": 0.8416, + "step": 26695 + }, + { + "epoch": 1.5, + "grad_norm": 5.3996933906459805, + "learning_rate": 5.843157205667539e-06, + "loss": 0.8516, + "step": 26700 + }, + { + "epoch": 1.5, + "grad_norm": 6.619516343738756, + "learning_rate": 5.841541620392744e-06, + "loss": 0.8907, + "step": 26705 + }, + { + "epoch": 1.5, + "grad_norm": 7.568510711522257, + "learning_rate": 5.839925944690588e-06, + "loss": 0.8166, + "step": 26710 + }, + { + "epoch": 1.51, + "grad_norm": 6.934216784304126, + "learning_rate": 5.838310178734683e-06, + "loss": 0.8277, + "step": 26715 + }, + { + "epoch": 1.51, + "grad_norm": 6.345659943397673, + "learning_rate": 5.836694322698648e-06, + "loss": 0.8328, + "step": 26720 + }, + { + "epoch": 1.51, + "grad_norm": 7.141023314856025, + "learning_rate": 5.835078376756117e-06, + "loss": 0.8593, + "step": 26725 + }, + { + "epoch": 1.51, + "grad_norm": 10.783991886082312, + "learning_rate": 5.833462341080729e-06, + "loss": 0.8385, + "step": 26730 + }, + { + "epoch": 1.51, + "grad_norm": 8.255356798585868, + "learning_rate": 5.831846215846133e-06, + "loss": 0.8574, + "step": 26735 + }, + { + "epoch": 1.51, + "grad_norm": 5.131233336217069, + "learning_rate": 5.8302300012259905e-06, + "loss": 0.8325, + "step": 26740 + }, + { + "epoch": 1.51, + "grad_norm": 5.702782139324231, + "learning_rate": 5.828613697393971e-06, + "loss": 0.9062, + "step": 26745 + }, + { + "epoch": 1.51, + "grad_norm": 5.040898225230506, + "learning_rate": 5.826997304523752e-06, + "loss": 0.852, + "step": 26750 + }, + { + "epoch": 1.51, + "grad_norm": 20.11270605309605, + "learning_rate": 5.825380822789023e-06, + "loss": 0.8876, + "step": 26755 + }, + { + "epoch": 1.51, + "grad_norm": 15.134634927485443, + "learning_rate": 5.823764252363481e-06, + "loss": 0.8443, + "step": 26760 + }, + { + "epoch": 1.51, + "grad_norm": 5.645463287262561, + "learning_rate": 5.822147593420835e-06, + "loss": 0.8999, + "step": 26765 + }, + { + "epoch": 1.51, + "grad_norm": 6.396966521136354, + "learning_rate": 5.820530846134802e-06, + "loss": 0.833, + "step": 26770 + }, + { + "epoch": 1.51, + "grad_norm": 5.078969221794201, + "learning_rate": 5.818914010679109e-06, + "loss": 0.8709, + "step": 26775 + }, + { + "epoch": 1.51, + "grad_norm": 9.447435177422953, + "learning_rate": 5.8172970872274905e-06, + "loss": 0.8583, + "step": 26780 + }, + { + "epoch": 1.51, + "grad_norm": 13.762526281484488, + "learning_rate": 5.815680075953692e-06, + "loss": 0.8843, + "step": 26785 + }, + { + "epoch": 1.51, + "grad_norm": 13.341490315359552, + "learning_rate": 5.814062977031471e-06, + "loss": 0.87, + "step": 26790 + }, + { + "epoch": 1.51, + "grad_norm": 9.570139263823231, + "learning_rate": 5.81244579063459e-06, + "loss": 0.8592, + "step": 26795 + }, + { + "epoch": 1.51, + "grad_norm": 10.324585695361593, + "learning_rate": 5.810828516936823e-06, + "loss": 0.8609, + "step": 26800 + }, + { + "epoch": 1.51, + "grad_norm": 7.2786761122367, + "learning_rate": 5.809211156111954e-06, + "loss": 0.7837, + "step": 26805 + }, + { + "epoch": 1.51, + "grad_norm": 6.115853743290486, + "learning_rate": 5.807593708333774e-06, + "loss": 0.865, + "step": 26810 + }, + { + "epoch": 1.51, + "grad_norm": 15.533442451737326, + "learning_rate": 5.805976173776087e-06, + "loss": 0.809, + "step": 26815 + }, + { + "epoch": 1.51, + "grad_norm": 8.912855762982474, + "learning_rate": 5.804358552612702e-06, + "loss": 0.8432, + "step": 26820 + }, + { + "epoch": 1.51, + "grad_norm": 8.80822625181859, + "learning_rate": 5.802740845017441e-06, + "loss": 0.8475, + "step": 26825 + }, + { + "epoch": 1.51, + "grad_norm": 6.190074610071065, + "learning_rate": 5.801123051164132e-06, + "loss": 0.8487, + "step": 26830 + }, + { + "epoch": 1.51, + "grad_norm": 9.120069966431346, + "learning_rate": 5.799505171226619e-06, + "loss": 0.8375, + "step": 26835 + }, + { + "epoch": 1.51, + "grad_norm": 5.389772520084062, + "learning_rate": 5.797887205378745e-06, + "loss": 0.8214, + "step": 26840 + }, + { + "epoch": 1.51, + "grad_norm": 5.410071650198306, + "learning_rate": 5.79626915379437e-06, + "loss": 0.8389, + "step": 26845 + }, + { + "epoch": 1.51, + "grad_norm": 5.534543475819742, + "learning_rate": 5.794651016647359e-06, + "loss": 0.8018, + "step": 26850 + }, + { + "epoch": 1.51, + "grad_norm": 5.047415150480368, + "learning_rate": 5.7930327941115886e-06, + "loss": 0.8607, + "step": 26855 + }, + { + "epoch": 1.51, + "grad_norm": 16.63189317376628, + "learning_rate": 5.7914144863609465e-06, + "loss": 0.8406, + "step": 26860 + }, + { + "epoch": 1.51, + "grad_norm": 16.846280295668944, + "learning_rate": 5.789796093569324e-06, + "loss": 0.8312, + "step": 26865 + }, + { + "epoch": 1.51, + "grad_norm": 14.664091825460446, + "learning_rate": 5.788177615910626e-06, + "loss": 0.8888, + "step": 26870 + }, + { + "epoch": 1.51, + "grad_norm": 9.129029328346334, + "learning_rate": 5.786559053558765e-06, + "loss": 0.8246, + "step": 26875 + }, + { + "epoch": 1.51, + "grad_norm": 7.73827927659292, + "learning_rate": 5.784940406687662e-06, + "loss": 0.8094, + "step": 26880 + }, + { + "epoch": 1.51, + "grad_norm": 21.90043747894924, + "learning_rate": 5.783321675471248e-06, + "loss": 0.8565, + "step": 26885 + }, + { + "epoch": 1.52, + "grad_norm": 31.415125837571356, + "learning_rate": 5.781702860083463e-06, + "loss": 0.824, + "step": 26890 + }, + { + "epoch": 1.52, + "grad_norm": 40.69209896384807, + "learning_rate": 5.780083960698255e-06, + "loss": 0.8858, + "step": 26895 + }, + { + "epoch": 1.52, + "grad_norm": 8.292146418174342, + "learning_rate": 5.778464977489583e-06, + "loss": 0.8729, + "step": 26900 + }, + { + "epoch": 1.52, + "grad_norm": 17.991862160933817, + "learning_rate": 5.776845910631413e-06, + "loss": 0.8523, + "step": 26905 + }, + { + "epoch": 1.52, + "grad_norm": 6.537098363159829, + "learning_rate": 5.775226760297723e-06, + "loss": 0.8714, + "step": 26910 + }, + { + "epoch": 1.52, + "grad_norm": 72.04608074423221, + "learning_rate": 5.773607526662495e-06, + "loss": 0.889, + "step": 26915 + }, + { + "epoch": 1.52, + "grad_norm": 29.135941371981282, + "learning_rate": 5.771988209899726e-06, + "loss": 0.8272, + "step": 26920 + }, + { + "epoch": 1.52, + "grad_norm": 13.55994572101125, + "learning_rate": 5.770368810183416e-06, + "loss": 0.8662, + "step": 26925 + }, + { + "epoch": 1.52, + "grad_norm": 24.86800422217252, + "learning_rate": 5.768749327687576e-06, + "loss": 0.8341, + "step": 26930 + }, + { + "epoch": 1.52, + "grad_norm": 19.44367611703542, + "learning_rate": 5.7671297625862285e-06, + "loss": 0.8864, + "step": 26935 + }, + { + "epoch": 1.52, + "grad_norm": 27.140857210256808, + "learning_rate": 5.765510115053403e-06, + "loss": 0.867, + "step": 26940 + }, + { + "epoch": 1.52, + "grad_norm": 25.503062108739485, + "learning_rate": 5.763890385263137e-06, + "loss": 0.8602, + "step": 26945 + }, + { + "epoch": 1.52, + "grad_norm": 20.262505144514137, + "learning_rate": 5.7622705733894765e-06, + "loss": 0.8382, + "step": 26950 + }, + { + "epoch": 1.52, + "grad_norm": 20.544102102282014, + "learning_rate": 5.760650679606481e-06, + "loss": 0.8326, + "step": 26955 + }, + { + "epoch": 1.52, + "grad_norm": 10.16640864225168, + "learning_rate": 5.759030704088212e-06, + "loss": 0.8495, + "step": 26960 + }, + { + "epoch": 1.52, + "grad_norm": 31.16127289154043, + "learning_rate": 5.757410647008743e-06, + "loss": 0.8607, + "step": 26965 + }, + { + "epoch": 1.52, + "grad_norm": 5.524574888705631, + "learning_rate": 5.7557905085421586e-06, + "loss": 0.8961, + "step": 26970 + }, + { + "epoch": 1.52, + "grad_norm": 22.84678258694053, + "learning_rate": 5.7541702888625494e-06, + "loss": 0.8712, + "step": 26975 + }, + { + "epoch": 1.52, + "grad_norm": 27.847945253156105, + "learning_rate": 5.752549988144013e-06, + "loss": 0.8503, + "step": 26980 + }, + { + "epoch": 1.52, + "grad_norm": 12.34699505928167, + "learning_rate": 5.750929606560659e-06, + "loss": 0.8432, + "step": 26985 + }, + { + "epoch": 1.52, + "grad_norm": 9.853432723443728, + "learning_rate": 5.749309144286603e-06, + "loss": 0.8297, + "step": 26990 + }, + { + "epoch": 1.52, + "grad_norm": 16.400375145732713, + "learning_rate": 5.747688601495974e-06, + "loss": 0.8697, + "step": 26995 + }, + { + "epoch": 1.52, + "grad_norm": 5.871987907957095, + "learning_rate": 5.746067978362906e-06, + "loss": 0.8809, + "step": 27000 + }, + { + "epoch": 1.52, + "grad_norm": 6.997440121192785, + "learning_rate": 5.744447275061541e-06, + "loss": 0.8242, + "step": 27005 + }, + { + "epoch": 1.52, + "grad_norm": 5.611445145556261, + "learning_rate": 5.74282649176603e-06, + "loss": 0.8658, + "step": 27010 + }, + { + "epoch": 1.52, + "grad_norm": 23.036934755302443, + "learning_rate": 5.741205628650534e-06, + "loss": 0.8667, + "step": 27015 + }, + { + "epoch": 1.52, + "grad_norm": 20.898614904548687, + "learning_rate": 5.739584685889222e-06, + "loss": 0.863, + "step": 27020 + }, + { + "epoch": 1.52, + "grad_norm": 5.778958091052699, + "learning_rate": 5.737963663656272e-06, + "loss": 0.8631, + "step": 27025 + }, + { + "epoch": 1.52, + "grad_norm": 5.452358480653361, + "learning_rate": 5.736342562125869e-06, + "loss": 0.8225, + "step": 27030 + }, + { + "epoch": 1.52, + "grad_norm": 11.5718974147484, + "learning_rate": 5.734721381472208e-06, + "loss": 0.8896, + "step": 27035 + }, + { + "epoch": 1.52, + "grad_norm": 8.39509962960557, + "learning_rate": 5.733100121869491e-06, + "loss": 0.8663, + "step": 27040 + }, + { + "epoch": 1.52, + "grad_norm": 12.338306012900134, + "learning_rate": 5.731478783491933e-06, + "loss": 0.8702, + "step": 27045 + }, + { + "epoch": 1.52, + "grad_norm": 7.794404135191542, + "learning_rate": 5.729857366513749e-06, + "loss": 0.8503, + "step": 27050 + }, + { + "epoch": 1.52, + "grad_norm": 16.670373140811577, + "learning_rate": 5.728235871109171e-06, + "loss": 0.9143, + "step": 27055 + }, + { + "epoch": 1.52, + "grad_norm": 32.70669935782094, + "learning_rate": 5.726614297452435e-06, + "loss": 0.8292, + "step": 27060 + }, + { + "epoch": 1.52, + "grad_norm": 7.953620508293519, + "learning_rate": 5.7249926457177864e-06, + "loss": 0.8404, + "step": 27065 + }, + { + "epoch": 1.53, + "grad_norm": 14.67039148261923, + "learning_rate": 5.723370916079477e-06, + "loss": 0.8678, + "step": 27070 + }, + { + "epoch": 1.53, + "grad_norm": 13.365671781112125, + "learning_rate": 5.721749108711771e-06, + "loss": 0.8381, + "step": 27075 + }, + { + "epoch": 1.53, + "grad_norm": 23.26522805711699, + "learning_rate": 5.720127223788937e-06, + "loss": 0.8151, + "step": 27080 + }, + { + "epoch": 1.53, + "grad_norm": 12.859929883109832, + "learning_rate": 5.718505261485254e-06, + "loss": 0.8431, + "step": 27085 + }, + { + "epoch": 1.53, + "grad_norm": 7.799837914430625, + "learning_rate": 5.716883221975012e-06, + "loss": 0.8729, + "step": 27090 + }, + { + "epoch": 1.53, + "grad_norm": 4.821493804212123, + "learning_rate": 5.7152611054325035e-06, + "loss": 0.8486, + "step": 27095 + }, + { + "epoch": 1.53, + "grad_norm": 5.809258458222593, + "learning_rate": 5.713638912032031e-06, + "loss": 0.8258, + "step": 27100 + }, + { + "epoch": 1.53, + "grad_norm": 4.7600763162085995, + "learning_rate": 5.71201664194791e-06, + "loss": 0.8208, + "step": 27105 + }, + { + "epoch": 1.53, + "grad_norm": 11.140165397678468, + "learning_rate": 5.710394295354457e-06, + "loss": 0.7833, + "step": 27110 + }, + { + "epoch": 1.53, + "grad_norm": 5.888013156962577, + "learning_rate": 5.708771872426001e-06, + "loss": 0.8541, + "step": 27115 + }, + { + "epoch": 1.53, + "grad_norm": 5.210882855080766, + "learning_rate": 5.707149373336882e-06, + "loss": 0.8656, + "step": 27120 + }, + { + "epoch": 1.53, + "grad_norm": 5.53341906596725, + "learning_rate": 5.705526798261439e-06, + "loss": 0.8182, + "step": 27125 + }, + { + "epoch": 1.53, + "grad_norm": 8.030618587221534, + "learning_rate": 5.703904147374029e-06, + "loss": 0.8436, + "step": 27130 + }, + { + "epoch": 1.53, + "grad_norm": 14.83816126088371, + "learning_rate": 5.702281420849012e-06, + "loss": 0.8318, + "step": 27135 + }, + { + "epoch": 1.53, + "grad_norm": 10.291796271100154, + "learning_rate": 5.700658618860756e-06, + "loss": 0.804, + "step": 27140 + }, + { + "epoch": 1.53, + "grad_norm": 6.409684993814613, + "learning_rate": 5.69903574158364e-06, + "loss": 0.8522, + "step": 27145 + }, + { + "epoch": 1.53, + "grad_norm": 14.419367765427134, + "learning_rate": 5.697412789192049e-06, + "loss": 0.8312, + "step": 27150 + }, + { + "epoch": 1.53, + "grad_norm": 5.929834417340285, + "learning_rate": 5.695789761860376e-06, + "loss": 0.8549, + "step": 27155 + }, + { + "epoch": 1.53, + "grad_norm": 9.511636447472515, + "learning_rate": 5.694166659763021e-06, + "loss": 0.8601, + "step": 27160 + }, + { + "epoch": 1.53, + "grad_norm": 5.3262169619225785, + "learning_rate": 5.692543483074395e-06, + "loss": 0.8706, + "step": 27165 + }, + { + "epoch": 1.53, + "grad_norm": 14.21830397641139, + "learning_rate": 5.690920231968916e-06, + "loss": 0.8675, + "step": 27170 + }, + { + "epoch": 1.53, + "grad_norm": 8.60583307079588, + "learning_rate": 5.689296906621009e-06, + "loss": 0.8583, + "step": 27175 + }, + { + "epoch": 1.53, + "grad_norm": 5.1778738588941176, + "learning_rate": 5.687673507205106e-06, + "loss": 0.8124, + "step": 27180 + }, + { + "epoch": 1.53, + "grad_norm": 6.144574801402935, + "learning_rate": 5.68605003389565e-06, + "loss": 0.8301, + "step": 27185 + }, + { + "epoch": 1.53, + "grad_norm": 5.262813440679303, + "learning_rate": 5.684426486867091e-06, + "loss": 0.8332, + "step": 27190 + }, + { + "epoch": 1.53, + "grad_norm": 7.062909804526193, + "learning_rate": 5.682802866293885e-06, + "loss": 0.8136, + "step": 27195 + }, + { + "epoch": 1.53, + "grad_norm": 5.476692240168745, + "learning_rate": 5.681179172350498e-06, + "loss": 0.805, + "step": 27200 + }, + { + "epoch": 1.53, + "grad_norm": 5.778244180909876, + "learning_rate": 5.679555405211403e-06, + "loss": 0.8624, + "step": 27205 + }, + { + "epoch": 1.53, + "grad_norm": 6.187187229792764, + "learning_rate": 5.67793156505108e-06, + "loss": 0.8377, + "step": 27210 + }, + { + "epoch": 1.53, + "grad_norm": 4.7618080121242485, + "learning_rate": 5.676307652044017e-06, + "loss": 0.8537, + "step": 27215 + }, + { + "epoch": 1.53, + "grad_norm": 5.379226001157689, + "learning_rate": 5.674683666364713e-06, + "loss": 0.84, + "step": 27220 + }, + { + "epoch": 1.53, + "grad_norm": 6.9890669802333285, + "learning_rate": 5.673059608187672e-06, + "loss": 0.8068, + "step": 27225 + }, + { + "epoch": 1.53, + "grad_norm": 5.043482550445178, + "learning_rate": 5.671435477687406e-06, + "loss": 0.8391, + "step": 27230 + }, + { + "epoch": 1.53, + "grad_norm": 5.341320791877013, + "learning_rate": 5.669811275038434e-06, + "loss": 0.8516, + "step": 27235 + }, + { + "epoch": 1.53, + "grad_norm": 8.902154855316724, + "learning_rate": 5.668187000415284e-06, + "loss": 0.8329, + "step": 27240 + }, + { + "epoch": 1.54, + "grad_norm": 11.595010163035232, + "learning_rate": 5.666562653992492e-06, + "loss": 0.8757, + "step": 27245 + }, + { + "epoch": 1.54, + "grad_norm": 18.225129141628653, + "learning_rate": 5.6649382359446015e-06, + "loss": 0.8014, + "step": 27250 + }, + { + "epoch": 1.54, + "grad_norm": 6.458756381956273, + "learning_rate": 5.663313746446163e-06, + "loss": 0.8402, + "step": 27255 + }, + { + "epoch": 1.54, + "grad_norm": 7.787930171534921, + "learning_rate": 5.661689185671735e-06, + "loss": 0.7883, + "step": 27260 + }, + { + "epoch": 1.54, + "grad_norm": 8.914455180278969, + "learning_rate": 5.660064553795883e-06, + "loss": 0.904, + "step": 27265 + }, + { + "epoch": 1.54, + "grad_norm": 13.551594467432754, + "learning_rate": 5.658439850993183e-06, + "loss": 0.8098, + "step": 27270 + }, + { + "epoch": 1.54, + "grad_norm": 5.065101015780171, + "learning_rate": 5.6568150774382135e-06, + "loss": 0.8523, + "step": 27275 + }, + { + "epoch": 1.54, + "grad_norm": 6.162843789091219, + "learning_rate": 5.655190233305566e-06, + "loss": 0.8694, + "step": 27280 + }, + { + "epoch": 1.54, + "grad_norm": 19.2239011034022, + "learning_rate": 5.653565318769836e-06, + "loss": 0.8375, + "step": 27285 + }, + { + "epoch": 1.54, + "grad_norm": 7.116686191897343, + "learning_rate": 5.65194033400563e-06, + "loss": 0.794, + "step": 27290 + }, + { + "epoch": 1.54, + "grad_norm": 21.334769537034536, + "learning_rate": 5.650315279187556e-06, + "loss": 0.8177, + "step": 27295 + }, + { + "epoch": 1.54, + "grad_norm": 9.080425864614117, + "learning_rate": 5.648690154490235e-06, + "loss": 0.8346, + "step": 27300 + }, + { + "epoch": 1.54, + "grad_norm": 6.872567644534107, + "learning_rate": 5.647064960088294e-06, + "loss": 0.8222, + "step": 27305 + }, + { + "epoch": 1.54, + "grad_norm": 10.4852150094843, + "learning_rate": 5.645439696156368e-06, + "loss": 0.8425, + "step": 27310 + }, + { + "epoch": 1.54, + "grad_norm": 12.094028907139457, + "learning_rate": 5.643814362869098e-06, + "loss": 0.7855, + "step": 27315 + }, + { + "epoch": 1.54, + "grad_norm": 12.267161992812545, + "learning_rate": 5.642188960401133e-06, + "loss": 0.7879, + "step": 27320 + }, + { + "epoch": 1.54, + "grad_norm": 14.6381639561117, + "learning_rate": 5.640563488927131e-06, + "loss": 0.848, + "step": 27325 + }, + { + "epoch": 1.54, + "grad_norm": 12.090066368644928, + "learning_rate": 5.638937948621755e-06, + "loss": 0.8077, + "step": 27330 + }, + { + "epoch": 1.54, + "grad_norm": 18.086032406154715, + "learning_rate": 5.637312339659675e-06, + "loss": 0.861, + "step": 27335 + }, + { + "epoch": 1.54, + "grad_norm": 5.460124892521248, + "learning_rate": 5.635686662215572e-06, + "loss": 0.8224, + "step": 27340 + }, + { + "epoch": 1.54, + "grad_norm": 5.710627853746999, + "learning_rate": 5.6340609164641335e-06, + "loss": 0.8473, + "step": 27345 + }, + { + "epoch": 1.54, + "grad_norm": 5.55769853529202, + "learning_rate": 5.632435102580049e-06, + "loss": 0.8736, + "step": 27350 + }, + { + "epoch": 1.54, + "grad_norm": 5.201174022453873, + "learning_rate": 5.630809220738022e-06, + "loss": 0.8547, + "step": 27355 + }, + { + "epoch": 1.54, + "grad_norm": 4.843169835655888, + "learning_rate": 5.629183271112759e-06, + "loss": 0.8214, + "step": 27360 + }, + { + "epoch": 1.54, + "grad_norm": 7.910826737135694, + "learning_rate": 5.627557253878979e-06, + "loss": 0.848, + "step": 27365 + }, + { + "epoch": 1.54, + "grad_norm": 18.059057445888836, + "learning_rate": 5.625931169211401e-06, + "loss": 0.7987, + "step": 27370 + }, + { + "epoch": 1.54, + "grad_norm": 5.074060846192095, + "learning_rate": 5.624305017284759e-06, + "loss": 0.8243, + "step": 27375 + }, + { + "epoch": 1.54, + "grad_norm": 12.729710748597427, + "learning_rate": 5.6226787982737855e-06, + "loss": 0.8602, + "step": 27380 + }, + { + "epoch": 1.54, + "grad_norm": 8.756800832903632, + "learning_rate": 5.621052512353227e-06, + "loss": 0.8395, + "step": 27385 + }, + { + "epoch": 1.54, + "grad_norm": 7.351292362153368, + "learning_rate": 5.619426159697835e-06, + "loss": 0.8523, + "step": 27390 + }, + { + "epoch": 1.54, + "grad_norm": 15.789759867363111, + "learning_rate": 5.617799740482369e-06, + "loss": 0.8895, + "step": 27395 + }, + { + "epoch": 1.54, + "grad_norm": 5.234766910628488, + "learning_rate": 5.616173254881594e-06, + "loss": 0.8221, + "step": 27400 + }, + { + "epoch": 1.54, + "grad_norm": 6.358802567627612, + "learning_rate": 5.614546703070284e-06, + "loss": 0.8594, + "step": 27405 + }, + { + "epoch": 1.54, + "grad_norm": 5.78267701844748, + "learning_rate": 5.612920085223218e-06, + "loss": 0.8545, + "step": 27410 + }, + { + "epoch": 1.54, + "grad_norm": 5.088368755960512, + "learning_rate": 5.611293401515183e-06, + "loss": 0.8499, + "step": 27415 + }, + { + "epoch": 1.54, + "grad_norm": 13.73630214387602, + "learning_rate": 5.609666652120975e-06, + "loss": 0.8254, + "step": 27420 + }, + { + "epoch": 1.55, + "grad_norm": 5.017294780631743, + "learning_rate": 5.608039837215395e-06, + "loss": 0.8532, + "step": 27425 + }, + { + "epoch": 1.55, + "grad_norm": 9.662313902311142, + "learning_rate": 5.606412956973252e-06, + "loss": 0.7892, + "step": 27430 + }, + { + "epoch": 1.55, + "grad_norm": 6.687695641695749, + "learning_rate": 5.604786011569359e-06, + "loss": 0.8551, + "step": 27435 + }, + { + "epoch": 1.55, + "grad_norm": 20.408377014543518, + "learning_rate": 5.60315900117854e-06, + "loss": 0.8325, + "step": 27440 + }, + { + "epoch": 1.55, + "grad_norm": 6.739644669376226, + "learning_rate": 5.601531925975624e-06, + "loss": 0.836, + "step": 27445 + }, + { + "epoch": 1.55, + "grad_norm": 6.245318583892333, + "learning_rate": 5.599904786135447e-06, + "loss": 0.8567, + "step": 27450 + }, + { + "epoch": 1.55, + "grad_norm": 9.848560725241006, + "learning_rate": 5.598277581832855e-06, + "loss": 0.8272, + "step": 27455 + }, + { + "epoch": 1.55, + "grad_norm": 5.181595541273278, + "learning_rate": 5.596650313242697e-06, + "loss": 0.8291, + "step": 27460 + }, + { + "epoch": 1.55, + "grad_norm": 22.732225652466553, + "learning_rate": 5.595022980539828e-06, + "loss": 0.8439, + "step": 27465 + }, + { + "epoch": 1.55, + "grad_norm": 13.749331682316672, + "learning_rate": 5.593395583899114e-06, + "loss": 0.8074, + "step": 27470 + }, + { + "epoch": 1.55, + "grad_norm": 7.6155307899934765, + "learning_rate": 5.591768123495425e-06, + "loss": 0.8162, + "step": 27475 + }, + { + "epoch": 1.55, + "grad_norm": 16.38078240445812, + "learning_rate": 5.59014059950364e-06, + "loss": 0.8112, + "step": 27480 + }, + { + "epoch": 1.55, + "grad_norm": 7.726742359434211, + "learning_rate": 5.588513012098644e-06, + "loss": 0.854, + "step": 27485 + }, + { + "epoch": 1.55, + "grad_norm": 5.522912054408138, + "learning_rate": 5.586885361455326e-06, + "loss": 0.8676, + "step": 27490 + }, + { + "epoch": 1.55, + "grad_norm": 42.93731286731151, + "learning_rate": 5.585257647748587e-06, + "loss": 0.8993, + "step": 27495 + }, + { + "epoch": 1.55, + "grad_norm": 6.749483489627861, + "learning_rate": 5.583629871153331e-06, + "loss": 0.8451, + "step": 27500 + }, + { + "epoch": 1.55, + "grad_norm": 20.590363748463695, + "learning_rate": 5.582002031844469e-06, + "loss": 0.8579, + "step": 27505 + }, + { + "epoch": 1.55, + "grad_norm": 11.337436630851883, + "learning_rate": 5.580374129996919e-06, + "loss": 0.8373, + "step": 27510 + }, + { + "epoch": 1.55, + "grad_norm": 31.544819676786396, + "learning_rate": 5.578746165785609e-06, + "loss": 0.8752, + "step": 27515 + }, + { + "epoch": 1.55, + "grad_norm": 13.415996857447883, + "learning_rate": 5.5771181393854705e-06, + "loss": 0.8477, + "step": 27520 + }, + { + "epoch": 1.55, + "grad_norm": 13.86394109230957, + "learning_rate": 5.5754900509714396e-06, + "loss": 0.8092, + "step": 27525 + }, + { + "epoch": 1.55, + "grad_norm": 8.3592762405029, + "learning_rate": 5.573861900718461e-06, + "loss": 0.8656, + "step": 27530 + }, + { + "epoch": 1.55, + "grad_norm": 10.46238396955788, + "learning_rate": 5.5722336888014905e-06, + "loss": 0.82, + "step": 27535 + }, + { + "epoch": 1.55, + "grad_norm": 6.127625863744789, + "learning_rate": 5.570605415395485e-06, + "loss": 0.8092, + "step": 27540 + }, + { + "epoch": 1.55, + "grad_norm": 5.667990204026085, + "learning_rate": 5.568977080675407e-06, + "loss": 0.8503, + "step": 27545 + }, + { + "epoch": 1.55, + "grad_norm": 7.960084324015313, + "learning_rate": 5.567348684816233e-06, + "loss": 0.8626, + "step": 27550 + }, + { + "epoch": 1.55, + "grad_norm": 4.8856049884045785, + "learning_rate": 5.5657202279929375e-06, + "loss": 0.8334, + "step": 27555 + }, + { + "epoch": 1.55, + "grad_norm": 5.562954285835208, + "learning_rate": 5.564091710380507e-06, + "loss": 0.8936, + "step": 27560 + }, + { + "epoch": 1.55, + "grad_norm": 6.1887514664284975, + "learning_rate": 5.562463132153933e-06, + "loss": 0.8367, + "step": 27565 + }, + { + "epoch": 1.55, + "grad_norm": 4.910120053504759, + "learning_rate": 5.560834493488213e-06, + "loss": 0.8185, + "step": 27570 + }, + { + "epoch": 1.55, + "grad_norm": 5.007103534982401, + "learning_rate": 5.559205794558354e-06, + "loss": 0.8084, + "step": 27575 + }, + { + "epoch": 1.55, + "grad_norm": 18.508089863839682, + "learning_rate": 5.557577035539361e-06, + "loss": 0.8396, + "step": 27580 + }, + { + "epoch": 1.55, + "grad_norm": 11.201086783663813, + "learning_rate": 5.555948216606255e-06, + "loss": 0.877, + "step": 27585 + }, + { + "epoch": 1.55, + "grad_norm": 4.82826226084147, + "learning_rate": 5.55431933793406e-06, + "loss": 0.8088, + "step": 27590 + }, + { + "epoch": 1.55, + "grad_norm": 6.4018673360403024, + "learning_rate": 5.552690399697807e-06, + "loss": 0.8179, + "step": 27595 + }, + { + "epoch": 1.56, + "grad_norm": 4.896817301291493, + "learning_rate": 5.551061402072529e-06, + "loss": 0.8889, + "step": 27600 + }, + { + "epoch": 1.56, + "grad_norm": 4.8512092962836375, + "learning_rate": 5.549432345233274e-06, + "loss": 0.886, + "step": 27605 + }, + { + "epoch": 1.56, + "grad_norm": 10.78467056315316, + "learning_rate": 5.5478032293550875e-06, + "loss": 0.8348, + "step": 27610 + }, + { + "epoch": 1.56, + "grad_norm": 6.038521798819803, + "learning_rate": 5.546174054613027e-06, + "loss": 0.8004, + "step": 27615 + }, + { + "epoch": 1.56, + "grad_norm": 5.543501419243198, + "learning_rate": 5.544544821182153e-06, + "loss": 0.802, + "step": 27620 + }, + { + "epoch": 1.56, + "grad_norm": 7.576978082024347, + "learning_rate": 5.542915529237535e-06, + "loss": 0.8237, + "step": 27625 + }, + { + "epoch": 1.56, + "grad_norm": 6.346995386496742, + "learning_rate": 5.541286178954247e-06, + "loss": 0.8602, + "step": 27630 + }, + { + "epoch": 1.56, + "grad_norm": 8.233414275924641, + "learning_rate": 5.53965677050737e-06, + "loss": 0.8565, + "step": 27635 + }, + { + "epoch": 1.56, + "grad_norm": 24.543995130295432, + "learning_rate": 5.538027304071992e-06, + "loss": 0.837, + "step": 27640 + }, + { + "epoch": 1.56, + "grad_norm": 5.357417946046084, + "learning_rate": 5.536397779823205e-06, + "loss": 0.7966, + "step": 27645 + }, + { + "epoch": 1.56, + "grad_norm": 7.137854842287357, + "learning_rate": 5.53476819793611e-06, + "loss": 0.825, + "step": 27650 + }, + { + "epoch": 1.56, + "grad_norm": 6.30063959889956, + "learning_rate": 5.533138558585811e-06, + "loss": 0.8646, + "step": 27655 + }, + { + "epoch": 1.56, + "grad_norm": 8.692924139491135, + "learning_rate": 5.5315088619474235e-06, + "loss": 0.8498, + "step": 27660 + }, + { + "epoch": 1.56, + "grad_norm": 6.711643769294201, + "learning_rate": 5.529879108196061e-06, + "loss": 0.8135, + "step": 27665 + }, + { + "epoch": 1.56, + "grad_norm": 6.025663645641752, + "learning_rate": 5.5282492975068495e-06, + "loss": 0.8151, + "step": 27670 + }, + { + "epoch": 1.56, + "grad_norm": 6.062118954198813, + "learning_rate": 5.52661943005492e-06, + "loss": 0.8185, + "step": 27675 + }, + { + "epoch": 1.56, + "grad_norm": 7.757415316997683, + "learning_rate": 5.524989506015408e-06, + "loss": 0.8247, + "step": 27680 + }, + { + "epoch": 1.56, + "grad_norm": 10.26047128596025, + "learning_rate": 5.523359525563457e-06, + "loss": 0.8758, + "step": 27685 + }, + { + "epoch": 1.56, + "grad_norm": 9.441446298587094, + "learning_rate": 5.521729488874217e-06, + "loss": 0.9443, + "step": 27690 + }, + { + "epoch": 1.56, + "grad_norm": 7.590963719737849, + "learning_rate": 5.520099396122839e-06, + "loss": 0.8296, + "step": 27695 + }, + { + "epoch": 1.56, + "grad_norm": 7.370192771088231, + "learning_rate": 5.518469247484486e-06, + "loss": 0.867, + "step": 27700 + }, + { + "epoch": 1.56, + "grad_norm": 5.2407369045623176, + "learning_rate": 5.516839043134325e-06, + "loss": 0.8568, + "step": 27705 + }, + { + "epoch": 1.56, + "grad_norm": 5.088367550059989, + "learning_rate": 5.515208783247525e-06, + "loss": 0.8679, + "step": 27710 + }, + { + "epoch": 1.56, + "grad_norm": 7.493212415317777, + "learning_rate": 5.513578467999271e-06, + "loss": 0.8355, + "step": 27715 + }, + { + "epoch": 1.56, + "grad_norm": 5.280554850900711, + "learning_rate": 5.511948097564741e-06, + "loss": 0.8707, + "step": 27720 + }, + { + "epoch": 1.56, + "grad_norm": 12.138374637209298, + "learning_rate": 5.510317672119129e-06, + "loss": 0.8567, + "step": 27725 + }, + { + "epoch": 1.56, + "grad_norm": 7.892377205049216, + "learning_rate": 5.508687191837633e-06, + "loss": 0.8684, + "step": 27730 + }, + { + "epoch": 1.56, + "grad_norm": 7.286114569552311, + "learning_rate": 5.507056656895452e-06, + "loss": 0.7954, + "step": 27735 + }, + { + "epoch": 1.56, + "grad_norm": 6.108608327951508, + "learning_rate": 5.5054260674677965e-06, + "loss": 0.828, + "step": 27740 + }, + { + "epoch": 1.56, + "grad_norm": 6.019463904192895, + "learning_rate": 5.503795423729879e-06, + "loss": 0.8467, + "step": 27745 + }, + { + "epoch": 1.56, + "grad_norm": 4.911627974968405, + "learning_rate": 5.502164725856919e-06, + "loss": 0.8575, + "step": 27750 + }, + { + "epoch": 1.56, + "grad_norm": 4.949764024135887, + "learning_rate": 5.500533974024143e-06, + "loss": 0.8088, + "step": 27755 + }, + { + "epoch": 1.56, + "grad_norm": 7.710812954342271, + "learning_rate": 5.498903168406783e-06, + "loss": 0.8165, + "step": 27760 + }, + { + "epoch": 1.56, + "grad_norm": 8.615274832979397, + "learning_rate": 5.497272309180075e-06, + "loss": 0.7984, + "step": 27765 + }, + { + "epoch": 1.56, + "grad_norm": 4.690133110156747, + "learning_rate": 5.4956413965192636e-06, + "loss": 0.862, + "step": 27770 + }, + { + "epoch": 1.56, + "grad_norm": 8.723900262271425, + "learning_rate": 5.494010430599595e-06, + "loss": 0.8347, + "step": 27775 + }, + { + "epoch": 1.57, + "grad_norm": 5.791899694774015, + "learning_rate": 5.4923794115963265e-06, + "loss": 0.8129, + "step": 27780 + }, + { + "epoch": 1.57, + "grad_norm": 5.223685329637303, + "learning_rate": 5.4907483396847185e-06, + "loss": 0.8601, + "step": 27785 + }, + { + "epoch": 1.57, + "grad_norm": 7.8504187217528445, + "learning_rate": 5.489117215040034e-06, + "loss": 0.8321, + "step": 27790 + }, + { + "epoch": 1.57, + "grad_norm": 9.702570409384052, + "learning_rate": 5.487486037837546e-06, + "loss": 0.8708, + "step": 27795 + }, + { + "epoch": 1.57, + "grad_norm": 7.058235735878204, + "learning_rate": 5.485854808252533e-06, + "loss": 0.8327, + "step": 27800 + }, + { + "epoch": 1.57, + "grad_norm": 5.450428454937076, + "learning_rate": 5.484223526460276e-06, + "loss": 0.8538, + "step": 27805 + }, + { + "epoch": 1.57, + "grad_norm": 12.907230420761335, + "learning_rate": 5.4825921926360644e-06, + "loss": 0.852, + "step": 27810 + }, + { + "epoch": 1.57, + "grad_norm": 7.108923660685045, + "learning_rate": 5.48096080695519e-06, + "loss": 0.8309, + "step": 27815 + }, + { + "epoch": 1.57, + "grad_norm": 4.867693572535084, + "learning_rate": 5.4793293695929564e-06, + "loss": 0.8385, + "step": 27820 + }, + { + "epoch": 1.57, + "grad_norm": 5.946951013702713, + "learning_rate": 5.477697880724665e-06, + "loss": 0.7753, + "step": 27825 + }, + { + "epoch": 1.57, + "grad_norm": 7.816107661414321, + "learning_rate": 5.47606634052563e-06, + "loss": 0.8015, + "step": 27830 + }, + { + "epoch": 1.57, + "grad_norm": 5.016088357097027, + "learning_rate": 5.474434749171164e-06, + "loss": 0.8528, + "step": 27835 + }, + { + "epoch": 1.57, + "grad_norm": 21.306577446607616, + "learning_rate": 5.472803106836592e-06, + "loss": 0.8113, + "step": 27840 + }, + { + "epoch": 1.57, + "grad_norm": 6.674021200535829, + "learning_rate": 5.471171413697238e-06, + "loss": 0.8126, + "step": 27845 + }, + { + "epoch": 1.57, + "grad_norm": 7.430222254658212, + "learning_rate": 5.469539669928437e-06, + "loss": 0.8224, + "step": 27850 + }, + { + "epoch": 1.57, + "grad_norm": 7.143416577397818, + "learning_rate": 5.467907875705526e-06, + "loss": 0.8392, + "step": 27855 + }, + { + "epoch": 1.57, + "grad_norm": 5.417623630354982, + "learning_rate": 5.466276031203849e-06, + "loss": 0.8414, + "step": 27860 + }, + { + "epoch": 1.57, + "grad_norm": 13.67882115343529, + "learning_rate": 5.464644136598753e-06, + "loss": 0.8471, + "step": 27865 + }, + { + "epoch": 1.57, + "grad_norm": 26.87099653384102, + "learning_rate": 5.463012192065595e-06, + "loss": 0.8393, + "step": 27870 + }, + { + "epoch": 1.57, + "grad_norm": 11.282517645050707, + "learning_rate": 5.4613801977797345e-06, + "loss": 0.7999, + "step": 27875 + }, + { + "epoch": 1.57, + "grad_norm": 32.54110515129484, + "learning_rate": 5.459748153916535e-06, + "loss": 0.908, + "step": 27880 + }, + { + "epoch": 1.57, + "grad_norm": 7.740210452511342, + "learning_rate": 5.458116060651366e-06, + "loss": 0.8866, + "step": 27885 + }, + { + "epoch": 1.57, + "grad_norm": 7.81184829771982, + "learning_rate": 5.456483918159606e-06, + "loss": 0.8226, + "step": 27890 + }, + { + "epoch": 1.57, + "grad_norm": 5.378253161486748, + "learning_rate": 5.4548517266166346e-06, + "loss": 0.8511, + "step": 27895 + }, + { + "epoch": 1.57, + "grad_norm": 14.630480359676842, + "learning_rate": 5.453219486197837e-06, + "loss": 0.8325, + "step": 27900 + }, + { + "epoch": 1.57, + "grad_norm": 8.870311708880287, + "learning_rate": 5.451587197078605e-06, + "loss": 0.8656, + "step": 27905 + }, + { + "epoch": 1.57, + "grad_norm": 8.967312223873702, + "learning_rate": 5.449954859434336e-06, + "loss": 0.8518, + "step": 27910 + }, + { + "epoch": 1.57, + "grad_norm": 10.148821654746222, + "learning_rate": 5.448322473440431e-06, + "loss": 0.8134, + "step": 27915 + }, + { + "epoch": 1.57, + "grad_norm": 11.03014567338059, + "learning_rate": 5.446690039272299e-06, + "loss": 0.8618, + "step": 27920 + }, + { + "epoch": 1.57, + "grad_norm": 10.297395813566277, + "learning_rate": 5.4450575571053496e-06, + "loss": 0.8319, + "step": 27925 + }, + { + "epoch": 1.57, + "grad_norm": 34.17516218317393, + "learning_rate": 5.443425027115002e-06, + "loss": 0.7979, + "step": 27930 + }, + { + "epoch": 1.57, + "grad_norm": 21.500496252608457, + "learning_rate": 5.441792449476678e-06, + "loss": 0.8697, + "step": 27935 + }, + { + "epoch": 1.57, + "grad_norm": 9.519097324982825, + "learning_rate": 5.440159824365806e-06, + "loss": 0.8588, + "step": 27940 + }, + { + "epoch": 1.57, + "grad_norm": 12.312360802795807, + "learning_rate": 5.438527151957819e-06, + "loss": 0.8273, + "step": 27945 + }, + { + "epoch": 1.57, + "grad_norm": 11.24726881885029, + "learning_rate": 5.436894432428153e-06, + "loss": 0.841, + "step": 27950 + }, + { + "epoch": 1.58, + "grad_norm": 6.7477453811371015, + "learning_rate": 5.435261665952252e-06, + "loss": 0.8138, + "step": 27955 + }, + { + "epoch": 1.58, + "grad_norm": 9.59863456757837, + "learning_rate": 5.433628852705565e-06, + "loss": 0.8524, + "step": 27960 + }, + { + "epoch": 1.58, + "grad_norm": 8.281543679950202, + "learning_rate": 5.431995992863542e-06, + "loss": 0.8141, + "step": 27965 + }, + { + "epoch": 1.58, + "grad_norm": 9.853037424161569, + "learning_rate": 5.430363086601646e-06, + "loss": 0.8229, + "step": 27970 + }, + { + "epoch": 1.58, + "grad_norm": 11.118403261749481, + "learning_rate": 5.428730134095336e-06, + "loss": 0.8422, + "step": 27975 + }, + { + "epoch": 1.58, + "grad_norm": 13.127501702032474, + "learning_rate": 5.4270971355200795e-06, + "loss": 0.8262, + "step": 27980 + }, + { + "epoch": 1.58, + "grad_norm": 9.403048819659286, + "learning_rate": 5.4254640910513525e-06, + "loss": 0.842, + "step": 27985 + }, + { + "epoch": 1.58, + "grad_norm": 13.816020851418337, + "learning_rate": 5.42383100086463e-06, + "loss": 0.8253, + "step": 27990 + }, + { + "epoch": 1.58, + "grad_norm": 10.133337369541252, + "learning_rate": 5.422197865135397e-06, + "loss": 0.9429, + "step": 27995 + }, + { + "epoch": 1.58, + "grad_norm": 11.782257755899195, + "learning_rate": 5.420564684039139e-06, + "loss": 0.8021, + "step": 28000 + }, + { + "epoch": 1.58, + "grad_norm": 6.532147995135568, + "learning_rate": 5.4189314577513495e-06, + "loss": 0.8192, + "step": 28005 + }, + { + "epoch": 1.58, + "grad_norm": 9.78199884547465, + "learning_rate": 5.417298186447527e-06, + "loss": 0.7816, + "step": 28010 + }, + { + "epoch": 1.58, + "grad_norm": 4.928969019553306, + "learning_rate": 5.415664870303172e-06, + "loss": 0.845, + "step": 28015 + }, + { + "epoch": 1.58, + "grad_norm": 4.633092908974432, + "learning_rate": 5.414031509493792e-06, + "loss": 0.8306, + "step": 28020 + }, + { + "epoch": 1.58, + "grad_norm": 6.095078882652489, + "learning_rate": 5.4123981041949005e-06, + "loss": 0.827, + "step": 28025 + }, + { + "epoch": 1.58, + "grad_norm": 9.938725798644738, + "learning_rate": 5.410764654582012e-06, + "loss": 0.8444, + "step": 28030 + }, + { + "epoch": 1.58, + "grad_norm": 5.951926269678413, + "learning_rate": 5.409131160830649e-06, + "loss": 0.781, + "step": 28035 + }, + { + "epoch": 1.58, + "grad_norm": 5.150375771619542, + "learning_rate": 5.407497623116336e-06, + "loss": 0.8226, + "step": 28040 + }, + { + "epoch": 1.58, + "grad_norm": 14.32529151836401, + "learning_rate": 5.405864041614604e-06, + "loss": 0.8172, + "step": 28045 + }, + { + "epoch": 1.58, + "grad_norm": 9.692926834476918, + "learning_rate": 5.40423041650099e-06, + "loss": 0.839, + "step": 28050 + }, + { + "epoch": 1.58, + "grad_norm": 16.009573418508833, + "learning_rate": 5.402596747951036e-06, + "loss": 0.8159, + "step": 28055 + }, + { + "epoch": 1.58, + "grad_norm": 6.999411297179064, + "learning_rate": 5.400963036140283e-06, + "loss": 0.8206, + "step": 28060 + }, + { + "epoch": 1.58, + "grad_norm": 18.888704546367954, + "learning_rate": 5.399329281244283e-06, + "loss": 0.8445, + "step": 28065 + }, + { + "epoch": 1.58, + "grad_norm": 6.24314780906232, + "learning_rate": 5.397695483438588e-06, + "loss": 0.8414, + "step": 28070 + }, + { + "epoch": 1.58, + "grad_norm": 6.532741570267251, + "learning_rate": 5.39606164289876e-06, + "loss": 0.8034, + "step": 28075 + }, + { + "epoch": 1.58, + "grad_norm": 8.245586825990241, + "learning_rate": 5.394427759800359e-06, + "loss": 0.849, + "step": 28080 + }, + { + "epoch": 1.58, + "grad_norm": 6.087065643864672, + "learning_rate": 5.392793834318954e-06, + "loss": 0.8166, + "step": 28085 + }, + { + "epoch": 1.58, + "grad_norm": 5.660077410121367, + "learning_rate": 5.3911598666301206e-06, + "loss": 0.8042, + "step": 28090 + }, + { + "epoch": 1.58, + "grad_norm": 8.129664756007115, + "learning_rate": 5.38952585690943e-06, + "loss": 0.7934, + "step": 28095 + }, + { + "epoch": 1.58, + "grad_norm": 6.758722826603849, + "learning_rate": 5.387891805332469e-06, + "loss": 0.8012, + "step": 28100 + }, + { + "epoch": 1.58, + "grad_norm": 7.35020321715153, + "learning_rate": 5.3862577120748204e-06, + "loss": 0.8362, + "step": 28105 + }, + { + "epoch": 1.58, + "grad_norm": 14.948839984601692, + "learning_rate": 5.384623577312077e-06, + "loss": 0.7891, + "step": 28110 + }, + { + "epoch": 1.58, + "grad_norm": 11.983929128502247, + "learning_rate": 5.382989401219834e-06, + "loss": 0.8467, + "step": 28115 + }, + { + "epoch": 1.58, + "grad_norm": 7.532246618009143, + "learning_rate": 5.381355183973688e-06, + "loss": 0.8319, + "step": 28120 + }, + { + "epoch": 1.58, + "grad_norm": 6.237831590175402, + "learning_rate": 5.379720925749243e-06, + "loss": 0.8106, + "step": 28125 + }, + { + "epoch": 1.58, + "grad_norm": 12.704105732080835, + "learning_rate": 5.378086626722111e-06, + "loss": 0.8412, + "step": 28130 + }, + { + "epoch": 1.59, + "grad_norm": 5.420320694062168, + "learning_rate": 5.376452287067901e-06, + "loss": 0.7876, + "step": 28135 + }, + { + "epoch": 1.59, + "grad_norm": 14.405119387129808, + "learning_rate": 5.37481790696223e-06, + "loss": 0.8545, + "step": 28140 + }, + { + "epoch": 1.59, + "grad_norm": 14.403413268390825, + "learning_rate": 5.373183486580724e-06, + "loss": 0.8172, + "step": 28145 + }, + { + "epoch": 1.59, + "grad_norm": 7.302438246491855, + "learning_rate": 5.371549026099004e-06, + "loss": 0.8505, + "step": 28150 + }, + { + "epoch": 1.59, + "grad_norm": 6.282060502041304, + "learning_rate": 5.369914525692701e-06, + "loss": 0.8316, + "step": 28155 + }, + { + "epoch": 1.59, + "grad_norm": 6.323682826733916, + "learning_rate": 5.3682799855374505e-06, + "loss": 0.8347, + "step": 28160 + }, + { + "epoch": 1.59, + "grad_norm": 6.227848468789792, + "learning_rate": 5.366645405808889e-06, + "loss": 0.826, + "step": 28165 + }, + { + "epoch": 1.59, + "grad_norm": 5.618638011999, + "learning_rate": 5.3650107866826605e-06, + "loss": 0.8574, + "step": 28170 + }, + { + "epoch": 1.59, + "grad_norm": 6.267553653194656, + "learning_rate": 5.3633761283344145e-06, + "loss": 0.8136, + "step": 28175 + }, + { + "epoch": 1.59, + "grad_norm": 13.54685896451495, + "learning_rate": 5.361741430939796e-06, + "loss": 0.8709, + "step": 28180 + }, + { + "epoch": 1.59, + "grad_norm": 11.247908029266732, + "learning_rate": 5.360106694674467e-06, + "loss": 0.8234, + "step": 28185 + }, + { + "epoch": 1.59, + "grad_norm": 6.683044212708232, + "learning_rate": 5.358471919714083e-06, + "loss": 0.8416, + "step": 28190 + }, + { + "epoch": 1.59, + "grad_norm": 8.054085673351551, + "learning_rate": 5.3568371062343095e-06, + "loss": 0.8172, + "step": 28195 + }, + { + "epoch": 1.59, + "grad_norm": 8.030692635069453, + "learning_rate": 5.355202254410815e-06, + "loss": 0.8039, + "step": 28200 + }, + { + "epoch": 1.59, + "grad_norm": 24.589980772210108, + "learning_rate": 5.35356736441927e-06, + "loss": 0.8337, + "step": 28205 + }, + { + "epoch": 1.59, + "grad_norm": 6.812933030426917, + "learning_rate": 5.351932436435352e-06, + "loss": 0.8023, + "step": 28210 + }, + { + "epoch": 1.59, + "grad_norm": 5.462925966717622, + "learning_rate": 5.350297470634739e-06, + "loss": 0.8422, + "step": 28215 + }, + { + "epoch": 1.59, + "grad_norm": 12.128437956796128, + "learning_rate": 5.348662467193117e-06, + "loss": 0.8196, + "step": 28220 + }, + { + "epoch": 1.59, + "grad_norm": 5.437050767796805, + "learning_rate": 5.347027426286175e-06, + "loss": 0.7703, + "step": 28225 + }, + { + "epoch": 1.59, + "grad_norm": 9.495761817884926, + "learning_rate": 5.3453923480896045e-06, + "loss": 0.8727, + "step": 28230 + }, + { + "epoch": 1.59, + "grad_norm": 4.910424106159469, + "learning_rate": 5.343757232779101e-06, + "loss": 0.8417, + "step": 28235 + }, + { + "epoch": 1.59, + "grad_norm": 5.15136226937716, + "learning_rate": 5.342122080530366e-06, + "loss": 0.7904, + "step": 28240 + }, + { + "epoch": 1.59, + "grad_norm": 14.786258876532818, + "learning_rate": 5.340486891519104e-06, + "loss": 0.8356, + "step": 28245 + }, + { + "epoch": 1.59, + "grad_norm": 4.425838374468625, + "learning_rate": 5.338851665921023e-06, + "loss": 0.8012, + "step": 28250 + }, + { + "epoch": 1.59, + "grad_norm": 9.029136058538585, + "learning_rate": 5.337216403911835e-06, + "loss": 0.8284, + "step": 28255 + }, + { + "epoch": 1.59, + "grad_norm": 6.539465752975717, + "learning_rate": 5.335581105667257e-06, + "loss": 0.8912, + "step": 28260 + }, + { + "epoch": 1.59, + "grad_norm": 7.122478646096978, + "learning_rate": 5.333945771363007e-06, + "loss": 0.8147, + "step": 28265 + }, + { + "epoch": 1.59, + "grad_norm": 6.8301488665732215, + "learning_rate": 5.3323104011748095e-06, + "loss": 0.82, + "step": 28270 + }, + { + "epoch": 1.59, + "grad_norm": 5.454402230807887, + "learning_rate": 5.330674995278392e-06, + "loss": 0.8273, + "step": 28275 + }, + { + "epoch": 1.59, + "grad_norm": 4.442497445804376, + "learning_rate": 5.3290395538494884e-06, + "loss": 0.774, + "step": 28280 + }, + { + "epoch": 1.59, + "grad_norm": 5.509011519846112, + "learning_rate": 5.327404077063833e-06, + "loss": 0.8503, + "step": 28285 + }, + { + "epoch": 1.59, + "grad_norm": 6.119290469516933, + "learning_rate": 5.325768565097165e-06, + "loss": 0.8425, + "step": 28290 + }, + { + "epoch": 1.59, + "grad_norm": 7.794450221973544, + "learning_rate": 5.324133018125225e-06, + "loss": 0.8286, + "step": 28295 + }, + { + "epoch": 1.59, + "grad_norm": 5.116707377438561, + "learning_rate": 5.322497436323762e-06, + "loss": 0.8461, + "step": 28300 + }, + { + "epoch": 1.59, + "grad_norm": 5.9167396023247125, + "learning_rate": 5.320861819868528e-06, + "loss": 0.8355, + "step": 28305 + }, + { + "epoch": 1.6, + "grad_norm": 10.810364930544731, + "learning_rate": 5.319226168935274e-06, + "loss": 0.8558, + "step": 28310 + }, + { + "epoch": 1.6, + "grad_norm": 18.762813564943713, + "learning_rate": 5.317590483699759e-06, + "loss": 0.8194, + "step": 28315 + }, + { + "epoch": 1.6, + "grad_norm": 9.088033894341939, + "learning_rate": 5.315954764337745e-06, + "loss": 0.8797, + "step": 28320 + }, + { + "epoch": 1.6, + "grad_norm": 10.968115299170364, + "learning_rate": 5.314319011024997e-06, + "loss": 0.799, + "step": 28325 + }, + { + "epoch": 1.6, + "grad_norm": 20.125631095807982, + "learning_rate": 5.312683223937286e-06, + "loss": 0.8644, + "step": 28330 + }, + { + "epoch": 1.6, + "grad_norm": 9.920482514170674, + "learning_rate": 5.311047403250381e-06, + "loss": 0.8641, + "step": 28335 + }, + { + "epoch": 1.6, + "grad_norm": 5.541664732561482, + "learning_rate": 5.30941154914006e-06, + "loss": 0.8346, + "step": 28340 + }, + { + "epoch": 1.6, + "grad_norm": 6.105027711919444, + "learning_rate": 5.307775661782103e-06, + "loss": 0.8226, + "step": 28345 + }, + { + "epoch": 1.6, + "grad_norm": 9.637906092546576, + "learning_rate": 5.306139741352292e-06, + "loss": 0.8212, + "step": 28350 + }, + { + "epoch": 1.6, + "grad_norm": 4.838597684914096, + "learning_rate": 5.304503788026415e-06, + "loss": 0.8369, + "step": 28355 + }, + { + "epoch": 1.6, + "grad_norm": 7.083702528464722, + "learning_rate": 5.302867801980263e-06, + "loss": 0.8499, + "step": 28360 + }, + { + "epoch": 1.6, + "grad_norm": 8.065812029607843, + "learning_rate": 5.3012317833896275e-06, + "loss": 0.8237, + "step": 28365 + }, + { + "epoch": 1.6, + "grad_norm": 6.070149019654948, + "learning_rate": 5.299595732430307e-06, + "loss": 0.8599, + "step": 28370 + }, + { + "epoch": 1.6, + "grad_norm": 5.644554902747811, + "learning_rate": 5.297959649278105e-06, + "loss": 0.8012, + "step": 28375 + }, + { + "epoch": 1.6, + "grad_norm": 6.663294659461215, + "learning_rate": 5.296323534108823e-06, + "loss": 0.8384, + "step": 28380 + }, + { + "epoch": 1.6, + "grad_norm": 15.04330433603099, + "learning_rate": 5.294687387098268e-06, + "loss": 0.8057, + "step": 28385 + }, + { + "epoch": 1.6, + "grad_norm": 7.311084241460337, + "learning_rate": 5.293051208422254e-06, + "loss": 0.8146, + "step": 28390 + }, + { + "epoch": 1.6, + "grad_norm": 13.021395805213237, + "learning_rate": 5.2914149982565945e-06, + "loss": 0.8392, + "step": 28395 + }, + { + "epoch": 1.6, + "grad_norm": 7.8248191892494425, + "learning_rate": 5.289778756777107e-06, + "loss": 0.8171, + "step": 28400 + }, + { + "epoch": 1.6, + "grad_norm": 5.4583143913361685, + "learning_rate": 5.288142484159614e-06, + "loss": 0.8439, + "step": 28405 + }, + { + "epoch": 1.6, + "grad_norm": 9.20549804743247, + "learning_rate": 5.2865061805799364e-06, + "loss": 0.7952, + "step": 28410 + }, + { + "epoch": 1.6, + "grad_norm": 11.586153347066439, + "learning_rate": 5.284869846213908e-06, + "loss": 0.7838, + "step": 28415 + }, + { + "epoch": 1.6, + "grad_norm": 9.322082353890254, + "learning_rate": 5.283233481237357e-06, + "loss": 0.8923, + "step": 28420 + }, + { + "epoch": 1.6, + "grad_norm": 5.567110742476824, + "learning_rate": 5.281597085826118e-06, + "loss": 0.773, + "step": 28425 + }, + { + "epoch": 1.6, + "grad_norm": 6.71610228404068, + "learning_rate": 5.2799606601560305e-06, + "loss": 0.8532, + "step": 28430 + }, + { + "epoch": 1.6, + "grad_norm": 10.171136241192343, + "learning_rate": 5.278324204402934e-06, + "loss": 0.7937, + "step": 28435 + }, + { + "epoch": 1.6, + "grad_norm": 6.155123385249471, + "learning_rate": 5.276687718742672e-06, + "loss": 0.8554, + "step": 28440 + }, + { + "epoch": 1.6, + "grad_norm": 6.064522669111217, + "learning_rate": 5.275051203351095e-06, + "loss": 0.8011, + "step": 28445 + }, + { + "epoch": 1.6, + "grad_norm": 5.544786271309227, + "learning_rate": 5.2734146584040515e-06, + "loss": 0.8425, + "step": 28450 + }, + { + "epoch": 1.6, + "grad_norm": 18.444782487313137, + "learning_rate": 5.2717780840773966e-06, + "loss": 0.8349, + "step": 28455 + }, + { + "epoch": 1.6, + "grad_norm": 9.317603540270103, + "learning_rate": 5.270141480546986e-06, + "loss": 0.8091, + "step": 28460 + }, + { + "epoch": 1.6, + "grad_norm": 5.180428451338997, + "learning_rate": 5.268504847988682e-06, + "loss": 0.8081, + "step": 28465 + }, + { + "epoch": 1.6, + "grad_norm": 5.209484459801911, + "learning_rate": 5.266868186578348e-06, + "loss": 0.8028, + "step": 28470 + }, + { + "epoch": 1.6, + "grad_norm": 6.031280521102857, + "learning_rate": 5.265231496491849e-06, + "loss": 0.8212, + "step": 28475 + }, + { + "epoch": 1.6, + "grad_norm": 21.740918196196784, + "learning_rate": 5.263594777905055e-06, + "loss": 0.8097, + "step": 28480 + }, + { + "epoch": 1.6, + "grad_norm": 14.710009659556956, + "learning_rate": 5.261958030993839e-06, + "loss": 0.8248, + "step": 28485 + }, + { + "epoch": 1.61, + "grad_norm": 6.624321534245403, + "learning_rate": 5.2603212559340766e-06, + "loss": 0.8303, + "step": 28490 + }, + { + "epoch": 1.61, + "grad_norm": 10.159543330024958, + "learning_rate": 5.2586844529016455e-06, + "loss": 0.8359, + "step": 28495 + }, + { + "epoch": 1.61, + "grad_norm": 8.356023775654574, + "learning_rate": 5.257047622072429e-06, + "loss": 0.8248, + "step": 28500 + }, + { + "epoch": 1.61, + "grad_norm": 6.950501614008669, + "learning_rate": 5.25541076362231e-06, + "loss": 0.7807, + "step": 28505 + }, + { + "epoch": 1.61, + "grad_norm": 6.302670648120601, + "learning_rate": 5.253773877727177e-06, + "loss": 0.8055, + "step": 28510 + }, + { + "epoch": 1.61, + "grad_norm": 6.127714021436408, + "learning_rate": 5.252136964562924e-06, + "loss": 0.8283, + "step": 28515 + }, + { + "epoch": 1.61, + "grad_norm": 7.3778428849436155, + "learning_rate": 5.250500024305439e-06, + "loss": 0.845, + "step": 28520 + }, + { + "epoch": 1.61, + "grad_norm": 6.1378916131104955, + "learning_rate": 5.248863057130621e-06, + "loss": 0.804, + "step": 28525 + }, + { + "epoch": 1.61, + "grad_norm": 7.645583237418911, + "learning_rate": 5.247226063214369e-06, + "loss": 0.8339, + "step": 28530 + }, + { + "epoch": 1.61, + "grad_norm": 5.039580870099732, + "learning_rate": 5.245589042732586e-06, + "loss": 0.8129, + "step": 28535 + }, + { + "epoch": 1.61, + "grad_norm": 6.2790213479552435, + "learning_rate": 5.243951995861176e-06, + "loss": 0.8125, + "step": 28540 + }, + { + "epoch": 1.61, + "grad_norm": 9.55020039178158, + "learning_rate": 5.242314922776048e-06, + "loss": 0.8256, + "step": 28545 + }, + { + "epoch": 1.61, + "grad_norm": 6.6076665561491135, + "learning_rate": 5.240677823653112e-06, + "loss": 0.8748, + "step": 28550 + }, + { + "epoch": 1.61, + "grad_norm": 24.278825335381025, + "learning_rate": 5.23904069866828e-06, + "loss": 0.8338, + "step": 28555 + }, + { + "epoch": 1.61, + "grad_norm": 11.42729182361593, + "learning_rate": 5.237403547997471e-06, + "loss": 0.8013, + "step": 28560 + }, + { + "epoch": 1.61, + "grad_norm": 9.773900554676061, + "learning_rate": 5.235766371816604e-06, + "loss": 0.8201, + "step": 28565 + }, + { + "epoch": 1.61, + "grad_norm": 5.500273051139277, + "learning_rate": 5.2341291703016e-06, + "loss": 0.8157, + "step": 28570 + }, + { + "epoch": 1.61, + "grad_norm": 22.024888483130667, + "learning_rate": 5.232491943628384e-06, + "loss": 0.8235, + "step": 28575 + }, + { + "epoch": 1.61, + "grad_norm": 17.465939287461797, + "learning_rate": 5.230854691972881e-06, + "loss": 0.8051, + "step": 28580 + }, + { + "epoch": 1.61, + "grad_norm": 28.469470431834168, + "learning_rate": 5.229217415511022e-06, + "loss": 0.8297, + "step": 28585 + }, + { + "epoch": 1.61, + "grad_norm": 19.61513964826842, + "learning_rate": 5.227580114418741e-06, + "loss": 0.7875, + "step": 28590 + }, + { + "epoch": 1.61, + "grad_norm": 28.960400754131758, + "learning_rate": 5.225942788871971e-06, + "loss": 0.9221, + "step": 28595 + }, + { + "epoch": 1.61, + "grad_norm": 34.85775133734265, + "learning_rate": 5.224305439046653e-06, + "loss": 0.826, + "step": 28600 + }, + { + "epoch": 1.61, + "grad_norm": 32.89610012348651, + "learning_rate": 5.222668065118725e-06, + "loss": 0.8416, + "step": 28605 + }, + { + "epoch": 1.61, + "grad_norm": 21.755337613991, + "learning_rate": 5.221030667264131e-06, + "loss": 0.8629, + "step": 28610 + }, + { + "epoch": 1.61, + "grad_norm": 30.940812572866243, + "learning_rate": 5.219393245658816e-06, + "loss": 0.8116, + "step": 28615 + }, + { + "epoch": 1.61, + "grad_norm": 4.6509867370032225, + "learning_rate": 5.217755800478728e-06, + "loss": 0.8112, + "step": 28620 + }, + { + "epoch": 1.61, + "grad_norm": 5.937239650798151, + "learning_rate": 5.216118331899819e-06, + "loss": 0.7856, + "step": 28625 + }, + { + "epoch": 1.61, + "grad_norm": 33.073728592417766, + "learning_rate": 5.214480840098042e-06, + "loss": 0.8371, + "step": 28630 + }, + { + "epoch": 1.61, + "grad_norm": 5.417096497324796, + "learning_rate": 5.21284332524935e-06, + "loss": 0.7839, + "step": 28635 + }, + { + "epoch": 1.61, + "grad_norm": 16.509024976662598, + "learning_rate": 5.211205787529704e-06, + "loss": 0.8257, + "step": 28640 + }, + { + "epoch": 1.61, + "grad_norm": 6.721006779388342, + "learning_rate": 5.209568227115065e-06, + "loss": 0.8432, + "step": 28645 + }, + { + "epoch": 1.61, + "grad_norm": 11.370181041668513, + "learning_rate": 5.207930644181393e-06, + "loss": 0.8082, + "step": 28650 + }, + { + "epoch": 1.61, + "grad_norm": 7.423895454065968, + "learning_rate": 5.206293038904658e-06, + "loss": 0.7768, + "step": 28655 + }, + { + "epoch": 1.61, + "grad_norm": 4.835741904843163, + "learning_rate": 5.204655411460826e-06, + "loss": 0.8313, + "step": 28660 + }, + { + "epoch": 1.62, + "grad_norm": 8.913292766298175, + "learning_rate": 5.203017762025866e-06, + "loss": 0.8042, + "step": 28665 + }, + { + "epoch": 1.62, + "grad_norm": 15.02807283367277, + "learning_rate": 5.20138009077575e-06, + "loss": 0.8219, + "step": 28670 + }, + { + "epoch": 1.62, + "grad_norm": 5.091339301424439, + "learning_rate": 5.199742397886457e-06, + "loss": 0.8608, + "step": 28675 + }, + { + "epoch": 1.62, + "grad_norm": 5.285834745633364, + "learning_rate": 5.19810468353396e-06, + "loss": 0.8222, + "step": 28680 + }, + { + "epoch": 1.62, + "grad_norm": 5.516574615927582, + "learning_rate": 5.196466947894241e-06, + "loss": 0.815, + "step": 28685 + }, + { + "epoch": 1.62, + "grad_norm": 5.0057631378192236, + "learning_rate": 5.194829191143281e-06, + "loss": 0.8031, + "step": 28690 + }, + { + "epoch": 1.62, + "grad_norm": 5.965740632432216, + "learning_rate": 5.193191413457065e-06, + "loss": 0.8364, + "step": 28695 + }, + { + "epoch": 1.62, + "grad_norm": 5.045844744196989, + "learning_rate": 5.1915536150115785e-06, + "loss": 0.8203, + "step": 28700 + }, + { + "epoch": 1.62, + "grad_norm": 7.274808808539857, + "learning_rate": 5.1899157959828105e-06, + "loss": 0.8586, + "step": 28705 + }, + { + "epoch": 1.62, + "grad_norm": 5.203246946244696, + "learning_rate": 5.188277956546753e-06, + "loss": 0.7896, + "step": 28710 + }, + { + "epoch": 1.62, + "grad_norm": 4.9627082070989506, + "learning_rate": 5.186640096879398e-06, + "loss": 0.7943, + "step": 28715 + }, + { + "epoch": 1.62, + "grad_norm": 4.90119774631027, + "learning_rate": 5.185002217156742e-06, + "loss": 0.8242, + "step": 28720 + }, + { + "epoch": 1.62, + "grad_norm": 4.557675697750803, + "learning_rate": 5.18336431755478e-06, + "loss": 0.8143, + "step": 28725 + }, + { + "epoch": 1.62, + "grad_norm": 4.881661502349697, + "learning_rate": 5.1817263982495115e-06, + "loss": 0.8028, + "step": 28730 + }, + { + "epoch": 1.62, + "grad_norm": 6.085382369715009, + "learning_rate": 5.18008845941694e-06, + "loss": 0.7911, + "step": 28735 + }, + { + "epoch": 1.62, + "grad_norm": 5.216198443207247, + "learning_rate": 5.17845050123307e-06, + "loss": 0.813, + "step": 28740 + }, + { + "epoch": 1.62, + "grad_norm": 4.685846069694916, + "learning_rate": 5.176812523873907e-06, + "loss": 0.8364, + "step": 28745 + }, + { + "epoch": 1.62, + "grad_norm": 4.908023317253362, + "learning_rate": 5.175174527515456e-06, + "loss": 0.834, + "step": 28750 + }, + { + "epoch": 1.62, + "grad_norm": 5.344547124321556, + "learning_rate": 5.173536512333729e-06, + "loss": 0.7877, + "step": 28755 + }, + { + "epoch": 1.62, + "grad_norm": 7.776375872443901, + "learning_rate": 5.171898478504738e-06, + "loss": 0.7963, + "step": 28760 + }, + { + "epoch": 1.62, + "grad_norm": 6.1794716386261195, + "learning_rate": 5.170260426204497e-06, + "loss": 0.8484, + "step": 28765 + }, + { + "epoch": 1.62, + "grad_norm": 5.451058636147341, + "learning_rate": 5.1686223556090224e-06, + "loss": 0.8327, + "step": 28770 + }, + { + "epoch": 1.62, + "grad_norm": 5.46441247888714, + "learning_rate": 5.1669842668943306e-06, + "loss": 0.8248, + "step": 28775 + }, + { + "epoch": 1.62, + "grad_norm": 4.877061812176017, + "learning_rate": 5.165346160236443e-06, + "loss": 0.7816, + "step": 28780 + }, + { + "epoch": 1.62, + "grad_norm": 8.741123780733577, + "learning_rate": 5.16370803581138e-06, + "loss": 0.8224, + "step": 28785 + }, + { + "epoch": 1.62, + "grad_norm": 4.88265021141814, + "learning_rate": 5.162069893795167e-06, + "loss": 0.8327, + "step": 28790 + }, + { + "epoch": 1.62, + "grad_norm": 6.586186895688275, + "learning_rate": 5.160431734363828e-06, + "loss": 0.8383, + "step": 28795 + }, + { + "epoch": 1.62, + "grad_norm": 15.773180350430062, + "learning_rate": 5.158793557693392e-06, + "loss": 0.8317, + "step": 28800 + }, + { + "epoch": 1.62, + "grad_norm": 10.26473750806853, + "learning_rate": 5.157155363959887e-06, + "loss": 0.8056, + "step": 28805 + }, + { + "epoch": 1.62, + "grad_norm": 17.082843396271144, + "learning_rate": 5.155517153339344e-06, + "loss": 0.8532, + "step": 28810 + }, + { + "epoch": 1.62, + "grad_norm": 6.019847333823887, + "learning_rate": 5.153878926007797e-06, + "loss": 0.8124, + "step": 28815 + }, + { + "epoch": 1.62, + "grad_norm": 15.5005656325108, + "learning_rate": 5.15224068214128e-06, + "loss": 0.8311, + "step": 28820 + }, + { + "epoch": 1.62, + "grad_norm": 11.202641021959506, + "learning_rate": 5.15060242191583e-06, + "loss": 0.8252, + "step": 28825 + }, + { + "epoch": 1.62, + "grad_norm": 7.522168519484391, + "learning_rate": 5.1489641455074846e-06, + "loss": 0.7817, + "step": 28830 + }, + { + "epoch": 1.62, + "grad_norm": 10.19026913316636, + "learning_rate": 5.147325853092285e-06, + "loss": 0.8029, + "step": 28835 + }, + { + "epoch": 1.62, + "grad_norm": 7.068565292854005, + "learning_rate": 5.145687544846272e-06, + "loss": 0.848, + "step": 28840 + }, + { + "epoch": 1.63, + "grad_norm": 9.18378413557943, + "learning_rate": 5.144049220945489e-06, + "loss": 0.8461, + "step": 28845 + }, + { + "epoch": 1.63, + "grad_norm": 5.8838921355026566, + "learning_rate": 5.142410881565982e-06, + "loss": 0.7872, + "step": 28850 + }, + { + "epoch": 1.63, + "grad_norm": 8.665106347052056, + "learning_rate": 5.140772526883797e-06, + "loss": 0.7971, + "step": 28855 + }, + { + "epoch": 1.63, + "grad_norm": 5.1166735629730065, + "learning_rate": 5.139134157074984e-06, + "loss": 0.8436, + "step": 28860 + }, + { + "epoch": 1.63, + "grad_norm": 6.847427139454322, + "learning_rate": 5.137495772315591e-06, + "loss": 0.8124, + "step": 28865 + }, + { + "epoch": 1.63, + "grad_norm": 8.837221321593361, + "learning_rate": 5.1358573727816685e-06, + "loss": 0.8529, + "step": 28870 + }, + { + "epoch": 1.63, + "grad_norm": 15.201604849290314, + "learning_rate": 5.134218958649274e-06, + "loss": 0.8133, + "step": 28875 + }, + { + "epoch": 1.63, + "grad_norm": 5.032899750780574, + "learning_rate": 5.132580530094459e-06, + "loss": 0.8408, + "step": 28880 + }, + { + "epoch": 1.63, + "grad_norm": 5.202869799994652, + "learning_rate": 5.130942087293283e-06, + "loss": 0.8099, + "step": 28885 + }, + { + "epoch": 1.63, + "grad_norm": 11.6021648021502, + "learning_rate": 5.1293036304217995e-06, + "loss": 0.8057, + "step": 28890 + }, + { + "epoch": 1.63, + "grad_norm": 6.195860451037316, + "learning_rate": 5.12766515965607e-06, + "loss": 0.7707, + "step": 28895 + }, + { + "epoch": 1.63, + "grad_norm": 18.125250980532645, + "learning_rate": 5.126026675172157e-06, + "loss": 0.8227, + "step": 28900 + }, + { + "epoch": 1.63, + "grad_norm": 6.398595748643314, + "learning_rate": 5.124388177146121e-06, + "loss": 0.7838, + "step": 28905 + }, + { + "epoch": 1.63, + "grad_norm": 5.700082787818464, + "learning_rate": 5.122749665754026e-06, + "loss": 0.8456, + "step": 28910 + }, + { + "epoch": 1.63, + "grad_norm": 23.825412981295585, + "learning_rate": 5.121111141171938e-06, + "loss": 0.8168, + "step": 28915 + }, + { + "epoch": 1.63, + "grad_norm": 7.916138066122154, + "learning_rate": 5.119472603575924e-06, + "loss": 0.8153, + "step": 28920 + }, + { + "epoch": 1.63, + "grad_norm": 6.0793455181871625, + "learning_rate": 5.117834053142051e-06, + "loss": 0.8472, + "step": 28925 + }, + { + "epoch": 1.63, + "grad_norm": 6.323977949052801, + "learning_rate": 5.116195490046388e-06, + "loss": 0.8472, + "step": 28930 + }, + { + "epoch": 1.63, + "grad_norm": 5.238936639245196, + "learning_rate": 5.114556914465009e-06, + "loss": 0.7972, + "step": 28935 + }, + { + "epoch": 1.63, + "grad_norm": 5.243687229405834, + "learning_rate": 5.112918326573982e-06, + "loss": 0.8409, + "step": 28940 + }, + { + "epoch": 1.63, + "grad_norm": 5.783046907850406, + "learning_rate": 5.111279726549384e-06, + "loss": 0.7658, + "step": 28945 + }, + { + "epoch": 1.63, + "grad_norm": 5.4566288577281625, + "learning_rate": 5.109641114567288e-06, + "loss": 0.7915, + "step": 28950 + }, + { + "epoch": 1.63, + "grad_norm": 5.741691260101394, + "learning_rate": 5.1080024908037694e-06, + "loss": 0.86, + "step": 28955 + }, + { + "epoch": 1.63, + "grad_norm": 5.270059758269816, + "learning_rate": 5.106363855434907e-06, + "loss": 0.8123, + "step": 28960 + }, + { + "epoch": 1.63, + "grad_norm": 5.904700105678346, + "learning_rate": 5.104725208636777e-06, + "loss": 0.8222, + "step": 28965 + }, + { + "epoch": 1.63, + "grad_norm": 11.837541181724244, + "learning_rate": 5.103086550585463e-06, + "loss": 0.8085, + "step": 28970 + }, + { + "epoch": 1.63, + "grad_norm": 8.179304702646292, + "learning_rate": 5.101447881457046e-06, + "loss": 0.8227, + "step": 28975 + }, + { + "epoch": 1.63, + "grad_norm": 4.776609548219907, + "learning_rate": 5.099809201427603e-06, + "loss": 0.7846, + "step": 28980 + }, + { + "epoch": 1.63, + "grad_norm": 11.730764020627527, + "learning_rate": 5.098170510673224e-06, + "loss": 0.7935, + "step": 28985 + }, + { + "epoch": 1.63, + "grad_norm": 10.635731099123605, + "learning_rate": 5.096531809369988e-06, + "loss": 0.8109, + "step": 28990 + }, + { + "epoch": 1.63, + "grad_norm": 9.810360155246453, + "learning_rate": 5.094893097693983e-06, + "loss": 0.8293, + "step": 28995 + }, + { + "epoch": 1.63, + "grad_norm": 5.460276744279129, + "learning_rate": 5.093254375821299e-06, + "loss": 0.7785, + "step": 29000 + }, + { + "epoch": 1.63, + "grad_norm": 4.728778861040613, + "learning_rate": 5.091615643928016e-06, + "loss": 0.789, + "step": 29005 + }, + { + "epoch": 1.63, + "grad_norm": 5.868446201290173, + "learning_rate": 5.08997690219023e-06, + "loss": 0.8192, + "step": 29010 + }, + { + "epoch": 1.63, + "grad_norm": 4.8510699300250755, + "learning_rate": 5.088338150784029e-06, + "loss": 0.7578, + "step": 29015 + }, + { + "epoch": 1.64, + "grad_norm": 5.562075881914659, + "learning_rate": 5.0866993898855045e-06, + "loss": 0.8352, + "step": 29020 + }, + { + "epoch": 1.64, + "grad_norm": 5.143850679057571, + "learning_rate": 5.085060619670746e-06, + "loss": 0.7261, + "step": 29025 + }, + { + "epoch": 1.64, + "grad_norm": 7.397323078474872, + "learning_rate": 5.083421840315851e-06, + "loss": 0.8438, + "step": 29030 + }, + { + "epoch": 1.64, + "grad_norm": 13.808400027460724, + "learning_rate": 5.08178305199691e-06, + "loss": 0.8454, + "step": 29035 + }, + { + "epoch": 1.64, + "grad_norm": 5.881506130414715, + "learning_rate": 5.0801442548900195e-06, + "loss": 0.8391, + "step": 29040 + }, + { + "epoch": 1.64, + "grad_norm": 7.679740233421449, + "learning_rate": 5.078505449171274e-06, + "loss": 0.8092, + "step": 29045 + }, + { + "epoch": 1.64, + "grad_norm": 10.404697141770395, + "learning_rate": 5.076866635016772e-06, + "loss": 0.7824, + "step": 29050 + }, + { + "epoch": 1.64, + "grad_norm": 4.904767650446284, + "learning_rate": 5.07522781260261e-06, + "loss": 0.8483, + "step": 29055 + }, + { + "epoch": 1.64, + "grad_norm": 25.718605229696344, + "learning_rate": 5.073588982104889e-06, + "loss": 0.8033, + "step": 29060 + }, + { + "epoch": 1.64, + "grad_norm": 7.759656116234666, + "learning_rate": 5.071950143699707e-06, + "loss": 0.801, + "step": 29065 + }, + { + "epoch": 1.64, + "grad_norm": 10.32674223064685, + "learning_rate": 5.070311297563163e-06, + "loss": 0.811, + "step": 29070 + }, + { + "epoch": 1.64, + "grad_norm": 5.494047609200806, + "learning_rate": 5.068672443871361e-06, + "loss": 0.8213, + "step": 29075 + }, + { + "epoch": 1.64, + "grad_norm": 6.18498999013527, + "learning_rate": 5.067033582800401e-06, + "loss": 0.8329, + "step": 29080 + }, + { + "epoch": 1.64, + "grad_norm": 4.603159807360061, + "learning_rate": 5.0653947145263885e-06, + "loss": 0.7768, + "step": 29085 + }, + { + "epoch": 1.64, + "grad_norm": 10.705809007745918, + "learning_rate": 5.0637558392254235e-06, + "loss": 0.8532, + "step": 29090 + }, + { + "epoch": 1.64, + "grad_norm": 7.485124137036119, + "learning_rate": 5.062116957073613e-06, + "loss": 0.7952, + "step": 29095 + }, + { + "epoch": 1.64, + "grad_norm": 8.993705068357134, + "learning_rate": 5.06047806824706e-06, + "loss": 0.8336, + "step": 29100 + }, + { + "epoch": 1.64, + "grad_norm": 5.681963323212381, + "learning_rate": 5.058839172921873e-06, + "loss": 0.797, + "step": 29105 + }, + { + "epoch": 1.64, + "grad_norm": 5.111160127137747, + "learning_rate": 5.057200271274156e-06, + "loss": 0.8228, + "step": 29110 + }, + { + "epoch": 1.64, + "grad_norm": 9.411693347426782, + "learning_rate": 5.05556136348002e-06, + "loss": 0.8309, + "step": 29115 + }, + { + "epoch": 1.64, + "grad_norm": 5.62345410385421, + "learning_rate": 5.05392244971557e-06, + "loss": 0.8151, + "step": 29120 + }, + { + "epoch": 1.64, + "grad_norm": 6.933159233687968, + "learning_rate": 5.052283530156912e-06, + "loss": 0.8178, + "step": 29125 + }, + { + "epoch": 1.64, + "grad_norm": 7.04589105362451, + "learning_rate": 5.050644604980159e-06, + "loss": 0.7979, + "step": 29130 + }, + { + "epoch": 1.64, + "grad_norm": 5.277252822990433, + "learning_rate": 5.04900567436142e-06, + "loss": 0.7749, + "step": 29135 + }, + { + "epoch": 1.64, + "grad_norm": 8.322892892107303, + "learning_rate": 5.0473667384768054e-06, + "loss": 0.7885, + "step": 29140 + }, + { + "epoch": 1.64, + "grad_norm": 20.882763964089886, + "learning_rate": 5.045727797502426e-06, + "loss": 0.8309, + "step": 29145 + }, + { + "epoch": 1.64, + "grad_norm": 8.806843017144521, + "learning_rate": 5.044088851614391e-06, + "loss": 0.7765, + "step": 29150 + }, + { + "epoch": 1.64, + "grad_norm": 6.324009815413327, + "learning_rate": 5.0424499009888165e-06, + "loss": 0.8049, + "step": 29155 + }, + { + "epoch": 1.64, + "grad_norm": 11.711334284462785, + "learning_rate": 5.040810945801812e-06, + "loss": 0.7728, + "step": 29160 + }, + { + "epoch": 1.64, + "grad_norm": 7.8084473931244585, + "learning_rate": 5.039171986229491e-06, + "loss": 0.8313, + "step": 29165 + }, + { + "epoch": 1.64, + "grad_norm": 9.524139245799343, + "learning_rate": 5.037533022447967e-06, + "loss": 0.8093, + "step": 29170 + }, + { + "epoch": 1.64, + "grad_norm": 5.73566929702185, + "learning_rate": 5.0358940546333555e-06, + "loss": 0.8271, + "step": 29175 + }, + { + "epoch": 1.64, + "grad_norm": 4.6962732035184604, + "learning_rate": 5.034255082961768e-06, + "loss": 0.7423, + "step": 29180 + }, + { + "epoch": 1.64, + "grad_norm": 5.533090871706895, + "learning_rate": 5.032616107609322e-06, + "loss": 0.8142, + "step": 29185 + }, + { + "epoch": 1.64, + "grad_norm": 6.408774450153409, + "learning_rate": 5.0309771287521295e-06, + "loss": 0.8627, + "step": 29190 + }, + { + "epoch": 1.64, + "grad_norm": 8.156362265169408, + "learning_rate": 5.029338146566307e-06, + "loss": 0.7984, + "step": 29195 + }, + { + "epoch": 1.65, + "grad_norm": 10.754148694201769, + "learning_rate": 5.027699161227975e-06, + "loss": 0.8132, + "step": 29200 + }, + { + "epoch": 1.65, + "grad_norm": 4.874061883796169, + "learning_rate": 5.026060172913243e-06, + "loss": 0.785, + "step": 29205 + }, + { + "epoch": 1.65, + "grad_norm": 8.070895173347584, + "learning_rate": 5.024421181798231e-06, + "loss": 0.7783, + "step": 29210 + }, + { + "epoch": 1.65, + "grad_norm": 14.08443732562972, + "learning_rate": 5.022782188059056e-06, + "loss": 0.8524, + "step": 29215 + }, + { + "epoch": 1.65, + "grad_norm": 7.392687815404849, + "learning_rate": 5.021143191871834e-06, + "loss": 0.7729, + "step": 29220 + }, + { + "epoch": 1.65, + "grad_norm": 12.464790980083453, + "learning_rate": 5.019504193412682e-06, + "loss": 0.8182, + "step": 29225 + }, + { + "epoch": 1.65, + "grad_norm": 4.628883840032938, + "learning_rate": 5.01786519285772e-06, + "loss": 0.7888, + "step": 29230 + }, + { + "epoch": 1.65, + "grad_norm": 10.04323033822842, + "learning_rate": 5.016226190383061e-06, + "loss": 0.8187, + "step": 29235 + }, + { + "epoch": 1.65, + "grad_norm": 6.726518575213119, + "learning_rate": 5.014587186164828e-06, + "loss": 0.7887, + "step": 29240 + }, + { + "epoch": 1.65, + "grad_norm": 5.160618859165872, + "learning_rate": 5.012948180379138e-06, + "loss": 0.7665, + "step": 29245 + }, + { + "epoch": 1.65, + "grad_norm": 5.291806868973104, + "learning_rate": 5.011309173202108e-06, + "loss": 0.8027, + "step": 29250 + }, + { + "epoch": 1.65, + "grad_norm": 5.45783451634518, + "learning_rate": 5.009670164809858e-06, + "loss": 0.7657, + "step": 29255 + }, + { + "epoch": 1.65, + "grad_norm": 7.4830726465245, + "learning_rate": 5.0080311553785075e-06, + "loss": 0.7731, + "step": 29260 + }, + { + "epoch": 1.65, + "grad_norm": 4.546063778080442, + "learning_rate": 5.006392145084172e-06, + "loss": 0.7672, + "step": 29265 + }, + { + "epoch": 1.65, + "grad_norm": 5.775772578098124, + "learning_rate": 5.004753134102973e-06, + "loss": 0.8346, + "step": 29270 + }, + { + "epoch": 1.65, + "grad_norm": 7.5794976908939935, + "learning_rate": 5.003114122611028e-06, + "loss": 0.7771, + "step": 29275 + }, + { + "epoch": 1.65, + "grad_norm": 6.762264800576519, + "learning_rate": 5.0014751107844576e-06, + "loss": 0.8228, + "step": 29280 + }, + { + "epoch": 1.65, + "grad_norm": 4.9314046914117835, + "learning_rate": 4.999836098799379e-06, + "loss": 0.7801, + "step": 29285 + }, + { + "epoch": 1.65, + "grad_norm": 9.2197424090684, + "learning_rate": 4.998197086831912e-06, + "loss": 0.8282, + "step": 29290 + }, + { + "epoch": 1.65, + "grad_norm": 7.863892250220879, + "learning_rate": 4.996558075058179e-06, + "loss": 0.7849, + "step": 29295 + }, + { + "epoch": 1.65, + "grad_norm": 9.88712624080577, + "learning_rate": 4.994919063654295e-06, + "loss": 0.8118, + "step": 29300 + }, + { + "epoch": 1.65, + "grad_norm": 5.899540341700078, + "learning_rate": 4.993280052796378e-06, + "loss": 0.8211, + "step": 29305 + }, + { + "epoch": 1.65, + "grad_norm": 6.366164063496896, + "learning_rate": 4.99164104266055e-06, + "loss": 0.7674, + "step": 29310 + }, + { + "epoch": 1.65, + "grad_norm": 6.0300986008164665, + "learning_rate": 4.990002033422928e-06, + "loss": 0.8071, + "step": 29315 + }, + { + "epoch": 1.65, + "grad_norm": 6.294204839684711, + "learning_rate": 4.988363025259635e-06, + "loss": 0.7988, + "step": 29320 + }, + { + "epoch": 1.65, + "grad_norm": 5.682600958364213, + "learning_rate": 4.986724018346782e-06, + "loss": 0.7934, + "step": 29325 + }, + { + "epoch": 1.65, + "grad_norm": 10.708390759590833, + "learning_rate": 4.985085012860493e-06, + "loss": 0.829, + "step": 29330 + }, + { + "epoch": 1.65, + "grad_norm": 9.758827242074737, + "learning_rate": 4.983446008976887e-06, + "loss": 0.7857, + "step": 29335 + }, + { + "epoch": 1.65, + "grad_norm": 24.59076245051149, + "learning_rate": 4.9818070068720785e-06, + "loss": 0.8186, + "step": 29340 + }, + { + "epoch": 1.65, + "grad_norm": 10.659853242933357, + "learning_rate": 4.980168006722189e-06, + "loss": 0.8073, + "step": 29345 + }, + { + "epoch": 1.65, + "grad_norm": 15.466058576157828, + "learning_rate": 4.978529008703333e-06, + "loss": 0.8042, + "step": 29350 + }, + { + "epoch": 1.65, + "grad_norm": 13.89881613598247, + "learning_rate": 4.976890012991631e-06, + "loss": 0.7984, + "step": 29355 + }, + { + "epoch": 1.65, + "grad_norm": 18.077320619114, + "learning_rate": 4.9752510197632e-06, + "loss": 0.8476, + "step": 29360 + }, + { + "epoch": 1.65, + "grad_norm": 11.424010477811986, + "learning_rate": 4.973612029194154e-06, + "loss": 0.8136, + "step": 29365 + }, + { + "epoch": 1.65, + "grad_norm": 5.770990192838112, + "learning_rate": 4.971973041460613e-06, + "loss": 0.8047, + "step": 29370 + }, + { + "epoch": 1.66, + "grad_norm": 9.662160218025914, + "learning_rate": 4.970334056738692e-06, + "loss": 0.8108, + "step": 29375 + }, + { + "epoch": 1.66, + "grad_norm": 6.411098912694886, + "learning_rate": 4.968695075204508e-06, + "loss": 0.7702, + "step": 29380 + }, + { + "epoch": 1.66, + "grad_norm": 7.7155223274576015, + "learning_rate": 4.967056097034177e-06, + "loss": 0.8121, + "step": 29385 + }, + { + "epoch": 1.66, + "grad_norm": 5.371586893614241, + "learning_rate": 4.965417122403811e-06, + "loss": 0.7996, + "step": 29390 + }, + { + "epoch": 1.66, + "grad_norm": 5.282520779370752, + "learning_rate": 4.9637781514895315e-06, + "loss": 0.8254, + "step": 29395 + }, + { + "epoch": 1.66, + "grad_norm": 6.877233785679275, + "learning_rate": 4.962139184467446e-06, + "loss": 0.7821, + "step": 29400 + }, + { + "epoch": 1.66, + "grad_norm": 6.062370566173023, + "learning_rate": 4.960500221513675e-06, + "loss": 0.8165, + "step": 29405 + }, + { + "epoch": 1.66, + "grad_norm": 5.062883174542722, + "learning_rate": 4.958861262804327e-06, + "loss": 0.8096, + "step": 29410 + }, + { + "epoch": 1.66, + "grad_norm": 5.344973583028538, + "learning_rate": 4.95722230851552e-06, + "loss": 0.7902, + "step": 29415 + }, + { + "epoch": 1.66, + "grad_norm": 5.269743878820086, + "learning_rate": 4.955583358823364e-06, + "loss": 0.7993, + "step": 29420 + }, + { + "epoch": 1.66, + "grad_norm": 5.024395665734921, + "learning_rate": 4.953944413903971e-06, + "loss": 0.8432, + "step": 29425 + }, + { + "epoch": 1.66, + "grad_norm": 6.421035473683998, + "learning_rate": 4.952305473933456e-06, + "loss": 0.7857, + "step": 29430 + }, + { + "epoch": 1.66, + "grad_norm": 5.588717028826829, + "learning_rate": 4.950666539087927e-06, + "loss": 0.78, + "step": 29435 + }, + { + "epoch": 1.66, + "grad_norm": 7.632604368468013, + "learning_rate": 4.949027609543497e-06, + "loss": 0.7669, + "step": 29440 + }, + { + "epoch": 1.66, + "grad_norm": 7.630758459389194, + "learning_rate": 4.947388685476276e-06, + "loss": 0.7864, + "step": 29445 + }, + { + "epoch": 1.66, + "grad_norm": 5.018305897013323, + "learning_rate": 4.94574976706237e-06, + "loss": 0.7852, + "step": 29450 + }, + { + "epoch": 1.66, + "grad_norm": 6.948471414354077, + "learning_rate": 4.944110854477894e-06, + "loss": 0.8035, + "step": 29455 + }, + { + "epoch": 1.66, + "grad_norm": 7.109579858646023, + "learning_rate": 4.9424719478989515e-06, + "loss": 0.7884, + "step": 29460 + }, + { + "epoch": 1.66, + "grad_norm": 5.266712112485102, + "learning_rate": 4.940833047501652e-06, + "loss": 0.8263, + "step": 29465 + }, + { + "epoch": 1.66, + "grad_norm": 7.535824088404048, + "learning_rate": 4.939194153462103e-06, + "loss": 0.8198, + "step": 29470 + }, + { + "epoch": 1.66, + "grad_norm": 13.107778575655036, + "learning_rate": 4.937555265956411e-06, + "loss": 0.8127, + "step": 29475 + }, + { + "epoch": 1.66, + "grad_norm": 5.346506795232058, + "learning_rate": 4.935916385160683e-06, + "loss": 0.8092, + "step": 29480 + }, + { + "epoch": 1.66, + "grad_norm": 9.557190475765134, + "learning_rate": 4.934277511251019e-06, + "loss": 0.8308, + "step": 29485 + }, + { + "epoch": 1.66, + "grad_norm": 6.675250707152214, + "learning_rate": 4.932638644403529e-06, + "loss": 0.8183, + "step": 29490 + }, + { + "epoch": 1.66, + "grad_norm": 4.956476470838433, + "learning_rate": 4.930999784794311e-06, + "loss": 0.8023, + "step": 29495 + }, + { + "epoch": 1.66, + "grad_norm": 4.7709799701961035, + "learning_rate": 4.929360932599474e-06, + "loss": 0.8033, + "step": 29500 + }, + { + "epoch": 1.66, + "grad_norm": 6.2637823075954335, + "learning_rate": 4.927722087995115e-06, + "loss": 0.776, + "step": 29505 + }, + { + "epoch": 1.66, + "grad_norm": 5.495390031682441, + "learning_rate": 4.926083251157336e-06, + "loss": 0.8069, + "step": 29510 + }, + { + "epoch": 1.66, + "grad_norm": 6.54021499002464, + "learning_rate": 4.924444422262239e-06, + "loss": 0.807, + "step": 29515 + }, + { + "epoch": 1.66, + "grad_norm": 5.066093691697654, + "learning_rate": 4.922805601485921e-06, + "loss": 0.7629, + "step": 29520 + }, + { + "epoch": 1.66, + "grad_norm": 6.9980763077862305, + "learning_rate": 4.9211667890044835e-06, + "loss": 0.7896, + "step": 29525 + }, + { + "epoch": 1.66, + "grad_norm": 7.452395057883908, + "learning_rate": 4.919527984994023e-06, + "loss": 0.806, + "step": 29530 + }, + { + "epoch": 1.66, + "grad_norm": 18.906589815923898, + "learning_rate": 4.9178891896306326e-06, + "loss": 0.8162, + "step": 29535 + }, + { + "epoch": 1.66, + "grad_norm": 4.809767699924567, + "learning_rate": 4.916250403090414e-06, + "loss": 0.8011, + "step": 29540 + }, + { + "epoch": 1.66, + "grad_norm": 12.363769083652064, + "learning_rate": 4.914611625549456e-06, + "loss": 0.7945, + "step": 29545 + }, + { + "epoch": 1.66, + "grad_norm": 12.202218114203786, + "learning_rate": 4.912972857183859e-06, + "loss": 0.786, + "step": 29550 + }, + { + "epoch": 1.67, + "grad_norm": 10.660272967854556, + "learning_rate": 4.91133409816971e-06, + "loss": 0.8108, + "step": 29555 + }, + { + "epoch": 1.67, + "grad_norm": 9.476911932929125, + "learning_rate": 4.909695348683104e-06, + "loss": 0.7923, + "step": 29560 + }, + { + "epoch": 1.67, + "grad_norm": 7.806783499374322, + "learning_rate": 4.908056608900133e-06, + "loss": 0.8095, + "step": 29565 + }, + { + "epoch": 1.67, + "grad_norm": 5.138858436291346, + "learning_rate": 4.906417878996883e-06, + "loss": 0.7477, + "step": 29570 + }, + { + "epoch": 1.67, + "grad_norm": 9.895158280181477, + "learning_rate": 4.9047791591494475e-06, + "loss": 0.8601, + "step": 29575 + }, + { + "epoch": 1.67, + "grad_norm": 8.47706193638025, + "learning_rate": 4.903140449533909e-06, + "loss": 0.8103, + "step": 29580 + }, + { + "epoch": 1.67, + "grad_norm": 11.993411544865312, + "learning_rate": 4.901501750326359e-06, + "loss": 0.8425, + "step": 29585 + }, + { + "epoch": 1.67, + "grad_norm": 5.702668991067775, + "learning_rate": 4.899863061702881e-06, + "loss": 0.8218, + "step": 29590 + }, + { + "epoch": 1.67, + "grad_norm": 6.2106236377777675, + "learning_rate": 4.898224383839556e-06, + "loss": 0.8052, + "step": 29595 + }, + { + "epoch": 1.67, + "grad_norm": 5.815829897033973, + "learning_rate": 4.8965857169124714e-06, + "loss": 0.8285, + "step": 29600 + }, + { + "epoch": 1.67, + "grad_norm": 5.7280018731393865, + "learning_rate": 4.894947061097708e-06, + "loss": 0.7939, + "step": 29605 + }, + { + "epoch": 1.67, + "grad_norm": 5.512877569309113, + "learning_rate": 4.893308416571347e-06, + "loss": 0.7994, + "step": 29610 + }, + { + "epoch": 1.67, + "grad_norm": 7.532257727972996, + "learning_rate": 4.891669783509468e-06, + "loss": 0.8087, + "step": 29615 + }, + { + "epoch": 1.67, + "grad_norm": 5.9606685351204, + "learning_rate": 4.8900311620881464e-06, + "loss": 0.7874, + "step": 29620 + }, + { + "epoch": 1.67, + "grad_norm": 6.126604625139472, + "learning_rate": 4.888392552483465e-06, + "loss": 0.7716, + "step": 29625 + }, + { + "epoch": 1.67, + "grad_norm": 5.565607606006656, + "learning_rate": 4.8867539548714936e-06, + "loss": 0.7736, + "step": 29630 + }, + { + "epoch": 1.67, + "grad_norm": 6.165534621343919, + "learning_rate": 4.885115369428311e-06, + "loss": 0.8528, + "step": 29635 + }, + { + "epoch": 1.67, + "grad_norm": 11.433104211919282, + "learning_rate": 4.883476796329988e-06, + "loss": 0.8179, + "step": 29640 + }, + { + "epoch": 1.67, + "grad_norm": 5.976847490224073, + "learning_rate": 4.881838235752599e-06, + "loss": 0.7883, + "step": 29645 + }, + { + "epoch": 1.67, + "grad_norm": 8.353315632211304, + "learning_rate": 4.8801996878722115e-06, + "loss": 0.7951, + "step": 29650 + }, + { + "epoch": 1.67, + "grad_norm": 5.689167601321623, + "learning_rate": 4.878561152864896e-06, + "loss": 0.7685, + "step": 29655 + }, + { + "epoch": 1.67, + "grad_norm": 9.763086204971069, + "learning_rate": 4.8769226309067225e-06, + "loss": 0.808, + "step": 29660 + }, + { + "epoch": 1.67, + "grad_norm": 5.235599067821967, + "learning_rate": 4.875284122173753e-06, + "loss": 0.8003, + "step": 29665 + }, + { + "epoch": 1.67, + "grad_norm": 10.036504165449317, + "learning_rate": 4.873645626842058e-06, + "loss": 0.8137, + "step": 29670 + }, + { + "epoch": 1.67, + "grad_norm": 12.135836833289432, + "learning_rate": 4.872007145087698e-06, + "loss": 0.7674, + "step": 29675 + }, + { + "epoch": 1.67, + "grad_norm": 11.506328651735684, + "learning_rate": 4.870368677086732e-06, + "loss": 0.8207, + "step": 29680 + }, + { + "epoch": 1.67, + "grad_norm": 5.629886198396468, + "learning_rate": 4.868730223015227e-06, + "loss": 0.8029, + "step": 29685 + }, + { + "epoch": 1.67, + "grad_norm": 8.824316072670978, + "learning_rate": 4.867091783049236e-06, + "loss": 0.8197, + "step": 29690 + }, + { + "epoch": 1.67, + "grad_norm": 8.418927302091987, + "learning_rate": 4.865453357364821e-06, + "loss": 0.7727, + "step": 29695 + }, + { + "epoch": 1.67, + "grad_norm": 5.571949221645414, + "learning_rate": 4.8638149461380364e-06, + "loss": 0.7894, + "step": 29700 + }, + { + "epoch": 1.67, + "grad_norm": 13.467381923539769, + "learning_rate": 4.8621765495449354e-06, + "loss": 0.8143, + "step": 29705 + }, + { + "epoch": 1.67, + "grad_norm": 5.121398006803582, + "learning_rate": 4.860538167761575e-06, + "loss": 0.8145, + "step": 29710 + }, + { + "epoch": 1.67, + "grad_norm": 12.141009686601915, + "learning_rate": 4.858899800964002e-06, + "loss": 0.8161, + "step": 29715 + }, + { + "epoch": 1.67, + "grad_norm": 5.815248446997574, + "learning_rate": 4.85726144932827e-06, + "loss": 0.7666, + "step": 29720 + }, + { + "epoch": 1.67, + "grad_norm": 10.300760267108112, + "learning_rate": 4.855623113030423e-06, + "loss": 0.7932, + "step": 29725 + }, + { + "epoch": 1.68, + "grad_norm": 15.222183631700164, + "learning_rate": 4.8539847922465115e-06, + "loss": 0.8275, + "step": 29730 + }, + { + "epoch": 1.68, + "grad_norm": 11.599627578657428, + "learning_rate": 4.8523464871525785e-06, + "loss": 0.7501, + "step": 29735 + }, + { + "epoch": 1.68, + "grad_norm": 9.447940565078039, + "learning_rate": 4.850708197924666e-06, + "loss": 0.7678, + "step": 29740 + }, + { + "epoch": 1.68, + "grad_norm": 12.126041805046706, + "learning_rate": 4.8490699247388165e-06, + "loss": 0.8297, + "step": 29745 + }, + { + "epoch": 1.68, + "grad_norm": 5.746869691429128, + "learning_rate": 4.8474316677710695e-06, + "loss": 0.8213, + "step": 29750 + }, + { + "epoch": 1.68, + "grad_norm": 10.25782994771493, + "learning_rate": 4.845793427197465e-06, + "loss": 0.7632, + "step": 29755 + }, + { + "epoch": 1.68, + "grad_norm": 14.112215242568517, + "learning_rate": 4.844155203194036e-06, + "loss": 0.8075, + "step": 29760 + }, + { + "epoch": 1.68, + "grad_norm": 16.247021941465757, + "learning_rate": 4.842516995936817e-06, + "loss": 0.7925, + "step": 29765 + }, + { + "epoch": 1.68, + "grad_norm": 15.407277703325768, + "learning_rate": 4.840878805601845e-06, + "loss": 0.7922, + "step": 29770 + }, + { + "epoch": 1.68, + "grad_norm": 19.07675960183057, + "learning_rate": 4.839240632365144e-06, + "loss": 0.7974, + "step": 29775 + }, + { + "epoch": 1.68, + "grad_norm": 5.337796061320492, + "learning_rate": 4.83760247640275e-06, + "loss": 0.7534, + "step": 29780 + }, + { + "epoch": 1.68, + "grad_norm": 28.93401640164091, + "learning_rate": 4.835964337890684e-06, + "loss": 0.7885, + "step": 29785 + }, + { + "epoch": 1.68, + "grad_norm": 17.14149833183626, + "learning_rate": 4.834326217004974e-06, + "loss": 0.8125, + "step": 29790 + }, + { + "epoch": 1.68, + "grad_norm": 10.61939976416113, + "learning_rate": 4.8326881139216446e-06, + "loss": 0.7885, + "step": 29795 + }, + { + "epoch": 1.68, + "grad_norm": 5.308973243131987, + "learning_rate": 4.831050028816713e-06, + "loss": 0.7826, + "step": 29800 + }, + { + "epoch": 1.68, + "grad_norm": 5.166485818628002, + "learning_rate": 4.829411961866205e-06, + "loss": 0.7902, + "step": 29805 + }, + { + "epoch": 1.68, + "grad_norm": 6.235460655298076, + "learning_rate": 4.827773913246132e-06, + "loss": 0.7853, + "step": 29810 + }, + { + "epoch": 1.68, + "grad_norm": 7.012300261771474, + "learning_rate": 4.826135883132514e-06, + "loss": 0.7916, + "step": 29815 + }, + { + "epoch": 1.68, + "grad_norm": 19.4966381445275, + "learning_rate": 4.824497871701363e-06, + "loss": 0.7992, + "step": 29820 + }, + { + "epoch": 1.68, + "grad_norm": 5.7356534738914435, + "learning_rate": 4.822859879128687e-06, + "loss": 0.7997, + "step": 29825 + }, + { + "epoch": 1.68, + "grad_norm": 12.403234122506506, + "learning_rate": 4.821221905590501e-06, + "loss": 0.8447, + "step": 29830 + }, + { + "epoch": 1.68, + "grad_norm": 7.960980550710589, + "learning_rate": 4.81958395126281e-06, + "loss": 0.7782, + "step": 29835 + }, + { + "epoch": 1.68, + "grad_norm": 7.2252945598010765, + "learning_rate": 4.817946016321619e-06, + "loss": 0.8208, + "step": 29840 + }, + { + "epoch": 1.68, + "grad_norm": 5.996738294284722, + "learning_rate": 4.816308100942935e-06, + "loss": 0.7859, + "step": 29845 + }, + { + "epoch": 1.68, + "grad_norm": 6.681301926252344, + "learning_rate": 4.8146702053027525e-06, + "loss": 0.7857, + "step": 29850 + }, + { + "epoch": 1.68, + "grad_norm": 4.932112347604734, + "learning_rate": 4.813032329577076e-06, + "loss": 0.7957, + "step": 29855 + }, + { + "epoch": 1.68, + "grad_norm": 5.849151009617056, + "learning_rate": 4.8113944739419e-06, + "loss": 0.8329, + "step": 29860 + }, + { + "epoch": 1.68, + "grad_norm": 5.625374211275668, + "learning_rate": 4.809756638573222e-06, + "loss": 0.8123, + "step": 29865 + }, + { + "epoch": 1.68, + "grad_norm": 5.362709994828513, + "learning_rate": 4.8081188236470315e-06, + "loss": 0.7906, + "step": 29870 + }, + { + "epoch": 1.68, + "grad_norm": 6.880402705932108, + "learning_rate": 4.806481029339321e-06, + "loss": 0.7437, + "step": 29875 + }, + { + "epoch": 1.68, + "grad_norm": 10.69619434877722, + "learning_rate": 4.8048432558260765e-06, + "loss": 0.7687, + "step": 29880 + }, + { + "epoch": 1.68, + "grad_norm": 19.418283962974726, + "learning_rate": 4.803205503283285e-06, + "loss": 0.8102, + "step": 29885 + }, + { + "epoch": 1.68, + "grad_norm": 5.137728309150263, + "learning_rate": 4.801567771886933e-06, + "loss": 0.8295, + "step": 29890 + }, + { + "epoch": 1.68, + "grad_norm": 12.98234463446497, + "learning_rate": 4.7999300618129976e-06, + "loss": 0.7419, + "step": 29895 + }, + { + "epoch": 1.68, + "grad_norm": 4.746731612064017, + "learning_rate": 4.798292373237462e-06, + "loss": 0.8162, + "step": 29900 + }, + { + "epoch": 1.68, + "grad_norm": 8.864035556704577, + "learning_rate": 4.796654706336301e-06, + "loss": 0.8153, + "step": 29905 + }, + { + "epoch": 1.69, + "grad_norm": 6.749274723640831, + "learning_rate": 4.795017061285487e-06, + "loss": 0.8055, + "step": 29910 + }, + { + "epoch": 1.69, + "grad_norm": 4.622351675017694, + "learning_rate": 4.793379438260997e-06, + "loss": 0.8148, + "step": 29915 + }, + { + "epoch": 1.69, + "grad_norm": 5.051848746409057, + "learning_rate": 4.7917418374387955e-06, + "loss": 0.7543, + "step": 29920 + }, + { + "epoch": 1.69, + "grad_norm": 13.258996248407842, + "learning_rate": 4.790104258994855e-06, + "loss": 0.7768, + "step": 29925 + }, + { + "epoch": 1.69, + "grad_norm": 5.097692331287701, + "learning_rate": 4.788466703105138e-06, + "loss": 0.7802, + "step": 29930 + }, + { + "epoch": 1.69, + "grad_norm": 10.725187362488219, + "learning_rate": 4.786829169945606e-06, + "loss": 0.7569, + "step": 29935 + }, + { + "epoch": 1.69, + "grad_norm": 7.264251665243966, + "learning_rate": 4.785191659692222e-06, + "loss": 0.824, + "step": 29940 + }, + { + "epoch": 1.69, + "grad_norm": 6.892749424970571, + "learning_rate": 4.78355417252094e-06, + "loss": 0.7977, + "step": 29945 + }, + { + "epoch": 1.69, + "grad_norm": 5.152772257028792, + "learning_rate": 4.78191670860772e-06, + "loss": 0.7677, + "step": 29950 + }, + { + "epoch": 1.69, + "grad_norm": 6.7418885709930025, + "learning_rate": 4.78027926812851e-06, + "loss": 0.8035, + "step": 29955 + }, + { + "epoch": 1.69, + "grad_norm": 5.703054725185859, + "learning_rate": 4.778641851259264e-06, + "loss": 0.8008, + "step": 29960 + }, + { + "epoch": 1.69, + "grad_norm": 16.151759196592305, + "learning_rate": 4.777004458175928e-06, + "loss": 0.8152, + "step": 29965 + }, + { + "epoch": 1.69, + "grad_norm": 5.910392749323405, + "learning_rate": 4.775367089054446e-06, + "loss": 0.7963, + "step": 29970 + }, + { + "epoch": 1.69, + "grad_norm": 14.67542640599338, + "learning_rate": 4.773729744070762e-06, + "loss": 0.7965, + "step": 29975 + }, + { + "epoch": 1.69, + "grad_norm": 19.08530116647858, + "learning_rate": 4.772092423400815e-06, + "loss": 0.8069, + "step": 29980 + }, + { + "epoch": 1.69, + "grad_norm": 6.09800617392457, + "learning_rate": 4.770455127220544e-06, + "loss": 0.7727, + "step": 29985 + }, + { + "epoch": 1.69, + "grad_norm": 33.23931494505973, + "learning_rate": 4.7688178557058834e-06, + "loss": 0.7698, + "step": 29990 + }, + { + "epoch": 1.69, + "grad_norm": 12.856668275163106, + "learning_rate": 4.7671806090327625e-06, + "loss": 0.7984, + "step": 29995 + }, + { + "epoch": 1.69, + "grad_norm": 29.485450180586472, + "learning_rate": 4.765543387377115e-06, + "loss": 0.8264, + "step": 30000 + }, + { + "epoch": 1.69, + "grad_norm": 10.888031482974746, + "learning_rate": 4.763906190914862e-06, + "loss": 0.8033, + "step": 30005 + }, + { + "epoch": 1.69, + "grad_norm": 19.47157323006894, + "learning_rate": 4.762269019821933e-06, + "loss": 0.8286, + "step": 30010 + }, + { + "epoch": 1.69, + "grad_norm": 12.841732621791369, + "learning_rate": 4.760631874274247e-06, + "loss": 0.8062, + "step": 30015 + }, + { + "epoch": 1.69, + "grad_norm": 9.181694156915245, + "learning_rate": 4.758994754447721e-06, + "loss": 0.8381, + "step": 30020 + }, + { + "epoch": 1.69, + "grad_norm": 9.925935935029198, + "learning_rate": 4.757357660518275e-06, + "loss": 0.8215, + "step": 30025 + }, + { + "epoch": 1.69, + "grad_norm": 5.24092344867244, + "learning_rate": 4.755720592661816e-06, + "loss": 0.8017, + "step": 30030 + }, + { + "epoch": 1.69, + "grad_norm": 9.308951157601896, + "learning_rate": 4.75408355105426e-06, + "loss": 0.7926, + "step": 30035 + }, + { + "epoch": 1.69, + "grad_norm": 5.639978704363475, + "learning_rate": 4.752446535871509e-06, + "loss": 0.7751, + "step": 30040 + }, + { + "epoch": 1.69, + "grad_norm": 14.448822047152843, + "learning_rate": 4.750809547289473e-06, + "loss": 0.8199, + "step": 30045 + }, + { + "epoch": 1.69, + "grad_norm": 5.910283424670611, + "learning_rate": 4.7491725854840516e-06, + "loss": 0.7898, + "step": 30050 + }, + { + "epoch": 1.69, + "grad_norm": 5.488005216345067, + "learning_rate": 4.74753565063114e-06, + "loss": 0.766, + "step": 30055 + }, + { + "epoch": 1.69, + "grad_norm": 5.585112345932322, + "learning_rate": 4.7458987429066374e-06, + "loss": 0.8018, + "step": 30060 + }, + { + "epoch": 1.69, + "grad_norm": 6.352788415478089, + "learning_rate": 4.744261862486438e-06, + "loss": 0.8029, + "step": 30065 + }, + { + "epoch": 1.69, + "grad_norm": 12.564595572553094, + "learning_rate": 4.742625009546429e-06, + "loss": 0.7866, + "step": 30070 + }, + { + "epoch": 1.69, + "grad_norm": 6.3516501002682055, + "learning_rate": 4.740988184262499e-06, + "loss": 0.7552, + "step": 30075 + }, + { + "epoch": 1.69, + "grad_norm": 12.031661111625874, + "learning_rate": 4.73935138681053e-06, + "loss": 0.7799, + "step": 30080 + }, + { + "epoch": 1.7, + "grad_norm": 7.047777136097501, + "learning_rate": 4.737714617366408e-06, + "loss": 0.797, + "step": 30085 + }, + { + "epoch": 1.7, + "grad_norm": 16.841185629960563, + "learning_rate": 4.736077876106005e-06, + "loss": 0.7988, + "step": 30090 + }, + { + "epoch": 1.7, + "grad_norm": 6.902412267033933, + "learning_rate": 4.734441163205202e-06, + "loss": 0.7972, + "step": 30095 + }, + { + "epoch": 1.7, + "grad_norm": 7.899823930188095, + "learning_rate": 4.7328044788398655e-06, + "loss": 0.822, + "step": 30100 + }, + { + "epoch": 1.7, + "grad_norm": 5.244104167681226, + "learning_rate": 4.731167823185867e-06, + "loss": 0.7611, + "step": 30105 + }, + { + "epoch": 1.7, + "grad_norm": 5.031387987540888, + "learning_rate": 4.729531196419074e-06, + "loss": 0.7859, + "step": 30110 + }, + { + "epoch": 1.7, + "grad_norm": 9.551106765441283, + "learning_rate": 4.727894598715345e-06, + "loss": 0.8148, + "step": 30115 + }, + { + "epoch": 1.7, + "grad_norm": 7.9615410825668045, + "learning_rate": 4.726258030250545e-06, + "loss": 0.774, + "step": 30120 + }, + { + "epoch": 1.7, + "grad_norm": 5.299898513292915, + "learning_rate": 4.7246214912005255e-06, + "loss": 0.7852, + "step": 30125 + }, + { + "epoch": 1.7, + "grad_norm": 10.080392471708402, + "learning_rate": 4.7229849817411445e-06, + "loss": 0.7968, + "step": 30130 + }, + { + "epoch": 1.7, + "grad_norm": 5.385539634015399, + "learning_rate": 4.7213485020482495e-06, + "loss": 0.8311, + "step": 30135 + }, + { + "epoch": 1.7, + "grad_norm": 6.118844227317314, + "learning_rate": 4.7197120522976855e-06, + "loss": 0.7762, + "step": 30140 + }, + { + "epoch": 1.7, + "grad_norm": 10.017188656833131, + "learning_rate": 4.718075632665301e-06, + "loss": 0.7788, + "step": 30145 + }, + { + "epoch": 1.7, + "grad_norm": 4.61556780088029, + "learning_rate": 4.7164392433269314e-06, + "loss": 0.8016, + "step": 30150 + }, + { + "epoch": 1.7, + "grad_norm": 7.635523592967406, + "learning_rate": 4.7148028844584185e-06, + "loss": 0.7737, + "step": 30155 + }, + { + "epoch": 1.7, + "grad_norm": 4.846318759564663, + "learning_rate": 4.713166556235594e-06, + "loss": 0.8064, + "step": 30160 + }, + { + "epoch": 1.7, + "grad_norm": 5.30424642437604, + "learning_rate": 4.7115302588342885e-06, + "loss": 0.7453, + "step": 30165 + }, + { + "epoch": 1.7, + "grad_norm": 4.831306235957011, + "learning_rate": 4.709893992430331e-06, + "loss": 0.7741, + "step": 30170 + }, + { + "epoch": 1.7, + "grad_norm": 5.330872623207505, + "learning_rate": 4.708257757199542e-06, + "loss": 0.7999, + "step": 30175 + }, + { + "epoch": 1.7, + "grad_norm": 25.378696340898358, + "learning_rate": 4.706621553317746e-06, + "loss": 0.7982, + "step": 30180 + }, + { + "epoch": 1.7, + "grad_norm": 5.688725928598445, + "learning_rate": 4.7049853809607604e-06, + "loss": 0.7798, + "step": 30185 + }, + { + "epoch": 1.7, + "grad_norm": 10.230638946395093, + "learning_rate": 4.703349240304395e-06, + "loss": 0.7861, + "step": 30190 + }, + { + "epoch": 1.7, + "grad_norm": 5.038961700679986, + "learning_rate": 4.701713131524464e-06, + "loss": 0.7871, + "step": 30195 + }, + { + "epoch": 1.7, + "grad_norm": 25.196247234254738, + "learning_rate": 4.7000770547967735e-06, + "loss": 0.8176, + "step": 30200 + }, + { + "epoch": 1.7, + "grad_norm": 10.882688341047091, + "learning_rate": 4.6984410102971265e-06, + "loss": 0.7958, + "step": 30205 + }, + { + "epoch": 1.7, + "grad_norm": 8.982819953624391, + "learning_rate": 4.696804998201324e-06, + "loss": 0.7492, + "step": 30210 + }, + { + "epoch": 1.7, + "grad_norm": 9.202877011319478, + "learning_rate": 4.695169018685164e-06, + "loss": 0.7599, + "step": 30215 + }, + { + "epoch": 1.7, + "grad_norm": 18.016970163694605, + "learning_rate": 4.693533071924438e-06, + "loss": 0.8026, + "step": 30220 + }, + { + "epoch": 1.7, + "grad_norm": 5.749427803192396, + "learning_rate": 4.691897158094934e-06, + "loss": 0.8056, + "step": 30225 + }, + { + "epoch": 1.7, + "grad_norm": 6.395352516790499, + "learning_rate": 4.690261277372442e-06, + "loss": 0.754, + "step": 30230 + }, + { + "epoch": 1.7, + "grad_norm": 14.312698473040356, + "learning_rate": 4.68862542993274e-06, + "loss": 0.8053, + "step": 30235 + }, + { + "epoch": 1.7, + "grad_norm": 7.961063223845596, + "learning_rate": 4.686989615951613e-06, + "loss": 0.8001, + "step": 30240 + }, + { + "epoch": 1.7, + "grad_norm": 9.015842572499906, + "learning_rate": 4.685353835604832e-06, + "loss": 0.8056, + "step": 30245 + }, + { + "epoch": 1.7, + "grad_norm": 4.8779439199749115, + "learning_rate": 4.683718089068169e-06, + "loss": 0.8126, + "step": 30250 + }, + { + "epoch": 1.7, + "grad_norm": 5.033629123670667, + "learning_rate": 4.682082376517395e-06, + "loss": 0.796, + "step": 30255 + }, + { + "epoch": 1.7, + "grad_norm": 6.04106153601148, + "learning_rate": 4.680446698128271e-06, + "loss": 0.7696, + "step": 30260 + }, + { + "epoch": 1.71, + "grad_norm": 12.949650591888291, + "learning_rate": 4.678811054076561e-06, + "loss": 0.8016, + "step": 30265 + }, + { + "epoch": 1.71, + "grad_norm": 6.103165173015239, + "learning_rate": 4.677175444538021e-06, + "loss": 0.8282, + "step": 30270 + }, + { + "epoch": 1.71, + "grad_norm": 5.357339839085992, + "learning_rate": 4.675539869688402e-06, + "loss": 0.8257, + "step": 30275 + }, + { + "epoch": 1.71, + "grad_norm": 5.004583256288209, + "learning_rate": 4.673904329703459e-06, + "loss": 0.734, + "step": 30280 + }, + { + "epoch": 1.71, + "grad_norm": 7.032854359968172, + "learning_rate": 4.672268824758932e-06, + "loss": 0.8103, + "step": 30285 + }, + { + "epoch": 1.71, + "grad_norm": 6.704184503885026, + "learning_rate": 4.6706333550305675e-06, + "loss": 0.7754, + "step": 30290 + }, + { + "epoch": 1.71, + "grad_norm": 13.83805772876524, + "learning_rate": 4.668997920694103e-06, + "loss": 0.7956, + "step": 30295 + }, + { + "epoch": 1.71, + "grad_norm": 9.60882884163727, + "learning_rate": 4.667362521925272e-06, + "loss": 0.7972, + "step": 30300 + }, + { + "epoch": 1.71, + "grad_norm": 4.4931881686723205, + "learning_rate": 4.665727158899807e-06, + "loss": 0.7891, + "step": 30305 + }, + { + "epoch": 1.71, + "grad_norm": 6.717858080434424, + "learning_rate": 4.664091831793432e-06, + "loss": 0.8242, + "step": 30310 + }, + { + "epoch": 1.71, + "grad_norm": 6.591795154861806, + "learning_rate": 4.662456540781875e-06, + "loss": 0.8148, + "step": 30315 + }, + { + "epoch": 1.71, + "grad_norm": 5.324604322295022, + "learning_rate": 4.66082128604085e-06, + "loss": 0.7701, + "step": 30320 + }, + { + "epoch": 1.71, + "grad_norm": 4.807349808563159, + "learning_rate": 4.6591860677460765e-06, + "loss": 0.7871, + "step": 30325 + }, + { + "epoch": 1.71, + "grad_norm": 7.379971415663782, + "learning_rate": 4.657550886073263e-06, + "loss": 0.7308, + "step": 30330 + }, + { + "epoch": 1.71, + "grad_norm": 10.692979415457627, + "learning_rate": 4.6559157411981185e-06, + "loss": 0.8314, + "step": 30335 + }, + { + "epoch": 1.71, + "grad_norm": 9.322567919421736, + "learning_rate": 4.6542806332963455e-06, + "loss": 0.783, + "step": 30340 + }, + { + "epoch": 1.71, + "grad_norm": 14.504274834555929, + "learning_rate": 4.6526455625436436e-06, + "loss": 0.798, + "step": 30345 + }, + { + "epoch": 1.71, + "grad_norm": 10.137756434797579, + "learning_rate": 4.651010529115712e-06, + "loss": 0.7444, + "step": 30350 + }, + { + "epoch": 1.71, + "grad_norm": 16.048530234730052, + "learning_rate": 4.649375533188239e-06, + "loss": 0.8339, + "step": 30355 + }, + { + "epoch": 1.71, + "grad_norm": 6.266714893016973, + "learning_rate": 4.6477405749369095e-06, + "loss": 0.7527, + "step": 30360 + }, + { + "epoch": 1.71, + "grad_norm": 11.202683717523678, + "learning_rate": 4.646105654537412e-06, + "loss": 0.766, + "step": 30365 + }, + { + "epoch": 1.71, + "grad_norm": 4.908417280112789, + "learning_rate": 4.644470772165422e-06, + "loss": 0.8348, + "step": 30370 + }, + { + "epoch": 1.71, + "grad_norm": 5.92922707124643, + "learning_rate": 4.642835927996618e-06, + "loss": 0.7651, + "step": 30375 + }, + { + "epoch": 1.71, + "grad_norm": 6.221855566146902, + "learning_rate": 4.641201122206669e-06, + "loss": 0.8098, + "step": 30380 + }, + { + "epoch": 1.71, + "grad_norm": 6.945490575664971, + "learning_rate": 4.639566354971243e-06, + "loss": 0.7882, + "step": 30385 + }, + { + "epoch": 1.71, + "grad_norm": 6.631437553930325, + "learning_rate": 4.637931626466004e-06, + "loss": 0.8199, + "step": 30390 + }, + { + "epoch": 1.71, + "grad_norm": 6.748414024473478, + "learning_rate": 4.636296936866608e-06, + "loss": 0.8084, + "step": 30395 + }, + { + "epoch": 1.71, + "grad_norm": 6.093336736761858, + "learning_rate": 4.634662286348713e-06, + "loss": 0.7911, + "step": 30400 + }, + { + "epoch": 1.71, + "grad_norm": 5.89393963512712, + "learning_rate": 4.633027675087965e-06, + "loss": 0.7509, + "step": 30405 + }, + { + "epoch": 1.71, + "grad_norm": 6.572528867939662, + "learning_rate": 4.6313931032600166e-06, + "loss": 0.7688, + "step": 30410 + }, + { + "epoch": 1.71, + "grad_norm": 8.864991872589657, + "learning_rate": 4.629758571040505e-06, + "loss": 0.8021, + "step": 30415 + }, + { + "epoch": 1.71, + "grad_norm": 4.761432934035888, + "learning_rate": 4.628124078605067e-06, + "loss": 0.7702, + "step": 30420 + }, + { + "epoch": 1.71, + "grad_norm": 7.137330379569389, + "learning_rate": 4.626489626129339e-06, + "loss": 0.7672, + "step": 30425 + }, + { + "epoch": 1.71, + "grad_norm": 10.002866723263471, + "learning_rate": 4.624855213788949e-06, + "loss": 0.794, + "step": 30430 + }, + { + "epoch": 1.71, + "grad_norm": 5.807448760548121, + "learning_rate": 4.623220841759522e-06, + "loss": 0.7887, + "step": 30435 + }, + { + "epoch": 1.72, + "grad_norm": 4.982114532743113, + "learning_rate": 4.621586510216677e-06, + "loss": 0.8064, + "step": 30440 + }, + { + "epoch": 1.72, + "grad_norm": 5.19786776108365, + "learning_rate": 4.619952219336034e-06, + "loss": 0.7866, + "step": 30445 + }, + { + "epoch": 1.72, + "grad_norm": 8.185672257390578, + "learning_rate": 4.618317969293203e-06, + "loss": 0.7995, + "step": 30450 + }, + { + "epoch": 1.72, + "grad_norm": 6.677456015521814, + "learning_rate": 4.616683760263788e-06, + "loss": 0.7774, + "step": 30455 + }, + { + "epoch": 1.72, + "grad_norm": 5.902543365821851, + "learning_rate": 4.615049592423396e-06, + "loss": 0.7847, + "step": 30460 + }, + { + "epoch": 1.72, + "grad_norm": 18.054153778972065, + "learning_rate": 4.6134154659476234e-06, + "loss": 0.825, + "step": 30465 + }, + { + "epoch": 1.72, + "grad_norm": 5.093642378000999, + "learning_rate": 4.6117813810120666e-06, + "loss": 0.7743, + "step": 30470 + }, + { + "epoch": 1.72, + "grad_norm": 23.83979502928781, + "learning_rate": 4.610147337792313e-06, + "loss": 0.7427, + "step": 30475 + }, + { + "epoch": 1.72, + "grad_norm": 11.785042569240462, + "learning_rate": 4.608513336463946e-06, + "loss": 0.848, + "step": 30480 + }, + { + "epoch": 1.72, + "grad_norm": 6.86235805737726, + "learning_rate": 4.6068793772025535e-06, + "loss": 0.7708, + "step": 30485 + }, + { + "epoch": 1.72, + "grad_norm": 12.12690200621559, + "learning_rate": 4.6052454601837025e-06, + "loss": 0.77, + "step": 30490 + }, + { + "epoch": 1.72, + "grad_norm": 6.213617836410733, + "learning_rate": 4.603611585582972e-06, + "loss": 0.7688, + "step": 30495 + }, + { + "epoch": 1.72, + "grad_norm": 5.9635968095081715, + "learning_rate": 4.601977753575927e-06, + "loss": 0.7621, + "step": 30500 + }, + { + "epoch": 1.72, + "grad_norm": 9.520427801040054, + "learning_rate": 4.600343964338125e-06, + "loss": 0.7929, + "step": 30505 + }, + { + "epoch": 1.72, + "grad_norm": 5.457308644107953, + "learning_rate": 4.598710218045131e-06, + "loss": 0.7794, + "step": 30510 + }, + { + "epoch": 1.72, + "grad_norm": 4.932348994994766, + "learning_rate": 4.597076514872493e-06, + "loss": 0.8306, + "step": 30515 + }, + { + "epoch": 1.72, + "grad_norm": 4.583724636856987, + "learning_rate": 4.595442854995762e-06, + "loss": 0.7993, + "step": 30520 + }, + { + "epoch": 1.72, + "grad_norm": 6.618874416324449, + "learning_rate": 4.5938092385904826e-06, + "loss": 0.7803, + "step": 30525 + }, + { + "epoch": 1.72, + "grad_norm": 4.633748954413597, + "learning_rate": 4.592175665832193e-06, + "loss": 0.7983, + "step": 30530 + }, + { + "epoch": 1.72, + "grad_norm": 4.884533804620512, + "learning_rate": 4.590542136896428e-06, + "loss": 0.7917, + "step": 30535 + }, + { + "epoch": 1.72, + "grad_norm": 6.802421179470391, + "learning_rate": 4.588908651958716e-06, + "loss": 0.758, + "step": 30540 + }, + { + "epoch": 1.72, + "grad_norm": 4.870770880892959, + "learning_rate": 4.587275211194586e-06, + "loss": 0.7458, + "step": 30545 + }, + { + "epoch": 1.72, + "grad_norm": 15.876659920577412, + "learning_rate": 4.585641814779554e-06, + "loss": 0.8006, + "step": 30550 + }, + { + "epoch": 1.72, + "grad_norm": 11.575556345491801, + "learning_rate": 4.58400846288914e-06, + "loss": 0.8246, + "step": 30555 + }, + { + "epoch": 1.72, + "grad_norm": 10.054746527256716, + "learning_rate": 4.5823751556988515e-06, + "loss": 0.7718, + "step": 30560 + }, + { + "epoch": 1.72, + "grad_norm": 8.787702906973506, + "learning_rate": 4.580741893384195e-06, + "loss": 0.7374, + "step": 30565 + }, + { + "epoch": 1.72, + "grad_norm": 13.496357982882074, + "learning_rate": 4.579108676120673e-06, + "loss": 0.7297, + "step": 30570 + }, + { + "epoch": 1.72, + "grad_norm": 7.188715676796305, + "learning_rate": 4.57747550408378e-06, + "loss": 0.7705, + "step": 30575 + }, + { + "epoch": 1.72, + "grad_norm": 12.96915600723857, + "learning_rate": 4.575842377449011e-06, + "loss": 0.7701, + "step": 30580 + }, + { + "epoch": 1.72, + "grad_norm": 10.27471628944124, + "learning_rate": 4.57420929639185e-06, + "loss": 0.7919, + "step": 30585 + }, + { + "epoch": 1.72, + "grad_norm": 6.954474080427729, + "learning_rate": 4.572576261087777e-06, + "loss": 0.8263, + "step": 30590 + }, + { + "epoch": 1.72, + "grad_norm": 4.967339068825614, + "learning_rate": 4.570943271712274e-06, + "loss": 0.8065, + "step": 30595 + }, + { + "epoch": 1.72, + "grad_norm": 5.95307628650065, + "learning_rate": 4.569310328440808e-06, + "loss": 0.7714, + "step": 30600 + }, + { + "epoch": 1.72, + "grad_norm": 6.1858974515440295, + "learning_rate": 4.56767743144885e-06, + "loss": 0.7888, + "step": 30605 + }, + { + "epoch": 1.72, + "grad_norm": 6.603289368621448, + "learning_rate": 4.566044580911857e-06, + "loss": 0.7874, + "step": 30610 + }, + { + "epoch": 1.72, + "grad_norm": 5.212870890259156, + "learning_rate": 4.564411777005291e-06, + "loss": 0.7233, + "step": 30615 + }, + { + "epoch": 1.73, + "grad_norm": 8.021863111667034, + "learning_rate": 4.562779019904603e-06, + "loss": 0.7685, + "step": 30620 + }, + { + "epoch": 1.73, + "grad_norm": 13.57482857299841, + "learning_rate": 4.561146309785236e-06, + "loss": 0.743, + "step": 30625 + }, + { + "epoch": 1.73, + "grad_norm": 12.933359595418446, + "learning_rate": 4.5595136468226374e-06, + "loss": 0.7689, + "step": 30630 + }, + { + "epoch": 1.73, + "grad_norm": 11.644422904804832, + "learning_rate": 4.5578810311922396e-06, + "loss": 0.7854, + "step": 30635 + }, + { + "epoch": 1.73, + "grad_norm": 4.985852582945892, + "learning_rate": 4.556248463069478e-06, + "loss": 0.7833, + "step": 30640 + }, + { + "epoch": 1.73, + "grad_norm": 5.616850299489971, + "learning_rate": 4.5546159426297775e-06, + "loss": 0.7798, + "step": 30645 + }, + { + "epoch": 1.73, + "grad_norm": 5.40064317403941, + "learning_rate": 4.552983470048558e-06, + "loss": 0.7806, + "step": 30650 + }, + { + "epoch": 1.73, + "grad_norm": 5.304983524882694, + "learning_rate": 4.551351045501238e-06, + "loss": 0.7595, + "step": 30655 + }, + { + "epoch": 1.73, + "grad_norm": 8.481968054889494, + "learning_rate": 4.549718669163228e-06, + "loss": 0.7718, + "step": 30660 + }, + { + "epoch": 1.73, + "grad_norm": 7.860778228699741, + "learning_rate": 4.548086341209934e-06, + "loss": 0.7637, + "step": 30665 + }, + { + "epoch": 1.73, + "grad_norm": 4.648316604195691, + "learning_rate": 4.546454061816759e-06, + "loss": 0.8252, + "step": 30670 + }, + { + "epoch": 1.73, + "grad_norm": 10.1501342867187, + "learning_rate": 4.544821831159094e-06, + "loss": 0.7855, + "step": 30675 + }, + { + "epoch": 1.73, + "grad_norm": 6.679821449145516, + "learning_rate": 4.543189649412335e-06, + "loss": 0.7714, + "step": 30680 + }, + { + "epoch": 1.73, + "grad_norm": 8.658851702787414, + "learning_rate": 4.541557516751861e-06, + "loss": 0.8068, + "step": 30685 + }, + { + "epoch": 1.73, + "grad_norm": 5.5092821750011645, + "learning_rate": 4.5399254333530575e-06, + "loss": 0.7554, + "step": 30690 + }, + { + "epoch": 1.73, + "grad_norm": 14.18908964575066, + "learning_rate": 4.538293399391295e-06, + "loss": 0.77, + "step": 30695 + }, + { + "epoch": 1.73, + "grad_norm": 19.764968967316946, + "learning_rate": 4.536661415041947e-06, + "loss": 0.8045, + "step": 30700 + }, + { + "epoch": 1.73, + "grad_norm": 12.07756035863198, + "learning_rate": 4.535029480480373e-06, + "loss": 0.7665, + "step": 30705 + }, + { + "epoch": 1.73, + "grad_norm": 9.869028974400354, + "learning_rate": 4.533397595881932e-06, + "loss": 0.7533, + "step": 30710 + }, + { + "epoch": 1.73, + "grad_norm": 18.928500894149803, + "learning_rate": 4.531765761421981e-06, + "loss": 0.7466, + "step": 30715 + }, + { + "epoch": 1.73, + "grad_norm": 7.129437668117854, + "learning_rate": 4.530133977275863e-06, + "loss": 0.7936, + "step": 30720 + }, + { + "epoch": 1.73, + "grad_norm": 4.992181502887697, + "learning_rate": 4.528502243618927e-06, + "loss": 0.7721, + "step": 30725 + }, + { + "epoch": 1.73, + "grad_norm": 9.03655983735585, + "learning_rate": 4.526870560626503e-06, + "loss": 0.7886, + "step": 30730 + }, + { + "epoch": 1.73, + "grad_norm": 10.84818588439893, + "learning_rate": 4.525238928473924e-06, + "loss": 0.7785, + "step": 30735 + }, + { + "epoch": 1.73, + "grad_norm": 5.044420929900029, + "learning_rate": 4.52360734733652e-06, + "loss": 0.768, + "step": 30740 + }, + { + "epoch": 1.73, + "grad_norm": 10.70383824810081, + "learning_rate": 4.521975817389607e-06, + "loss": 0.8228, + "step": 30745 + }, + { + "epoch": 1.73, + "grad_norm": 11.220618034970984, + "learning_rate": 4.520344338808503e-06, + "loss": 0.7752, + "step": 30750 + }, + { + "epoch": 1.73, + "grad_norm": 10.439677389351129, + "learning_rate": 4.518712911768518e-06, + "loss": 0.7824, + "step": 30755 + }, + { + "epoch": 1.73, + "grad_norm": 6.365446726376254, + "learning_rate": 4.517081536444953e-06, + "loss": 0.8092, + "step": 30760 + }, + { + "epoch": 1.73, + "grad_norm": 5.978277276892899, + "learning_rate": 4.515450213013109e-06, + "loss": 0.8105, + "step": 30765 + }, + { + "epoch": 1.73, + "grad_norm": 5.593006875897456, + "learning_rate": 4.5138189416482765e-06, + "loss": 0.759, + "step": 30770 + }, + { + "epoch": 1.73, + "grad_norm": 8.87811568062077, + "learning_rate": 4.5121877225257465e-06, + "loss": 0.7763, + "step": 30775 + }, + { + "epoch": 1.73, + "grad_norm": 6.072742428425823, + "learning_rate": 4.510556555820797e-06, + "loss": 0.7657, + "step": 30780 + }, + { + "epoch": 1.73, + "grad_norm": 10.017659350867577, + "learning_rate": 4.508925441708707e-06, + "loss": 0.7766, + "step": 30785 + }, + { + "epoch": 1.73, + "grad_norm": 6.291281139526044, + "learning_rate": 4.5072943803647455e-06, + "loss": 0.8064, + "step": 30790 + }, + { + "epoch": 1.74, + "grad_norm": 15.205925312444233, + "learning_rate": 4.505663371964176e-06, + "loss": 0.7985, + "step": 30795 + }, + { + "epoch": 1.74, + "grad_norm": 12.836958271735716, + "learning_rate": 4.50403241668226e-06, + "loss": 0.7831, + "step": 30800 + }, + { + "epoch": 1.74, + "grad_norm": 11.205153316243075, + "learning_rate": 4.502401514694248e-06, + "loss": 0.7513, + "step": 30805 + }, + { + "epoch": 1.74, + "grad_norm": 23.470695761410514, + "learning_rate": 4.500770666175391e-06, + "loss": 0.8096, + "step": 30810 + }, + { + "epoch": 1.74, + "grad_norm": 5.630883665656526, + "learning_rate": 4.499139871300931e-06, + "loss": 0.7521, + "step": 30815 + }, + { + "epoch": 1.74, + "grad_norm": 5.350718036561842, + "learning_rate": 4.497509130246098e-06, + "loss": 0.8045, + "step": 30820 + }, + { + "epoch": 1.74, + "grad_norm": 6.879727503033959, + "learning_rate": 4.49587844318613e-06, + "loss": 0.7993, + "step": 30825 + }, + { + "epoch": 1.74, + "grad_norm": 6.544375792731871, + "learning_rate": 4.494247810296245e-06, + "loss": 0.8047, + "step": 30830 + }, + { + "epoch": 1.74, + "grad_norm": 8.72640514434046, + "learning_rate": 4.492617231751668e-06, + "loss": 0.8267, + "step": 30835 + }, + { + "epoch": 1.74, + "grad_norm": 9.91031449738027, + "learning_rate": 4.490986707727607e-06, + "loss": 0.7463, + "step": 30840 + }, + { + "epoch": 1.74, + "grad_norm": 6.68250037970916, + "learning_rate": 4.489356238399271e-06, + "loss": 0.8151, + "step": 30845 + }, + { + "epoch": 1.74, + "grad_norm": 7.799596402429676, + "learning_rate": 4.487725823941861e-06, + "loss": 0.7721, + "step": 30850 + }, + { + "epoch": 1.74, + "grad_norm": 5.655085216244254, + "learning_rate": 4.486095464530572e-06, + "loss": 0.7693, + "step": 30855 + }, + { + "epoch": 1.74, + "grad_norm": 12.772520859631326, + "learning_rate": 4.4844651603405935e-06, + "loss": 0.7378, + "step": 30860 + }, + { + "epoch": 1.74, + "grad_norm": 5.892676890718672, + "learning_rate": 4.482834911547108e-06, + "loss": 0.7642, + "step": 30865 + }, + { + "epoch": 1.74, + "grad_norm": 4.921523137293614, + "learning_rate": 4.481204718325295e-06, + "loss": 0.7459, + "step": 30870 + }, + { + "epoch": 1.74, + "grad_norm": 6.110797146339437, + "learning_rate": 4.479574580850325e-06, + "loss": 0.7745, + "step": 30875 + }, + { + "epoch": 1.74, + "grad_norm": 9.715277224027325, + "learning_rate": 4.477944499297361e-06, + "loss": 0.7393, + "step": 30880 + }, + { + "epoch": 1.74, + "grad_norm": 12.57049664161394, + "learning_rate": 4.4763144738415655e-06, + "loss": 0.7739, + "step": 30885 + }, + { + "epoch": 1.74, + "grad_norm": 7.051762173851264, + "learning_rate": 4.474684504658091e-06, + "loss": 0.7689, + "step": 30890 + }, + { + "epoch": 1.74, + "grad_norm": 13.070437940829969, + "learning_rate": 4.473054591922085e-06, + "loss": 0.782, + "step": 30895 + }, + { + "epoch": 1.74, + "grad_norm": 7.6936418351549305, + "learning_rate": 4.471424735808689e-06, + "loss": 0.7977, + "step": 30900 + }, + { + "epoch": 1.74, + "grad_norm": 6.944237077514344, + "learning_rate": 4.469794936493037e-06, + "loss": 0.7659, + "step": 30905 + }, + { + "epoch": 1.74, + "grad_norm": 16.0988930121814, + "learning_rate": 4.46816519415026e-06, + "loss": 0.8011, + "step": 30910 + }, + { + "epoch": 1.74, + "grad_norm": 13.813932156741568, + "learning_rate": 4.4665355089554786e-06, + "loss": 0.8182, + "step": 30915 + }, + { + "epoch": 1.74, + "grad_norm": 5.032759838191125, + "learning_rate": 4.464905881083813e-06, + "loss": 0.788, + "step": 30920 + }, + { + "epoch": 1.74, + "grad_norm": 6.10155654552586, + "learning_rate": 4.46327631071037e-06, + "loss": 0.781, + "step": 30925 + }, + { + "epoch": 1.74, + "grad_norm": 5.770885476095118, + "learning_rate": 4.461646798010259e-06, + "loss": 0.7785, + "step": 30930 + }, + { + "epoch": 1.74, + "grad_norm": 8.366305357005817, + "learning_rate": 4.4600173431585734e-06, + "loss": 0.7825, + "step": 30935 + }, + { + "epoch": 1.74, + "grad_norm": 5.0720709688726275, + "learning_rate": 4.458387946330406e-06, + "loss": 0.7828, + "step": 30940 + }, + { + "epoch": 1.74, + "grad_norm": 5.104130471865807, + "learning_rate": 4.4567586077008475e-06, + "loss": 0.8041, + "step": 30945 + }, + { + "epoch": 1.74, + "grad_norm": 5.382667558508729, + "learning_rate": 4.455129327444971e-06, + "loss": 0.7559, + "step": 30950 + }, + { + "epoch": 1.74, + "grad_norm": 6.282227207214767, + "learning_rate": 4.453500105737856e-06, + "loss": 0.7512, + "step": 30955 + }, + { + "epoch": 1.74, + "grad_norm": 4.630416905290494, + "learning_rate": 4.451870942754566e-06, + "loss": 0.7707, + "step": 30960 + }, + { + "epoch": 1.74, + "grad_norm": 7.823400173313469, + "learning_rate": 4.4502418386701595e-06, + "loss": 0.7944, + "step": 30965 + }, + { + "epoch": 1.74, + "grad_norm": 6.140146188727773, + "learning_rate": 4.4486127936596966e-06, + "loss": 0.8237, + "step": 30970 + }, + { + "epoch": 1.75, + "grad_norm": 6.267334861579995, + "learning_rate": 4.44698380789822e-06, + "loss": 0.7765, + "step": 30975 + }, + { + "epoch": 1.75, + "grad_norm": 5.091711751004377, + "learning_rate": 4.4453548815607756e-06, + "loss": 0.7869, + "step": 30980 + }, + { + "epoch": 1.75, + "grad_norm": 5.673789695748567, + "learning_rate": 4.443726014822398e-06, + "loss": 0.7586, + "step": 30985 + }, + { + "epoch": 1.75, + "grad_norm": 5.978048564926374, + "learning_rate": 4.442097207858113e-06, + "loss": 0.7226, + "step": 30990 + }, + { + "epoch": 1.75, + "grad_norm": 10.043454928860996, + "learning_rate": 4.4404684608429465e-06, + "loss": 0.7785, + "step": 30995 + }, + { + "epoch": 1.75, + "grad_norm": 9.780883133711187, + "learning_rate": 4.438839773951912e-06, + "loss": 0.8075, + "step": 31000 + }, + { + "epoch": 1.75, + "grad_norm": 9.869682890959952, + "learning_rate": 4.4372111473600216e-06, + "loss": 0.8423, + "step": 31005 + }, + { + "epoch": 1.75, + "grad_norm": 14.382298328227915, + "learning_rate": 4.435582581242276e-06, + "loss": 0.7843, + "step": 31010 + }, + { + "epoch": 1.75, + "grad_norm": 6.298800869774029, + "learning_rate": 4.433954075773675e-06, + "loss": 0.7579, + "step": 31015 + }, + { + "epoch": 1.75, + "grad_norm": 5.500762773898244, + "learning_rate": 4.432325631129205e-06, + "loss": 0.7621, + "step": 31020 + }, + { + "epoch": 1.75, + "grad_norm": 6.278527125856053, + "learning_rate": 4.430697247483852e-06, + "loss": 0.7504, + "step": 31025 + }, + { + "epoch": 1.75, + "grad_norm": 5.789904700356915, + "learning_rate": 4.4290689250125915e-06, + "loss": 0.7909, + "step": 31030 + }, + { + "epoch": 1.75, + "grad_norm": 6.029927064891336, + "learning_rate": 4.427440663890394e-06, + "loss": 0.7907, + "step": 31035 + }, + { + "epoch": 1.75, + "grad_norm": 13.080109042755826, + "learning_rate": 4.425812464292226e-06, + "loss": 0.8349, + "step": 31040 + }, + { + "epoch": 1.75, + "grad_norm": 23.86416052534052, + "learning_rate": 4.424184326393044e-06, + "loss": 0.7762, + "step": 31045 + }, + { + "epoch": 1.75, + "grad_norm": 6.087622115151375, + "learning_rate": 4.422556250367794e-06, + "loss": 0.7698, + "step": 31050 + }, + { + "epoch": 1.75, + "grad_norm": 7.38325541238791, + "learning_rate": 4.420928236391426e-06, + "loss": 0.7936, + "step": 31055 + }, + { + "epoch": 1.75, + "grad_norm": 10.691124138180824, + "learning_rate": 4.419300284638872e-06, + "loss": 0.7878, + "step": 31060 + }, + { + "epoch": 1.75, + "grad_norm": 10.738360449067118, + "learning_rate": 4.417672395285068e-06, + "loss": 0.7893, + "step": 31065 + }, + { + "epoch": 1.75, + "grad_norm": 6.551122192832034, + "learning_rate": 4.416044568504934e-06, + "loss": 0.7862, + "step": 31070 + }, + { + "epoch": 1.75, + "grad_norm": 6.612648392437366, + "learning_rate": 4.414416804473387e-06, + "loss": 0.7469, + "step": 31075 + }, + { + "epoch": 1.75, + "grad_norm": 5.452026622214422, + "learning_rate": 4.412789103365341e-06, + "loss": 0.7527, + "step": 31080 + }, + { + "epoch": 1.75, + "grad_norm": 5.494616714621361, + "learning_rate": 4.411161465355695e-06, + "loss": 0.7527, + "step": 31085 + }, + { + "epoch": 1.75, + "grad_norm": 5.440267287183166, + "learning_rate": 4.409533890619351e-06, + "loss": 0.7898, + "step": 31090 + }, + { + "epoch": 1.75, + "grad_norm": 5.065609967411705, + "learning_rate": 4.407906379331193e-06, + "loss": 0.7662, + "step": 31095 + }, + { + "epoch": 1.75, + "grad_norm": 13.339445680539315, + "learning_rate": 4.4062789316661105e-06, + "loss": 0.772, + "step": 31100 + }, + { + "epoch": 1.75, + "grad_norm": 4.746744948171015, + "learning_rate": 4.4046515477989774e-06, + "loss": 0.7662, + "step": 31105 + }, + { + "epoch": 1.75, + "grad_norm": 6.6492187131747675, + "learning_rate": 4.403024227904661e-06, + "loss": 0.7758, + "step": 31110 + }, + { + "epoch": 1.75, + "grad_norm": 5.544339588254074, + "learning_rate": 4.401396972158027e-06, + "loss": 0.8084, + "step": 31115 + }, + { + "epoch": 1.75, + "grad_norm": 5.785846514801537, + "learning_rate": 4.399769780733929e-06, + "loss": 0.7735, + "step": 31120 + }, + { + "epoch": 1.75, + "grad_norm": 10.752272418122944, + "learning_rate": 4.398142653807219e-06, + "loss": 0.7645, + "step": 31125 + }, + { + "epoch": 1.75, + "grad_norm": 13.285973621470024, + "learning_rate": 4.396515591552737e-06, + "loss": 0.7613, + "step": 31130 + }, + { + "epoch": 1.75, + "grad_norm": 9.805614312727597, + "learning_rate": 4.394888594145315e-06, + "loss": 0.8157, + "step": 31135 + }, + { + "epoch": 1.75, + "grad_norm": 10.420765425722168, + "learning_rate": 4.393261661759788e-06, + "loss": 0.7925, + "step": 31140 + }, + { + "epoch": 1.75, + "grad_norm": 18.84362929653387, + "learning_rate": 4.39163479457097e-06, + "loss": 0.7682, + "step": 31145 + }, + { + "epoch": 1.76, + "grad_norm": 6.936863052636425, + "learning_rate": 4.390007992753681e-06, + "loss": 0.7413, + "step": 31150 + }, + { + "epoch": 1.76, + "grad_norm": 5.861988183192306, + "learning_rate": 4.388381256482724e-06, + "loss": 0.7679, + "step": 31155 + }, + { + "epoch": 1.76, + "grad_norm": 5.59897921077883, + "learning_rate": 4.3867545859329e-06, + "loss": 0.7404, + "step": 31160 + }, + { + "epoch": 1.76, + "grad_norm": 5.008417029745514, + "learning_rate": 4.385127981279002e-06, + "loss": 0.7653, + "step": 31165 + }, + { + "epoch": 1.76, + "grad_norm": 7.941394647427479, + "learning_rate": 4.383501442695816e-06, + "loss": 0.742, + "step": 31170 + }, + { + "epoch": 1.76, + "grad_norm": 17.08816107208935, + "learning_rate": 4.381874970358122e-06, + "loss": 0.7713, + "step": 31175 + }, + { + "epoch": 1.76, + "grad_norm": 7.618820013739727, + "learning_rate": 4.380248564440688e-06, + "loss": 0.7792, + "step": 31180 + }, + { + "epoch": 1.76, + "grad_norm": 11.683833882800295, + "learning_rate": 4.378622225118284e-06, + "loss": 0.7603, + "step": 31185 + }, + { + "epoch": 1.76, + "grad_norm": 5.099153329246707, + "learning_rate": 4.376995952565664e-06, + "loss": 0.7828, + "step": 31190 + }, + { + "epoch": 1.76, + "grad_norm": 5.300304520294611, + "learning_rate": 4.3753697469575765e-06, + "loss": 0.7546, + "step": 31195 + }, + { + "epoch": 1.76, + "grad_norm": 6.326420135609376, + "learning_rate": 4.373743608468768e-06, + "loss": 0.7401, + "step": 31200 + }, + { + "epoch": 1.76, + "grad_norm": 4.875584867433873, + "learning_rate": 4.372117537273971e-06, + "loss": 0.7677, + "step": 31205 + }, + { + "epoch": 1.76, + "grad_norm": 4.972293807215018, + "learning_rate": 4.3704915335479156e-06, + "loss": 0.7481, + "step": 31210 + }, + { + "epoch": 1.76, + "grad_norm": 5.580515434200049, + "learning_rate": 4.3688655974653254e-06, + "loss": 0.7869, + "step": 31215 + }, + { + "epoch": 1.76, + "grad_norm": 4.886669050135439, + "learning_rate": 4.367239729200909e-06, + "loss": 0.7538, + "step": 31220 + }, + { + "epoch": 1.76, + "grad_norm": 5.5301429470913055, + "learning_rate": 4.36561392892938e-06, + "loss": 0.7447, + "step": 31225 + }, + { + "epoch": 1.76, + "grad_norm": 5.048714691653655, + "learning_rate": 4.363988196825431e-06, + "loss": 0.7584, + "step": 31230 + }, + { + "epoch": 1.76, + "grad_norm": 4.978489615631539, + "learning_rate": 4.362362533063759e-06, + "loss": 0.7716, + "step": 31235 + }, + { + "epoch": 1.76, + "grad_norm": 6.246958008123747, + "learning_rate": 4.360736937819048e-06, + "loss": 0.7752, + "step": 31240 + }, + { + "epoch": 1.76, + "grad_norm": 5.179731498250461, + "learning_rate": 4.359111411265971e-06, + "loss": 0.783, + "step": 31245 + }, + { + "epoch": 1.76, + "grad_norm": 5.243791979682189, + "learning_rate": 4.357485953579203e-06, + "loss": 0.7304, + "step": 31250 + }, + { + "epoch": 1.76, + "grad_norm": 5.291653960712617, + "learning_rate": 4.3558605649334055e-06, + "loss": 0.7588, + "step": 31255 + }, + { + "epoch": 1.76, + "grad_norm": 6.915867816626169, + "learning_rate": 4.354235245503232e-06, + "loss": 0.7684, + "step": 31260 + }, + { + "epoch": 1.76, + "grad_norm": 10.745634037995895, + "learning_rate": 4.352609995463331e-06, + "loss": 0.7692, + "step": 31265 + }, + { + "epoch": 1.76, + "grad_norm": 4.857619438194422, + "learning_rate": 4.350984814988344e-06, + "loss": 0.7721, + "step": 31270 + }, + { + "epoch": 1.76, + "grad_norm": 20.924372231052487, + "learning_rate": 4.349359704252904e-06, + "loss": 0.7709, + "step": 31275 + }, + { + "epoch": 1.76, + "grad_norm": 17.05477162542986, + "learning_rate": 4.347734663431633e-06, + "loss": 0.7975, + "step": 31280 + }, + { + "epoch": 1.76, + "grad_norm": 11.919630400985666, + "learning_rate": 4.3461096926991535e-06, + "loss": 0.7838, + "step": 31285 + }, + { + "epoch": 1.76, + "grad_norm": 5.775409566108691, + "learning_rate": 4.3444847922300715e-06, + "loss": 0.7536, + "step": 31290 + }, + { + "epoch": 1.76, + "grad_norm": 15.137755669208488, + "learning_rate": 4.342859962198993e-06, + "loss": 0.75, + "step": 31295 + }, + { + "epoch": 1.76, + "grad_norm": 19.083390967951964, + "learning_rate": 4.341235202780511e-06, + "loss": 0.7942, + "step": 31300 + }, + { + "epoch": 1.76, + "grad_norm": 11.314667301433014, + "learning_rate": 4.339610514149212e-06, + "loss": 0.7906, + "step": 31305 + }, + { + "epoch": 1.76, + "grad_norm": 12.78234183383961, + "learning_rate": 4.337985896479681e-06, + "loss": 0.777, + "step": 31310 + }, + { + "epoch": 1.76, + "grad_norm": 6.542191665357444, + "learning_rate": 4.336361349946485e-06, + "loss": 0.8141, + "step": 31315 + }, + { + "epoch": 1.76, + "grad_norm": 16.02455848399195, + "learning_rate": 4.334736874724192e-06, + "loss": 0.7968, + "step": 31320 + }, + { + "epoch": 1.76, + "grad_norm": 12.411196553036113, + "learning_rate": 4.3331124709873564e-06, + "loss": 0.7965, + "step": 31325 + }, + { + "epoch": 1.77, + "grad_norm": 6.814419711407543, + "learning_rate": 4.331488138910531e-06, + "loss": 0.7867, + "step": 31330 + }, + { + "epoch": 1.77, + "grad_norm": 14.251291883369472, + "learning_rate": 4.329863878668255e-06, + "loss": 0.7708, + "step": 31335 + }, + { + "epoch": 1.77, + "grad_norm": 25.343195757766704, + "learning_rate": 4.328239690435061e-06, + "loss": 0.7544, + "step": 31340 + }, + { + "epoch": 1.77, + "grad_norm": 8.396865937727929, + "learning_rate": 4.326615574385477e-06, + "loss": 0.7425, + "step": 31345 + }, + { + "epoch": 1.77, + "grad_norm": 6.548083644899822, + "learning_rate": 4.324991530694023e-06, + "loss": 0.7863, + "step": 31350 + }, + { + "epoch": 1.77, + "grad_norm": 6.892588628574988, + "learning_rate": 4.323367559535208e-06, + "loss": 0.8153, + "step": 31355 + }, + { + "epoch": 1.77, + "grad_norm": 7.445878857866565, + "learning_rate": 4.321743661083535e-06, + "loss": 0.7565, + "step": 31360 + }, + { + "epoch": 1.77, + "grad_norm": 6.8100511915903486, + "learning_rate": 4.320119835513498e-06, + "loss": 0.7827, + "step": 31365 + }, + { + "epoch": 1.77, + "grad_norm": 5.39383204652131, + "learning_rate": 4.318496082999586e-06, + "loss": 0.7498, + "step": 31370 + }, + { + "epoch": 1.77, + "grad_norm": 5.997854485220208, + "learning_rate": 4.3168724037162765e-06, + "loss": 0.793, + "step": 31375 + }, + { + "epoch": 1.77, + "grad_norm": 5.213261918647128, + "learning_rate": 4.315248797838044e-06, + "loss": 0.795, + "step": 31380 + }, + { + "epoch": 1.77, + "grad_norm": 5.120675960037323, + "learning_rate": 4.313625265539349e-06, + "loss": 0.7762, + "step": 31385 + }, + { + "epoch": 1.77, + "grad_norm": 8.35815778137239, + "learning_rate": 4.312001806994648e-06, + "loss": 0.7231, + "step": 31390 + }, + { + "epoch": 1.77, + "grad_norm": 6.5415263799436385, + "learning_rate": 4.31037842237839e-06, + "loss": 0.7535, + "step": 31395 + }, + { + "epoch": 1.77, + "grad_norm": 5.395032082967594, + "learning_rate": 4.3087551118650125e-06, + "loss": 0.7911, + "step": 31400 + }, + { + "epoch": 1.77, + "grad_norm": 7.655500905307801, + "learning_rate": 4.307131875628951e-06, + "loss": 0.7518, + "step": 31405 + }, + { + "epoch": 1.77, + "grad_norm": 5.283945441297352, + "learning_rate": 4.305508713844626e-06, + "loss": 0.7708, + "step": 31410 + }, + { + "epoch": 1.77, + "grad_norm": 5.225466470945415, + "learning_rate": 4.3038856266864556e-06, + "loss": 0.7566, + "step": 31415 + }, + { + "epoch": 1.77, + "grad_norm": 5.5559474300909475, + "learning_rate": 4.302262614328847e-06, + "loss": 0.7392, + "step": 31420 + }, + { + "epoch": 1.77, + "grad_norm": 6.88652884886417, + "learning_rate": 4.300639676946199e-06, + "loss": 0.7742, + "step": 31425 + }, + { + "epoch": 1.77, + "grad_norm": 5.239389019150582, + "learning_rate": 4.299016814712905e-06, + "loss": 0.7435, + "step": 31430 + }, + { + "epoch": 1.77, + "grad_norm": 4.925998101904268, + "learning_rate": 4.2973940278033466e-06, + "loss": 0.7206, + "step": 31435 + }, + { + "epoch": 1.77, + "grad_norm": 15.790237032822464, + "learning_rate": 4.295771316391901e-06, + "loss": 0.7315, + "step": 31440 + }, + { + "epoch": 1.77, + "grad_norm": 5.876794687580935, + "learning_rate": 4.2941486806529375e-06, + "loss": 0.716, + "step": 31445 + }, + { + "epoch": 1.77, + "grad_norm": 4.942558109633507, + "learning_rate": 4.292526120760811e-06, + "loss": 0.7528, + "step": 31450 + }, + { + "epoch": 1.77, + "grad_norm": 11.385608527512987, + "learning_rate": 4.2909036368898775e-06, + "loss": 0.7606, + "step": 31455 + }, + { + "epoch": 1.77, + "grad_norm": 5.041549066237203, + "learning_rate": 4.289281229214475e-06, + "loss": 0.8232, + "step": 31460 + }, + { + "epoch": 1.77, + "grad_norm": 9.097921714615842, + "learning_rate": 4.2876588979089425e-06, + "loss": 0.7271, + "step": 31465 + }, + { + "epoch": 1.77, + "grad_norm": 6.035814420161072, + "learning_rate": 4.286036643147607e-06, + "loss": 0.784, + "step": 31470 + }, + { + "epoch": 1.77, + "grad_norm": 9.912060853703544, + "learning_rate": 4.284414465104782e-06, + "loss": 0.7881, + "step": 31475 + }, + { + "epoch": 1.77, + "grad_norm": 17.53554338247057, + "learning_rate": 4.282792363954782e-06, + "loss": 0.7949, + "step": 31480 + }, + { + "epoch": 1.77, + "grad_norm": 6.127032572018969, + "learning_rate": 4.281170339871908e-06, + "loss": 0.7658, + "step": 31485 + }, + { + "epoch": 1.77, + "grad_norm": 9.656023041910718, + "learning_rate": 4.2795483930304525e-06, + "loss": 0.7279, + "step": 31490 + }, + { + "epoch": 1.77, + "grad_norm": 7.3379778108073594, + "learning_rate": 4.277926523604702e-06, + "loss": 0.7454, + "step": 31495 + }, + { + "epoch": 1.77, + "grad_norm": 5.628433221272852, + "learning_rate": 4.276304731768934e-06, + "loss": 0.789, + "step": 31500 + }, + { + "epoch": 1.78, + "grad_norm": 6.739220700964836, + "learning_rate": 4.274683017697418e-06, + "loss": 0.7568, + "step": 31505 + }, + { + "epoch": 1.78, + "grad_norm": 12.022702406494936, + "learning_rate": 4.27306138156441e-06, + "loss": 0.7864, + "step": 31510 + }, + { + "epoch": 1.78, + "grad_norm": 8.801256168210536, + "learning_rate": 4.271439823544167e-06, + "loss": 0.7255, + "step": 31515 + }, + { + "epoch": 1.78, + "grad_norm": 7.218142837687559, + "learning_rate": 4.269818343810928e-06, + "loss": 0.7971, + "step": 31520 + }, + { + "epoch": 1.78, + "grad_norm": 6.722214871650178, + "learning_rate": 4.2681969425389325e-06, + "loss": 0.7171, + "step": 31525 + }, + { + "epoch": 1.78, + "grad_norm": 5.350522850546013, + "learning_rate": 4.266575619902404e-06, + "loss": 0.7584, + "step": 31530 + }, + { + "epoch": 1.78, + "grad_norm": 5.80285127418216, + "learning_rate": 4.264954376075562e-06, + "loss": 0.8044, + "step": 31535 + }, + { + "epoch": 1.78, + "grad_norm": 6.624145849686642, + "learning_rate": 4.263333211232618e-06, + "loss": 0.7942, + "step": 31540 + }, + { + "epoch": 1.78, + "grad_norm": 5.101323984067481, + "learning_rate": 4.261712125547769e-06, + "loss": 0.7647, + "step": 31545 + }, + { + "epoch": 1.78, + "grad_norm": 6.033069267818239, + "learning_rate": 4.260091119195212e-06, + "loss": 0.7906, + "step": 31550 + }, + { + "epoch": 1.78, + "grad_norm": 6.1912348970513715, + "learning_rate": 4.25847019234913e-06, + "loss": 0.7714, + "step": 31555 + }, + { + "epoch": 1.78, + "grad_norm": 4.752930101317553, + "learning_rate": 4.256849345183698e-06, + "loss": 0.7395, + "step": 31560 + }, + { + "epoch": 1.78, + "grad_norm": 21.63767299952961, + "learning_rate": 4.255228577873084e-06, + "loss": 0.7129, + "step": 31565 + }, + { + "epoch": 1.78, + "grad_norm": 5.792642421828407, + "learning_rate": 4.253607890591445e-06, + "loss": 0.7597, + "step": 31570 + }, + { + "epoch": 1.78, + "grad_norm": 7.253362064768006, + "learning_rate": 4.2519872835129325e-06, + "loss": 0.7456, + "step": 31575 + }, + { + "epoch": 1.78, + "grad_norm": 6.066988823586071, + "learning_rate": 4.250366756811688e-06, + "loss": 0.771, + "step": 31580 + }, + { + "epoch": 1.78, + "grad_norm": 5.51477458227472, + "learning_rate": 4.248746310661845e-06, + "loss": 0.7672, + "step": 31585 + }, + { + "epoch": 1.78, + "grad_norm": 6.071531395847083, + "learning_rate": 4.247125945237526e-06, + "loss": 0.7474, + "step": 31590 + }, + { + "epoch": 1.78, + "grad_norm": 5.23186061281606, + "learning_rate": 4.245505660712844e-06, + "loss": 0.7482, + "step": 31595 + }, + { + "epoch": 1.78, + "grad_norm": 4.865601469920776, + "learning_rate": 4.243885457261913e-06, + "loss": 0.7795, + "step": 31600 + }, + { + "epoch": 1.78, + "grad_norm": 6.505457591036943, + "learning_rate": 4.242265335058823e-06, + "loss": 0.7553, + "step": 31605 + }, + { + "epoch": 1.78, + "grad_norm": 6.736200050014133, + "learning_rate": 4.24064529427767e-06, + "loss": 0.8207, + "step": 31610 + }, + { + "epoch": 1.78, + "grad_norm": 5.936635635092902, + "learning_rate": 4.2390253350925295e-06, + "loss": 0.7855, + "step": 31615 + }, + { + "epoch": 1.78, + "grad_norm": 8.733425246430409, + "learning_rate": 4.237405457677476e-06, + "loss": 0.7755, + "step": 31620 + }, + { + "epoch": 1.78, + "grad_norm": 5.201225649268791, + "learning_rate": 4.235785662206571e-06, + "loss": 0.7493, + "step": 31625 + }, + { + "epoch": 1.78, + "grad_norm": 14.019078487520051, + "learning_rate": 4.2341659488538674e-06, + "loss": 0.7154, + "step": 31630 + }, + { + "epoch": 1.78, + "grad_norm": 6.258473765302222, + "learning_rate": 4.2325463177934165e-06, + "loss": 0.7571, + "step": 31635 + }, + { + "epoch": 1.78, + "grad_norm": 6.124520434246618, + "learning_rate": 4.23092676919925e-06, + "loss": 0.7879, + "step": 31640 + }, + { + "epoch": 1.78, + "grad_norm": 8.297941820023116, + "learning_rate": 4.229307303245393e-06, + "loss": 0.7946, + "step": 31645 + }, + { + "epoch": 1.78, + "grad_norm": 5.095887150025404, + "learning_rate": 4.227687920105871e-06, + "loss": 0.7504, + "step": 31650 + }, + { + "epoch": 1.78, + "grad_norm": 5.817195496141892, + "learning_rate": 4.226068619954688e-06, + "loss": 0.7546, + "step": 31655 + }, + { + "epoch": 1.78, + "grad_norm": 7.720075996295372, + "learning_rate": 4.22444940296585e-06, + "loss": 0.7872, + "step": 31660 + }, + { + "epoch": 1.78, + "grad_norm": 7.413683138621276, + "learning_rate": 4.222830269313344e-06, + "loss": 0.7135, + "step": 31665 + }, + { + "epoch": 1.78, + "grad_norm": 6.4658339719402305, + "learning_rate": 4.221211219171156e-06, + "loss": 0.7615, + "step": 31670 + }, + { + "epoch": 1.78, + "grad_norm": 6.810946550099423, + "learning_rate": 4.2195922527132615e-06, + "loss": 0.8076, + "step": 31675 + }, + { + "epoch": 1.78, + "grad_norm": 4.951481803182469, + "learning_rate": 4.217973370113622e-06, + "loss": 0.7608, + "step": 31680 + }, + { + "epoch": 1.79, + "grad_norm": 4.847401075071992, + "learning_rate": 4.216354571546195e-06, + "loss": 0.7598, + "step": 31685 + }, + { + "epoch": 1.79, + "grad_norm": 4.7501901210612845, + "learning_rate": 4.214735857184928e-06, + "loss": 0.745, + "step": 31690 + }, + { + "epoch": 1.79, + "grad_norm": 4.958178236177581, + "learning_rate": 4.21311722720376e-06, + "loss": 0.7506, + "step": 31695 + }, + { + "epoch": 1.79, + "grad_norm": 6.857799563866652, + "learning_rate": 4.211498681776619e-06, + "loss": 0.7283, + "step": 31700 + }, + { + "epoch": 1.79, + "grad_norm": 7.835098289625643, + "learning_rate": 4.209880221077422e-06, + "loss": 0.7535, + "step": 31705 + }, + { + "epoch": 1.79, + "grad_norm": 10.266675177355465, + "learning_rate": 4.208261845280084e-06, + "loss": 0.7948, + "step": 31710 + }, + { + "epoch": 1.79, + "grad_norm": 4.742704098505882, + "learning_rate": 4.206643554558504e-06, + "loss": 0.777, + "step": 31715 + }, + { + "epoch": 1.79, + "grad_norm": 8.663389553754389, + "learning_rate": 4.205025349086576e-06, + "loss": 0.7606, + "step": 31720 + }, + { + "epoch": 1.79, + "grad_norm": 13.655969345425756, + "learning_rate": 4.2034072290381835e-06, + "loss": 0.7612, + "step": 31725 + }, + { + "epoch": 1.79, + "grad_norm": 15.77392702473303, + "learning_rate": 4.201789194587198e-06, + "loss": 0.7682, + "step": 31730 + }, + { + "epoch": 1.79, + "grad_norm": 5.160671480717663, + "learning_rate": 4.200171245907488e-06, + "loss": 0.7027, + "step": 31735 + }, + { + "epoch": 1.79, + "grad_norm": 22.467000756344046, + "learning_rate": 4.1985533831729064e-06, + "loss": 0.7651, + "step": 31740 + }, + { + "epoch": 1.79, + "grad_norm": 4.860220719417435, + "learning_rate": 4.196935606557302e-06, + "loss": 0.751, + "step": 31745 + }, + { + "epoch": 1.79, + "grad_norm": 7.693644957629103, + "learning_rate": 4.195317916234509e-06, + "loss": 0.764, + "step": 31750 + }, + { + "epoch": 1.79, + "grad_norm": 6.04868919779745, + "learning_rate": 4.19370031237836e-06, + "loss": 0.7589, + "step": 31755 + }, + { + "epoch": 1.79, + "grad_norm": 7.029436333630056, + "learning_rate": 4.192082795162669e-06, + "loss": 0.7573, + "step": 31760 + }, + { + "epoch": 1.79, + "grad_norm": 5.80530077768993, + "learning_rate": 4.190465364761247e-06, + "loss": 0.7856, + "step": 31765 + }, + { + "epoch": 1.79, + "grad_norm": 4.832406430152098, + "learning_rate": 4.188848021347895e-06, + "loss": 0.758, + "step": 31770 + }, + { + "epoch": 1.79, + "grad_norm": 4.94655597758038, + "learning_rate": 4.1872307650964025e-06, + "loss": 0.7592, + "step": 31775 + }, + { + "epoch": 1.79, + "grad_norm": 4.648696778511196, + "learning_rate": 4.185613596180553e-06, + "loss": 0.7683, + "step": 31780 + }, + { + "epoch": 1.79, + "grad_norm": 6.204299199503271, + "learning_rate": 4.183996514774116e-06, + "loss": 0.7188, + "step": 31785 + }, + { + "epoch": 1.79, + "grad_norm": 6.507753091151947, + "learning_rate": 4.182379521050853e-06, + "loss": 0.7693, + "step": 31790 + }, + { + "epoch": 1.79, + "grad_norm": 4.454095096314965, + "learning_rate": 4.180762615184521e-06, + "loss": 0.7189, + "step": 31795 + }, + { + "epoch": 1.79, + "grad_norm": 5.288290058462045, + "learning_rate": 4.17914579734886e-06, + "loss": 0.722, + "step": 31800 + }, + { + "epoch": 1.79, + "grad_norm": 11.642839848928702, + "learning_rate": 4.177529067717607e-06, + "loss": 0.7602, + "step": 31805 + }, + { + "epoch": 1.79, + "grad_norm": 5.890058280333357, + "learning_rate": 4.175912426464485e-06, + "loss": 0.7351, + "step": 31810 + }, + { + "epoch": 1.79, + "grad_norm": 15.571984915838518, + "learning_rate": 4.1742958737632095e-06, + "loss": 0.8144, + "step": 31815 + }, + { + "epoch": 1.79, + "grad_norm": 5.200743083525845, + "learning_rate": 4.172679409787488e-06, + "loss": 0.7529, + "step": 31820 + }, + { + "epoch": 1.79, + "grad_norm": 6.3007219715715586, + "learning_rate": 4.171063034711012e-06, + "loss": 0.7663, + "step": 31825 + }, + { + "epoch": 1.79, + "grad_norm": 5.05394089101257, + "learning_rate": 4.169446748707475e-06, + "loss": 0.7661, + "step": 31830 + }, + { + "epoch": 1.79, + "grad_norm": 7.6760887125767425, + "learning_rate": 4.167830551950548e-06, + "loss": 0.7724, + "step": 31835 + }, + { + "epoch": 1.79, + "grad_norm": 4.962089281564885, + "learning_rate": 4.166214444613902e-06, + "loss": 0.7636, + "step": 31840 + }, + { + "epoch": 1.79, + "grad_norm": 5.12419493624849, + "learning_rate": 4.164598426871193e-06, + "loss": 0.7596, + "step": 31845 + }, + { + "epoch": 1.79, + "grad_norm": 7.593275584109431, + "learning_rate": 4.16298249889607e-06, + "loss": 0.7726, + "step": 31850 + }, + { + "epoch": 1.79, + "grad_norm": 5.8467002907292915, + "learning_rate": 4.161366660862171e-06, + "loss": 0.7463, + "step": 31855 + }, + { + "epoch": 1.8, + "grad_norm": 6.195673443419892, + "learning_rate": 4.159750912943125e-06, + "loss": 0.7892, + "step": 31860 + }, + { + "epoch": 1.8, + "grad_norm": 5.188407050668656, + "learning_rate": 4.158135255312554e-06, + "loss": 0.7294, + "step": 31865 + }, + { + "epoch": 1.8, + "grad_norm": 5.178572546014093, + "learning_rate": 4.156519688144065e-06, + "loss": 0.7363, + "step": 31870 + }, + { + "epoch": 1.8, + "grad_norm": 5.89703040686469, + "learning_rate": 4.154904211611256e-06, + "loss": 0.7305, + "step": 31875 + }, + { + "epoch": 1.8, + "grad_norm": 5.000373430155154, + "learning_rate": 4.153288825887721e-06, + "loss": 0.7757, + "step": 31880 + }, + { + "epoch": 1.8, + "grad_norm": 7.363742379551147, + "learning_rate": 4.151673531147037e-06, + "loss": 0.7347, + "step": 31885 + }, + { + "epoch": 1.8, + "grad_norm": 5.149094319391997, + "learning_rate": 4.1500583275627784e-06, + "loss": 0.7516, + "step": 31890 + }, + { + "epoch": 1.8, + "grad_norm": 6.262698444911576, + "learning_rate": 4.1484432153085e-06, + "loss": 0.7484, + "step": 31895 + }, + { + "epoch": 1.8, + "grad_norm": 12.415766897917479, + "learning_rate": 4.146828194557758e-06, + "loss": 0.7656, + "step": 31900 + }, + { + "epoch": 1.8, + "grad_norm": 5.81800745427543, + "learning_rate": 4.145213265484094e-06, + "loss": 0.7295, + "step": 31905 + }, + { + "epoch": 1.8, + "grad_norm": 5.304317114743605, + "learning_rate": 4.143598428261034e-06, + "loss": 0.7087, + "step": 31910 + }, + { + "epoch": 1.8, + "grad_norm": 5.1246612257695965, + "learning_rate": 4.141983683062104e-06, + "loss": 0.7485, + "step": 31915 + }, + { + "epoch": 1.8, + "grad_norm": 5.959453601082129, + "learning_rate": 4.140369030060813e-06, + "loss": 0.7555, + "step": 31920 + }, + { + "epoch": 1.8, + "grad_norm": 5.836317126015636, + "learning_rate": 4.1387544694306646e-06, + "loss": 0.7428, + "step": 31925 + }, + { + "epoch": 1.8, + "grad_norm": 5.812818822265653, + "learning_rate": 4.13714000134515e-06, + "loss": 0.8003, + "step": 31930 + }, + { + "epoch": 1.8, + "grad_norm": 5.2243360596537975, + "learning_rate": 4.1355256259777485e-06, + "loss": 0.756, + "step": 31935 + }, + { + "epoch": 1.8, + "grad_norm": 5.630192128442654, + "learning_rate": 4.1339113435019345e-06, + "loss": 0.7846, + "step": 31940 + }, + { + "epoch": 1.8, + "grad_norm": 13.166171758026652, + "learning_rate": 4.132297154091169e-06, + "loss": 0.7431, + "step": 31945 + }, + { + "epoch": 1.8, + "grad_norm": 5.57799394956431, + "learning_rate": 4.130683057918904e-06, + "loss": 0.7716, + "step": 31950 + }, + { + "epoch": 1.8, + "grad_norm": 8.935452628773687, + "learning_rate": 4.129069055158582e-06, + "loss": 0.8137, + "step": 31955 + }, + { + "epoch": 1.8, + "grad_norm": 4.891814026575228, + "learning_rate": 4.127455145983631e-06, + "loss": 0.7307, + "step": 31960 + }, + { + "epoch": 1.8, + "grad_norm": 5.038499061185585, + "learning_rate": 4.125841330567478e-06, + "loss": 0.7278, + "step": 31965 + }, + { + "epoch": 1.8, + "grad_norm": 5.284721168296819, + "learning_rate": 4.12422760908353e-06, + "loss": 0.7421, + "step": 31970 + }, + { + "epoch": 1.8, + "grad_norm": 7.111371571582586, + "learning_rate": 4.122613981705192e-06, + "loss": 0.7344, + "step": 31975 + }, + { + "epoch": 1.8, + "grad_norm": 4.60813408911817, + "learning_rate": 4.121000448605852e-06, + "loss": 0.7979, + "step": 31980 + }, + { + "epoch": 1.8, + "grad_norm": 23.050318204325972, + "learning_rate": 4.119387009958896e-06, + "loss": 0.7527, + "step": 31985 + }, + { + "epoch": 1.8, + "grad_norm": 5.453800733158293, + "learning_rate": 4.11777366593769e-06, + "loss": 0.7193, + "step": 31990 + }, + { + "epoch": 1.8, + "grad_norm": 4.929249730939494, + "learning_rate": 4.116160416715597e-06, + "loss": 0.766, + "step": 31995 + }, + { + "epoch": 1.8, + "grad_norm": 5.514173746917727, + "learning_rate": 4.11454726246597e-06, + "loss": 0.7525, + "step": 32000 + }, + { + "epoch": 1.8, + "grad_norm": 11.732700795478207, + "learning_rate": 4.112934203362145e-06, + "loss": 0.7508, + "step": 32005 + }, + { + "epoch": 1.8, + "grad_norm": 6.244566091873192, + "learning_rate": 4.1113212395774575e-06, + "loss": 0.7686, + "step": 32010 + }, + { + "epoch": 1.8, + "grad_norm": 15.900792083792036, + "learning_rate": 4.109708371285224e-06, + "loss": 0.7596, + "step": 32015 + }, + { + "epoch": 1.8, + "grad_norm": 4.53746529168734, + "learning_rate": 4.108095598658753e-06, + "loss": 0.7583, + "step": 32020 + }, + { + "epoch": 1.8, + "grad_norm": 5.774204067156112, + "learning_rate": 4.1064829218713485e-06, + "loss": 0.7658, + "step": 32025 + }, + { + "epoch": 1.8, + "grad_norm": 5.529234728684587, + "learning_rate": 4.104870341096295e-06, + "loss": 0.7511, + "step": 32030 + }, + { + "epoch": 1.8, + "grad_norm": 7.594346783118083, + "learning_rate": 4.103257856506876e-06, + "loss": 0.7331, + "step": 32035 + }, + { + "epoch": 1.81, + "grad_norm": 5.646534008548479, + "learning_rate": 4.101645468276358e-06, + "loss": 0.7753, + "step": 32040 + }, + { + "epoch": 1.81, + "grad_norm": 12.008223688407055, + "learning_rate": 4.100033176577998e-06, + "loss": 0.7485, + "step": 32045 + }, + { + "epoch": 1.81, + "grad_norm": 9.050050521919625, + "learning_rate": 4.098420981585046e-06, + "loss": 0.7368, + "step": 32050 + }, + { + "epoch": 1.81, + "grad_norm": 8.798072441024582, + "learning_rate": 4.096808883470737e-06, + "loss": 0.7529, + "step": 32055 + }, + { + "epoch": 1.81, + "grad_norm": 7.027194373653681, + "learning_rate": 4.095196882408302e-06, + "loss": 0.7647, + "step": 32060 + }, + { + "epoch": 1.81, + "grad_norm": 6.869935624755382, + "learning_rate": 4.093584978570952e-06, + "loss": 0.7633, + "step": 32065 + }, + { + "epoch": 1.81, + "grad_norm": 10.831572464823065, + "learning_rate": 4.0919731721319e-06, + "loss": 0.7689, + "step": 32070 + }, + { + "epoch": 1.81, + "grad_norm": 6.366645756409165, + "learning_rate": 4.090361463264336e-06, + "loss": 0.7306, + "step": 32075 + }, + { + "epoch": 1.81, + "grad_norm": 7.910519208879556, + "learning_rate": 4.088749852141447e-06, + "loss": 0.7194, + "step": 32080 + }, + { + "epoch": 1.81, + "grad_norm": 5.941402669165022, + "learning_rate": 4.087138338936409e-06, + "loss": 0.7607, + "step": 32085 + }, + { + "epoch": 1.81, + "grad_norm": 6.276830228065567, + "learning_rate": 4.085526923822383e-06, + "loss": 0.7706, + "step": 32090 + }, + { + "epoch": 1.81, + "grad_norm": 9.313920519669926, + "learning_rate": 4.0839156069725286e-06, + "loss": 0.7602, + "step": 32095 + }, + { + "epoch": 1.81, + "grad_norm": 5.744053032382729, + "learning_rate": 4.082304388559984e-06, + "loss": 0.77, + "step": 32100 + }, + { + "epoch": 1.81, + "grad_norm": 7.480837100740973, + "learning_rate": 4.080693268757881e-06, + "loss": 0.7392, + "step": 32105 + }, + { + "epoch": 1.81, + "grad_norm": 9.434959462726063, + "learning_rate": 4.079082247739346e-06, + "loss": 0.7386, + "step": 32110 + }, + { + "epoch": 1.81, + "grad_norm": 13.855344306413238, + "learning_rate": 4.077471325677485e-06, + "loss": 0.791, + "step": 32115 + }, + { + "epoch": 1.81, + "grad_norm": 7.09501038994656, + "learning_rate": 4.0758605027454045e-06, + "loss": 0.7569, + "step": 32120 + }, + { + "epoch": 1.81, + "grad_norm": 10.245371001691915, + "learning_rate": 4.07424977911619e-06, + "loss": 0.7775, + "step": 32125 + }, + { + "epoch": 1.81, + "grad_norm": 6.365929182518692, + "learning_rate": 4.072639154962922e-06, + "loss": 0.7653, + "step": 32130 + }, + { + "epoch": 1.81, + "grad_norm": 8.027473029988867, + "learning_rate": 4.071028630458671e-06, + "loss": 0.7751, + "step": 32135 + }, + { + "epoch": 1.81, + "grad_norm": 4.575579205576042, + "learning_rate": 4.069418205776491e-06, + "loss": 0.7451, + "step": 32140 + }, + { + "epoch": 1.81, + "grad_norm": 6.401146694741413, + "learning_rate": 4.067807881089434e-06, + "loss": 0.7245, + "step": 32145 + }, + { + "epoch": 1.81, + "grad_norm": 4.96988948956007, + "learning_rate": 4.066197656570533e-06, + "loss": 0.7655, + "step": 32150 + }, + { + "epoch": 1.81, + "grad_norm": 13.77269771981435, + "learning_rate": 4.064587532392817e-06, + "loss": 0.7382, + "step": 32155 + }, + { + "epoch": 1.81, + "grad_norm": 8.637743519235023, + "learning_rate": 4.062977508729298e-06, + "loss": 0.7631, + "step": 32160 + }, + { + "epoch": 1.81, + "grad_norm": 5.616435366436222, + "learning_rate": 4.0613675857529795e-06, + "loss": 0.7532, + "step": 32165 + }, + { + "epoch": 1.81, + "grad_norm": 5.412442146282715, + "learning_rate": 4.059757763636858e-06, + "loss": 0.7628, + "step": 32170 + }, + { + "epoch": 1.81, + "grad_norm": 9.549725817186147, + "learning_rate": 4.0581480425539135e-06, + "loss": 0.7391, + "step": 32175 + }, + { + "epoch": 1.81, + "grad_norm": 7.089406068427861, + "learning_rate": 4.056538422677119e-06, + "loss": 0.7484, + "step": 32180 + }, + { + "epoch": 1.81, + "grad_norm": 7.697875326602824, + "learning_rate": 4.054928904179435e-06, + "loss": 0.7399, + "step": 32185 + }, + { + "epoch": 1.81, + "grad_norm": 6.837444253034605, + "learning_rate": 4.05331948723381e-06, + "loss": 0.7692, + "step": 32190 + }, + { + "epoch": 1.81, + "grad_norm": 10.681669853385758, + "learning_rate": 4.051710172013186e-06, + "loss": 0.7832, + "step": 32195 + }, + { + "epoch": 1.81, + "grad_norm": 16.878806670156813, + "learning_rate": 4.050100958690487e-06, + "loss": 0.7259, + "step": 32200 + }, + { + "epoch": 1.81, + "grad_norm": 11.740676691401125, + "learning_rate": 4.048491847438634e-06, + "loss": 0.7078, + "step": 32205 + }, + { + "epoch": 1.81, + "grad_norm": 9.002376127598739, + "learning_rate": 4.0468828384305316e-06, + "loss": 0.7415, + "step": 32210 + }, + { + "epoch": 1.82, + "grad_norm": 5.982992071388394, + "learning_rate": 4.045273931839073e-06, + "loss": 0.7274, + "step": 32215 + }, + { + "epoch": 1.82, + "grad_norm": 10.126164475039666, + "learning_rate": 4.043665127837145e-06, + "loss": 0.741, + "step": 32220 + }, + { + "epoch": 1.82, + "grad_norm": 9.459062382502498, + "learning_rate": 4.042056426597618e-06, + "loss": 0.7541, + "step": 32225 + }, + { + "epoch": 1.82, + "grad_norm": 5.096543727090931, + "learning_rate": 4.040447828293359e-06, + "loss": 0.737, + "step": 32230 + }, + { + "epoch": 1.82, + "grad_norm": 5.059764483812855, + "learning_rate": 4.038839333097212e-06, + "loss": 0.7343, + "step": 32235 + }, + { + "epoch": 1.82, + "grad_norm": 5.2378777671462196, + "learning_rate": 4.0372309411820235e-06, + "loss": 0.7547, + "step": 32240 + }, + { + "epoch": 1.82, + "grad_norm": 108.92030761273968, + "learning_rate": 4.035622652720619e-06, + "loss": 0.7671, + "step": 32245 + }, + { + "epoch": 1.82, + "grad_norm": 43.745775882897625, + "learning_rate": 4.034014467885814e-06, + "loss": 0.7815, + "step": 32250 + }, + { + "epoch": 1.82, + "grad_norm": 49.83487826438429, + "learning_rate": 4.032406386850421e-06, + "loss": 0.7428, + "step": 32255 + }, + { + "epoch": 1.82, + "grad_norm": 24.095843495208634, + "learning_rate": 4.030798409787229e-06, + "loss": 0.7578, + "step": 32260 + }, + { + "epoch": 1.82, + "grad_norm": 13.34996733573595, + "learning_rate": 4.029190536869027e-06, + "loss": 0.7143, + "step": 32265 + }, + { + "epoch": 1.82, + "grad_norm": 7.93765065178245, + "learning_rate": 4.027582768268587e-06, + "loss": 0.7293, + "step": 32270 + }, + { + "epoch": 1.82, + "grad_norm": 8.382284251542114, + "learning_rate": 4.025975104158668e-06, + "loss": 0.7425, + "step": 32275 + }, + { + "epoch": 1.82, + "grad_norm": 8.361834811889404, + "learning_rate": 4.024367544712025e-06, + "loss": 0.7611, + "step": 32280 + }, + { + "epoch": 1.82, + "grad_norm": 5.7089254654863355, + "learning_rate": 4.022760090101393e-06, + "loss": 0.7659, + "step": 32285 + }, + { + "epoch": 1.82, + "grad_norm": 7.064004428185941, + "learning_rate": 4.0211527404995045e-06, + "loss": 0.7625, + "step": 32290 + }, + { + "epoch": 1.82, + "grad_norm": 4.697429299709736, + "learning_rate": 4.0195454960790715e-06, + "loss": 0.7129, + "step": 32295 + }, + { + "epoch": 1.82, + "grad_norm": 5.4286762907772586, + "learning_rate": 4.017938357012804e-06, + "loss": 0.7145, + "step": 32300 + }, + { + "epoch": 1.82, + "grad_norm": 4.945370514148755, + "learning_rate": 4.016331323473393e-06, + "loss": 0.7162, + "step": 32305 + }, + { + "epoch": 1.82, + "grad_norm": 4.789212200932463, + "learning_rate": 4.014724395633523e-06, + "loss": 0.7424, + "step": 32310 + }, + { + "epoch": 1.82, + "grad_norm": 12.104215820493675, + "learning_rate": 4.013117573665866e-06, + "loss": 0.7726, + "step": 32315 + }, + { + "epoch": 1.82, + "grad_norm": 16.668650501945677, + "learning_rate": 4.011510857743079e-06, + "loss": 0.7428, + "step": 32320 + }, + { + "epoch": 1.82, + "grad_norm": 19.135462965548953, + "learning_rate": 4.0099042480378155e-06, + "loss": 0.7649, + "step": 32325 + }, + { + "epoch": 1.82, + "grad_norm": 7.276693225153347, + "learning_rate": 4.008297744722709e-06, + "loss": 0.7273, + "step": 32330 + }, + { + "epoch": 1.82, + "grad_norm": 12.683834211031284, + "learning_rate": 4.006691347970386e-06, + "loss": 0.759, + "step": 32335 + }, + { + "epoch": 1.82, + "grad_norm": 5.1302909383361275, + "learning_rate": 4.005085057953463e-06, + "loss": 0.7441, + "step": 32340 + }, + { + "epoch": 1.82, + "grad_norm": 5.763589626580232, + "learning_rate": 4.0034788748445385e-06, + "loss": 0.6944, + "step": 32345 + }, + { + "epoch": 1.82, + "grad_norm": 4.823536433778182, + "learning_rate": 4.00187279881621e-06, + "loss": 0.7211, + "step": 32350 + }, + { + "epoch": 1.82, + "grad_norm": 4.890491462853178, + "learning_rate": 4.000266830041053e-06, + "loss": 0.7386, + "step": 32355 + }, + { + "epoch": 1.82, + "grad_norm": 6.279807004199837, + "learning_rate": 3.998660968691636e-06, + "loss": 0.7236, + "step": 32360 + }, + { + "epoch": 1.82, + "grad_norm": 6.480624980131441, + "learning_rate": 3.99705521494052e-06, + "loss": 0.7493, + "step": 32365 + }, + { + "epoch": 1.82, + "grad_norm": 8.14373201760105, + "learning_rate": 3.995449568960244e-06, + "loss": 0.6911, + "step": 32370 + }, + { + "epoch": 1.82, + "grad_norm": 5.308649146086617, + "learning_rate": 3.9938440309233485e-06, + "loss": 0.7192, + "step": 32375 + }, + { + "epoch": 1.82, + "grad_norm": 9.723555952214085, + "learning_rate": 3.99223860100235e-06, + "loss": 0.8161, + "step": 32380 + }, + { + "epoch": 1.82, + "grad_norm": 5.9143905317605245, + "learning_rate": 3.990633279369763e-06, + "loss": 0.7385, + "step": 32385 + }, + { + "epoch": 1.82, + "grad_norm": 8.874189250447786, + "learning_rate": 3.989028066198085e-06, + "loss": 0.7598, + "step": 32390 + }, + { + "epoch": 1.83, + "grad_norm": 5.392860507148786, + "learning_rate": 3.987422961659801e-06, + "loss": 0.7392, + "step": 32395 + }, + { + "epoch": 1.83, + "grad_norm": 4.752071227232578, + "learning_rate": 3.98581796592739e-06, + "loss": 0.7702, + "step": 32400 + }, + { + "epoch": 1.83, + "grad_norm": 7.491806669605895, + "learning_rate": 3.984213079173314e-06, + "loss": 0.7225, + "step": 32405 + }, + { + "epoch": 1.83, + "grad_norm": 5.027790376668704, + "learning_rate": 3.982608301570027e-06, + "loss": 0.7446, + "step": 32410 + }, + { + "epoch": 1.83, + "grad_norm": 7.539158941213466, + "learning_rate": 3.981003633289968e-06, + "loss": 0.7488, + "step": 32415 + }, + { + "epoch": 1.83, + "grad_norm": 8.506726147339293, + "learning_rate": 3.979399074505564e-06, + "loss": 0.7352, + "step": 32420 + }, + { + "epoch": 1.83, + "grad_norm": 6.977666467236214, + "learning_rate": 3.977794625389235e-06, + "loss": 0.7606, + "step": 32425 + }, + { + "epoch": 1.83, + "grad_norm": 5.131229336936734, + "learning_rate": 3.976190286113384e-06, + "loss": 0.7475, + "step": 32430 + }, + { + "epoch": 1.83, + "grad_norm": 5.437401289549152, + "learning_rate": 3.974586056850406e-06, + "loss": 0.7811, + "step": 32435 + }, + { + "epoch": 1.83, + "grad_norm": 5.246324271116518, + "learning_rate": 3.972981937772681e-06, + "loss": 0.7579, + "step": 32440 + }, + { + "epoch": 1.83, + "grad_norm": 6.721942524027835, + "learning_rate": 3.97137792905258e-06, + "loss": 0.8232, + "step": 32445 + }, + { + "epoch": 1.83, + "grad_norm": 7.269173428548643, + "learning_rate": 3.96977403086246e-06, + "loss": 0.7119, + "step": 32450 + }, + { + "epoch": 1.83, + "grad_norm": 7.602933566525741, + "learning_rate": 3.968170243374666e-06, + "loss": 0.7188, + "step": 32455 + }, + { + "epoch": 1.83, + "grad_norm": 9.749783608193544, + "learning_rate": 3.966566566761535e-06, + "loss": 0.7708, + "step": 32460 + }, + { + "epoch": 1.83, + "grad_norm": 5.059023493743448, + "learning_rate": 3.964963001195385e-06, + "loss": 0.714, + "step": 32465 + }, + { + "epoch": 1.83, + "grad_norm": 11.045492016603623, + "learning_rate": 3.963359546848531e-06, + "loss": 0.7396, + "step": 32470 + }, + { + "epoch": 1.83, + "grad_norm": 12.890409114883049, + "learning_rate": 3.961756203893269e-06, + "loss": 0.7636, + "step": 32475 + }, + { + "epoch": 1.83, + "grad_norm": 10.739718233715035, + "learning_rate": 3.960152972501881e-06, + "loss": 0.755, + "step": 32480 + }, + { + "epoch": 1.83, + "grad_norm": 13.876632833186473, + "learning_rate": 3.9585498528466495e-06, + "loss": 0.7837, + "step": 32485 + }, + { + "epoch": 1.83, + "grad_norm": 6.417276614614368, + "learning_rate": 3.95694684509983e-06, + "loss": 0.7206, + "step": 32490 + }, + { + "epoch": 1.83, + "grad_norm": 29.818927515288983, + "learning_rate": 3.955343949433677e-06, + "loss": 0.7615, + "step": 32495 + }, + { + "epoch": 1.83, + "grad_norm": 7.838857168759922, + "learning_rate": 3.9537411660204275e-06, + "loss": 0.7185, + "step": 32500 + }, + { + "epoch": 1.83, + "grad_norm": 12.803653681464658, + "learning_rate": 3.952138495032304e-06, + "loss": 0.7308, + "step": 32505 + }, + { + "epoch": 1.83, + "grad_norm": 6.529705026388461, + "learning_rate": 3.950535936641527e-06, + "loss": 0.741, + "step": 32510 + }, + { + "epoch": 1.83, + "grad_norm": 5.914597086797618, + "learning_rate": 3.948933491020293e-06, + "loss": 0.7171, + "step": 32515 + }, + { + "epoch": 1.83, + "grad_norm": 5.0065243443796, + "learning_rate": 3.947331158340796e-06, + "loss": 0.751, + "step": 32520 + }, + { + "epoch": 1.83, + "grad_norm": 4.829841352091856, + "learning_rate": 3.945728938775213e-06, + "loss": 0.7525, + "step": 32525 + }, + { + "epoch": 1.83, + "grad_norm": 4.874341182645252, + "learning_rate": 3.944126832495705e-06, + "loss": 0.7685, + "step": 32530 + }, + { + "epoch": 1.83, + "grad_norm": 9.961402296516516, + "learning_rate": 3.94252483967443e-06, + "loss": 0.7186, + "step": 32535 + }, + { + "epoch": 1.83, + "grad_norm": 7.212277421026527, + "learning_rate": 3.940922960483528e-06, + "loss": 0.7439, + "step": 32540 + }, + { + "epoch": 1.83, + "grad_norm": 7.139824947731008, + "learning_rate": 3.939321195095128e-06, + "loss": 0.7, + "step": 32545 + }, + { + "epoch": 1.83, + "grad_norm": 13.384602426818018, + "learning_rate": 3.9377195436813455e-06, + "loss": 0.7088, + "step": 32550 + }, + { + "epoch": 1.83, + "grad_norm": 7.924989819051035, + "learning_rate": 3.936118006414289e-06, + "loss": 0.7283, + "step": 32555 + }, + { + "epoch": 1.83, + "grad_norm": 8.195762346201736, + "learning_rate": 3.934516583466048e-06, + "loss": 0.7697, + "step": 32560 + }, + { + "epoch": 1.83, + "grad_norm": 4.924731308787463, + "learning_rate": 3.9329152750086996e-06, + "loss": 0.7106, + "step": 32565 + }, + { + "epoch": 1.84, + "grad_norm": 6.19683831568055, + "learning_rate": 3.931314081214317e-06, + "loss": 0.7297, + "step": 32570 + }, + { + "epoch": 1.84, + "grad_norm": 4.780190002870143, + "learning_rate": 3.92971300225495e-06, + "loss": 0.7038, + "step": 32575 + }, + { + "epoch": 1.84, + "grad_norm": 5.429819226463751, + "learning_rate": 3.928112038302647e-06, + "loss": 0.7694, + "step": 32580 + }, + { + "epoch": 1.84, + "grad_norm": 5.061228380966536, + "learning_rate": 3.926511189529435e-06, + "loss": 0.7275, + "step": 32585 + }, + { + "epoch": 1.84, + "grad_norm": 5.880655402822012, + "learning_rate": 3.924910456107333e-06, + "loss": 0.7909, + "step": 32590 + }, + { + "epoch": 1.84, + "grad_norm": 12.148758703138844, + "learning_rate": 3.923309838208349e-06, + "loss": 0.7439, + "step": 32595 + }, + { + "epoch": 1.84, + "grad_norm": 7.109903427231474, + "learning_rate": 3.921709336004471e-06, + "loss": 0.7278, + "step": 32600 + }, + { + "epoch": 1.84, + "grad_norm": 5.192494836222281, + "learning_rate": 3.920108949667687e-06, + "loss": 0.7814, + "step": 32605 + }, + { + "epoch": 1.84, + "grad_norm": 7.978464096329849, + "learning_rate": 3.9185086793699625e-06, + "loss": 0.751, + "step": 32610 + }, + { + "epoch": 1.84, + "grad_norm": 7.304591997611985, + "learning_rate": 3.916908525283252e-06, + "loss": 0.7367, + "step": 32615 + }, + { + "epoch": 1.84, + "grad_norm": 5.2673916109453796, + "learning_rate": 3.915308487579501e-06, + "loss": 0.7647, + "step": 32620 + }, + { + "epoch": 1.84, + "grad_norm": 5.867734512133385, + "learning_rate": 3.913708566430639e-06, + "loss": 0.734, + "step": 32625 + }, + { + "epoch": 1.84, + "grad_norm": 4.756938881781644, + "learning_rate": 3.912108762008586e-06, + "loss": 0.7547, + "step": 32630 + }, + { + "epoch": 1.84, + "grad_norm": 5.629908597637033, + "learning_rate": 3.910509074485249e-06, + "loss": 0.721, + "step": 32635 + }, + { + "epoch": 1.84, + "grad_norm": 4.891973908041338, + "learning_rate": 3.908909504032519e-06, + "loss": 0.7008, + "step": 32640 + }, + { + "epoch": 1.84, + "grad_norm": 7.809622620180986, + "learning_rate": 3.90731005082228e-06, + "loss": 0.7223, + "step": 32645 + }, + { + "epoch": 1.84, + "grad_norm": 6.444578271276455, + "learning_rate": 3.905710715026396e-06, + "loss": 0.7038, + "step": 32650 + }, + { + "epoch": 1.84, + "grad_norm": 5.431709121181191, + "learning_rate": 3.904111496816727e-06, + "loss": 0.7937, + "step": 32655 + }, + { + "epoch": 1.84, + "grad_norm": 7.224569572129378, + "learning_rate": 3.902512396365113e-06, + "loss": 0.7325, + "step": 32660 + }, + { + "epoch": 1.84, + "grad_norm": 4.706558355927059, + "learning_rate": 3.900913413843387e-06, + "loss": 0.7295, + "step": 32665 + }, + { + "epoch": 1.84, + "grad_norm": 5.198248517035201, + "learning_rate": 3.899314549423364e-06, + "loss": 0.7261, + "step": 32670 + }, + { + "epoch": 1.84, + "grad_norm": 5.716783980003042, + "learning_rate": 3.89771580327685e-06, + "loss": 0.7579, + "step": 32675 + }, + { + "epoch": 1.84, + "grad_norm": 6.870755285483422, + "learning_rate": 3.8961171755756385e-06, + "loss": 0.7657, + "step": 32680 + }, + { + "epoch": 1.84, + "grad_norm": 6.1597131749413405, + "learning_rate": 3.894518666491507e-06, + "loss": 0.7335, + "step": 32685 + }, + { + "epoch": 1.84, + "grad_norm": 5.69179036006, + "learning_rate": 3.892920276196226e-06, + "loss": 0.7355, + "step": 32690 + }, + { + "epoch": 1.84, + "grad_norm": 4.888652128757496, + "learning_rate": 3.891322004861545e-06, + "loss": 0.7774, + "step": 32695 + }, + { + "epoch": 1.84, + "grad_norm": 4.745532279358965, + "learning_rate": 3.88972385265921e-06, + "loss": 0.7423, + "step": 32700 + }, + { + "epoch": 1.84, + "grad_norm": 5.46785471070572, + "learning_rate": 3.888125819760947e-06, + "loss": 0.7153, + "step": 32705 + }, + { + "epoch": 1.84, + "grad_norm": 5.664777069083331, + "learning_rate": 3.8865279063384695e-06, + "loss": 0.7199, + "step": 32710 + }, + { + "epoch": 1.84, + "grad_norm": 11.250637047504215, + "learning_rate": 3.884930112563484e-06, + "loss": 0.7388, + "step": 32715 + }, + { + "epoch": 1.84, + "grad_norm": 11.100398930029195, + "learning_rate": 3.883332438607677e-06, + "loss": 0.7264, + "step": 32720 + }, + { + "epoch": 1.84, + "grad_norm": 8.878890099982266, + "learning_rate": 3.88173488464273e-06, + "loss": 0.7085, + "step": 32725 + }, + { + "epoch": 1.84, + "grad_norm": 7.998895867735593, + "learning_rate": 3.880137450840305e-06, + "loss": 0.7531, + "step": 32730 + }, + { + "epoch": 1.84, + "grad_norm": 5.192499568647865, + "learning_rate": 3.878540137372051e-06, + "loss": 0.7167, + "step": 32735 + }, + { + "epoch": 1.84, + "grad_norm": 5.935148044262539, + "learning_rate": 3.876942944409612e-06, + "loss": 0.7391, + "step": 32740 + }, + { + "epoch": 1.84, + "grad_norm": 4.713431584224038, + "learning_rate": 3.8753458721246065e-06, + "loss": 0.7335, + "step": 32745 + }, + { + "epoch": 1.85, + "grad_norm": 7.154152487697281, + "learning_rate": 3.873748920688653e-06, + "loss": 0.7053, + "step": 32750 + }, + { + "epoch": 1.85, + "grad_norm": 7.250284148721991, + "learning_rate": 3.872152090273349e-06, + "loss": 0.7735, + "step": 32755 + }, + { + "epoch": 1.85, + "grad_norm": 8.628606587302714, + "learning_rate": 3.870555381050279e-06, + "loss": 0.7185, + "step": 32760 + }, + { + "epoch": 1.85, + "grad_norm": 6.948409626958064, + "learning_rate": 3.868958793191017e-06, + "loss": 0.7253, + "step": 32765 + }, + { + "epoch": 1.85, + "grad_norm": 7.46461713091953, + "learning_rate": 3.867362326867126e-06, + "loss": 0.6974, + "step": 32770 + }, + { + "epoch": 1.85, + "grad_norm": 6.726005685383899, + "learning_rate": 3.86576598225015e-06, + "loss": 0.6606, + "step": 32775 + }, + { + "epoch": 1.85, + "grad_norm": 6.835234711108583, + "learning_rate": 3.864169759511625e-06, + "loss": 0.7467, + "step": 32780 + }, + { + "epoch": 1.85, + "grad_norm": 5.697925745596195, + "learning_rate": 3.862573658823075e-06, + "loss": 0.7096, + "step": 32785 + }, + { + "epoch": 1.85, + "grad_norm": 7.802825289105715, + "learning_rate": 3.860977680356004e-06, + "loss": 0.7519, + "step": 32790 + }, + { + "epoch": 1.85, + "grad_norm": 5.749256680968329, + "learning_rate": 3.859381824281906e-06, + "loss": 0.7712, + "step": 32795 + }, + { + "epoch": 1.85, + "grad_norm": 6.726770427574885, + "learning_rate": 3.857786090772268e-06, + "loss": 0.7582, + "step": 32800 + }, + { + "epoch": 1.85, + "grad_norm": 9.721761866857864, + "learning_rate": 3.856190479998553e-06, + "loss": 0.7177, + "step": 32805 + }, + { + "epoch": 1.85, + "grad_norm": 11.158233470231522, + "learning_rate": 3.854594992132221e-06, + "loss": 0.7203, + "step": 32810 + }, + { + "epoch": 1.85, + "grad_norm": 8.827405380450159, + "learning_rate": 3.852999627344711e-06, + "loss": 0.6741, + "step": 32815 + }, + { + "epoch": 1.85, + "grad_norm": 5.828920311623465, + "learning_rate": 3.851404385807452e-06, + "loss": 0.7503, + "step": 32820 + }, + { + "epoch": 1.85, + "grad_norm": 5.042129343470339, + "learning_rate": 3.849809267691863e-06, + "loss": 0.7249, + "step": 32825 + }, + { + "epoch": 1.85, + "grad_norm": 4.8341905136822865, + "learning_rate": 3.8482142731693425e-06, + "loss": 0.7749, + "step": 32830 + }, + { + "epoch": 1.85, + "grad_norm": 5.158277977080501, + "learning_rate": 3.846619402411283e-06, + "loss": 0.6899, + "step": 32835 + }, + { + "epoch": 1.85, + "grad_norm": 5.736043582193756, + "learning_rate": 3.8450246555890595e-06, + "loss": 0.6957, + "step": 32840 + }, + { + "epoch": 1.85, + "grad_norm": 5.758376958474109, + "learning_rate": 3.843430032874032e-06, + "loss": 0.7464, + "step": 32845 + }, + { + "epoch": 1.85, + "grad_norm": 6.30633803296278, + "learning_rate": 3.841835534437554e-06, + "loss": 0.7352, + "step": 32850 + }, + { + "epoch": 1.85, + "grad_norm": 4.835532010013905, + "learning_rate": 3.840241160450957e-06, + "loss": 0.7794, + "step": 32855 + }, + { + "epoch": 1.85, + "grad_norm": 5.039623406688664, + "learning_rate": 3.838646911085566e-06, + "loss": 0.7566, + "step": 32860 + }, + { + "epoch": 1.85, + "grad_norm": 5.281947195613584, + "learning_rate": 3.837052786512691e-06, + "loss": 0.7357, + "step": 32865 + }, + { + "epoch": 1.85, + "grad_norm": 5.168850515003406, + "learning_rate": 3.835458786903627e-06, + "loss": 0.7608, + "step": 32870 + }, + { + "epoch": 1.85, + "grad_norm": 5.867820813372498, + "learning_rate": 3.833864912429655e-06, + "loss": 0.734, + "step": 32875 + }, + { + "epoch": 1.85, + "grad_norm": 6.53356939155142, + "learning_rate": 3.8322711632620434e-06, + "loss": 0.7068, + "step": 32880 + }, + { + "epoch": 1.85, + "grad_norm": 4.84947732054856, + "learning_rate": 3.830677539572052e-06, + "loss": 0.7188, + "step": 32885 + }, + { + "epoch": 1.85, + "grad_norm": 8.341087049202473, + "learning_rate": 3.829084041530916e-06, + "loss": 0.742, + "step": 32890 + }, + { + "epoch": 1.85, + "grad_norm": 7.51850578911999, + "learning_rate": 3.827490669309871e-06, + "loss": 0.7338, + "step": 32895 + }, + { + "epoch": 1.85, + "grad_norm": 7.777830711988462, + "learning_rate": 3.825897423080126e-06, + "loss": 0.6848, + "step": 32900 + }, + { + "epoch": 1.85, + "grad_norm": 6.090353498447795, + "learning_rate": 3.824304303012884e-06, + "loss": 0.7496, + "step": 32905 + }, + { + "epoch": 1.85, + "grad_norm": 4.6346802442528725, + "learning_rate": 3.8227113092793355e-06, + "loss": 0.7971, + "step": 32910 + }, + { + "epoch": 1.85, + "grad_norm": 5.615440735398652, + "learning_rate": 3.82111844205065e-06, + "loss": 0.7531, + "step": 32915 + }, + { + "epoch": 1.85, + "grad_norm": 4.5549202926848125, + "learning_rate": 3.819525701497994e-06, + "loss": 0.7452, + "step": 32920 + }, + { + "epoch": 1.86, + "grad_norm": 5.884484306822319, + "learning_rate": 3.81793308779251e-06, + "loss": 0.7312, + "step": 32925 + }, + { + "epoch": 1.86, + "grad_norm": 4.954313405873962, + "learning_rate": 3.816340601105331e-06, + "loss": 0.7356, + "step": 32930 + }, + { + "epoch": 1.86, + "grad_norm": 7.771480389507657, + "learning_rate": 3.81474824160758e-06, + "loss": 0.7476, + "step": 32935 + }, + { + "epoch": 1.86, + "grad_norm": 5.646446813417095, + "learning_rate": 3.8131560094703597e-06, + "loss": 0.7214, + "step": 32940 + }, + { + "epoch": 1.86, + "grad_norm": 6.139651060425786, + "learning_rate": 3.8115639048647656e-06, + "loss": 0.735, + "step": 32945 + }, + { + "epoch": 1.86, + "grad_norm": 6.570965526611558, + "learning_rate": 3.8099719279618735e-06, + "loss": 0.7725, + "step": 32950 + }, + { + "epoch": 1.86, + "grad_norm": 8.016560523626401, + "learning_rate": 3.8083800789327508e-06, + "loss": 0.6994, + "step": 32955 + }, + { + "epoch": 1.86, + "grad_norm": 6.0086040446261455, + "learning_rate": 3.8067883579484483e-06, + "loss": 0.7449, + "step": 32960 + }, + { + "epoch": 1.86, + "grad_norm": 8.251575214555384, + "learning_rate": 3.805196765180001e-06, + "loss": 0.7212, + "step": 32965 + }, + { + "epoch": 1.86, + "grad_norm": 5.609832168174631, + "learning_rate": 3.803605300798436e-06, + "loss": 0.7261, + "step": 32970 + }, + { + "epoch": 1.86, + "grad_norm": 5.085790608966236, + "learning_rate": 3.80201396497476e-06, + "loss": 0.7001, + "step": 32975 + }, + { + "epoch": 1.86, + "grad_norm": 4.789936282743735, + "learning_rate": 3.8004227578799723e-06, + "loss": 0.727, + "step": 32980 + }, + { + "epoch": 1.86, + "grad_norm": 5.770146878627893, + "learning_rate": 3.7988316796850533e-06, + "loss": 0.762, + "step": 32985 + }, + { + "epoch": 1.86, + "grad_norm": 12.06776056933221, + "learning_rate": 3.797240730560969e-06, + "loss": 0.7501, + "step": 32990 + }, + { + "epoch": 1.86, + "grad_norm": 5.152363973893559, + "learning_rate": 3.795649910678679e-06, + "loss": 0.7479, + "step": 32995 + }, + { + "epoch": 1.86, + "grad_norm": 6.858114408066479, + "learning_rate": 3.79405922020912e-06, + "loss": 0.7296, + "step": 33000 + }, + { + "epoch": 1.86, + "grad_norm": 6.219572523046081, + "learning_rate": 3.79246865932322e-06, + "loss": 0.7491, + "step": 33005 + }, + { + "epoch": 1.86, + "grad_norm": 7.22601817351407, + "learning_rate": 3.7908782281918934e-06, + "loss": 0.7763, + "step": 33010 + }, + { + "epoch": 1.86, + "grad_norm": 4.923123165622001, + "learning_rate": 3.7892879269860348e-06, + "loss": 0.7335, + "step": 33015 + }, + { + "epoch": 1.86, + "grad_norm": 4.878097118926254, + "learning_rate": 3.7876977558765344e-06, + "loss": 0.7478, + "step": 33020 + }, + { + "epoch": 1.86, + "grad_norm": 6.740463168941638, + "learning_rate": 3.786107715034258e-06, + "loss": 0.6969, + "step": 33025 + }, + { + "epoch": 1.86, + "grad_norm": 8.83498617013336, + "learning_rate": 3.784517804630067e-06, + "loss": 0.7134, + "step": 33030 + }, + { + "epoch": 1.86, + "grad_norm": 4.607155632496121, + "learning_rate": 3.7829280248348e-06, + "loss": 0.7279, + "step": 33035 + }, + { + "epoch": 1.86, + "grad_norm": 5.254495005295007, + "learning_rate": 3.7813383758192906e-06, + "loss": 0.7649, + "step": 33040 + }, + { + "epoch": 1.86, + "grad_norm": 5.339475826830785, + "learning_rate": 3.7797488577543494e-06, + "loss": 0.7334, + "step": 33045 + }, + { + "epoch": 1.86, + "grad_norm": 5.010207978408473, + "learning_rate": 3.778159470810778e-06, + "loss": 0.7294, + "step": 33050 + }, + { + "epoch": 1.86, + "grad_norm": 5.52219248375397, + "learning_rate": 3.7765702151593663e-06, + "loss": 0.7453, + "step": 33055 + }, + { + "epoch": 1.86, + "grad_norm": 5.864643104171137, + "learning_rate": 3.7749810909708824e-06, + "loss": 0.7276, + "step": 33060 + }, + { + "epoch": 1.86, + "grad_norm": 5.555456985361319, + "learning_rate": 3.773392098416089e-06, + "loss": 0.6888, + "step": 33065 + }, + { + "epoch": 1.86, + "grad_norm": 4.639870938612942, + "learning_rate": 3.771803237665727e-06, + "loss": 0.7415, + "step": 33070 + }, + { + "epoch": 1.86, + "grad_norm": 9.419973200174649, + "learning_rate": 3.770214508890526e-06, + "loss": 0.7374, + "step": 33075 + }, + { + "epoch": 1.86, + "grad_norm": 15.743170008193136, + "learning_rate": 3.7686259122612057e-06, + "loss": 0.7429, + "step": 33080 + }, + { + "epoch": 1.86, + "grad_norm": 13.7084056169955, + "learning_rate": 3.7670374479484635e-06, + "loss": 0.7328, + "step": 33085 + }, + { + "epoch": 1.86, + "grad_norm": 6.5140714440406375, + "learning_rate": 3.76544911612299e-06, + "loss": 0.7574, + "step": 33090 + }, + { + "epoch": 1.86, + "grad_norm": 16.230336189391053, + "learning_rate": 3.7638609169554584e-06, + "loss": 0.7611, + "step": 33095 + }, + { + "epoch": 1.86, + "grad_norm": 5.12368552845721, + "learning_rate": 3.7622728506165247e-06, + "loss": 0.7384, + "step": 33100 + }, + { + "epoch": 1.87, + "grad_norm": 5.381520027505094, + "learning_rate": 3.760684917276838e-06, + "loss": 0.7302, + "step": 33105 + }, + { + "epoch": 1.87, + "grad_norm": 4.858433678077069, + "learning_rate": 3.759097117107024e-06, + "loss": 0.7331, + "step": 33110 + }, + { + "epoch": 1.87, + "grad_norm": 15.432298389929707, + "learning_rate": 3.7575094502777038e-06, + "loss": 0.7029, + "step": 33115 + }, + { + "epoch": 1.87, + "grad_norm": 5.598382904597247, + "learning_rate": 3.7559219169594742e-06, + "loss": 0.6795, + "step": 33120 + }, + { + "epoch": 1.87, + "grad_norm": 4.787806713123113, + "learning_rate": 3.7543345173229274e-06, + "loss": 0.6868, + "step": 33125 + }, + { + "epoch": 1.87, + "grad_norm": 8.749646146539174, + "learning_rate": 3.752747251538633e-06, + "loss": 0.7897, + "step": 33130 + }, + { + "epoch": 1.87, + "grad_norm": 5.679588388653044, + "learning_rate": 3.751160119777152e-06, + "loss": 0.7627, + "step": 33135 + }, + { + "epoch": 1.87, + "grad_norm": 9.333879132444865, + "learning_rate": 3.7495731222090263e-06, + "loss": 0.6887, + "step": 33140 + }, + { + "epoch": 1.87, + "grad_norm": 4.838699235196046, + "learning_rate": 3.7479862590047868e-06, + "loss": 0.7324, + "step": 33145 + }, + { + "epoch": 1.87, + "grad_norm": 5.495523715311052, + "learning_rate": 3.7463995303349516e-06, + "loss": 0.7341, + "step": 33150 + }, + { + "epoch": 1.87, + "grad_norm": 4.962021011382005, + "learning_rate": 3.744812936370019e-06, + "loss": 0.7222, + "step": 33155 + }, + { + "epoch": 1.87, + "grad_norm": 5.06668332820931, + "learning_rate": 3.743226477280474e-06, + "loss": 0.7076, + "step": 33160 + }, + { + "epoch": 1.87, + "grad_norm": 7.789074340187547, + "learning_rate": 3.741640153236793e-06, + "loss": 0.7071, + "step": 33165 + }, + { + "epoch": 1.87, + "grad_norm": 9.063639419988654, + "learning_rate": 3.740053964409428e-06, + "loss": 0.7113, + "step": 33170 + }, + { + "epoch": 1.87, + "grad_norm": 8.783536229577145, + "learning_rate": 3.7384679109688284e-06, + "loss": 0.7867, + "step": 33175 + }, + { + "epoch": 1.87, + "grad_norm": 5.940079076665862, + "learning_rate": 3.7368819930854165e-06, + "loss": 0.7687, + "step": 33180 + }, + { + "epoch": 1.87, + "grad_norm": 5.1263887119496525, + "learning_rate": 3.7352962109296103e-06, + "loss": 0.734, + "step": 33185 + }, + { + "epoch": 1.87, + "grad_norm": 9.428773217140986, + "learning_rate": 3.7337105646718086e-06, + "loss": 0.738, + "step": 33190 + }, + { + "epoch": 1.87, + "grad_norm": 9.187692467521277, + "learning_rate": 3.732125054482394e-06, + "loss": 0.7588, + "step": 33195 + }, + { + "epoch": 1.87, + "grad_norm": 7.5226357128826224, + "learning_rate": 3.7305396805317394e-06, + "loss": 0.7376, + "step": 33200 + }, + { + "epoch": 1.87, + "grad_norm": 9.837206423449839, + "learning_rate": 3.7289544429901962e-06, + "loss": 0.6971, + "step": 33205 + }, + { + "epoch": 1.87, + "grad_norm": 15.115755968885098, + "learning_rate": 3.7273693420281097e-06, + "loss": 0.7183, + "step": 33210 + }, + { + "epoch": 1.87, + "grad_norm": 5.403755501235667, + "learning_rate": 3.725784377815805e-06, + "loss": 0.6442, + "step": 33215 + }, + { + "epoch": 1.87, + "grad_norm": 4.774291127446088, + "learning_rate": 3.7241995505235893e-06, + "loss": 0.698, + "step": 33220 + }, + { + "epoch": 1.87, + "grad_norm": 7.9821489305670195, + "learning_rate": 3.722614860321764e-06, + "loss": 0.7453, + "step": 33225 + }, + { + "epoch": 1.87, + "grad_norm": 4.582326438191975, + "learning_rate": 3.72103030738061e-06, + "loss": 0.7474, + "step": 33230 + }, + { + "epoch": 1.87, + "grad_norm": 5.116080293871856, + "learning_rate": 3.719445891870393e-06, + "loss": 0.7642, + "step": 33235 + }, + { + "epoch": 1.87, + "grad_norm": 7.805096225135345, + "learning_rate": 3.7178616139613676e-06, + "loss": 0.753, + "step": 33240 + }, + { + "epoch": 1.87, + "grad_norm": 5.1202490150168085, + "learning_rate": 3.716277473823768e-06, + "loss": 0.7356, + "step": 33245 + }, + { + "epoch": 1.87, + "grad_norm": 8.983644991572485, + "learning_rate": 3.714693471627823e-06, + "loss": 0.7536, + "step": 33250 + }, + { + "epoch": 1.87, + "grad_norm": 9.85707263913975, + "learning_rate": 3.713109607543733e-06, + "loss": 0.7287, + "step": 33255 + }, + { + "epoch": 1.87, + "grad_norm": 5.3630504564674, + "learning_rate": 3.711525881741698e-06, + "loss": 0.7633, + "step": 33260 + }, + { + "epoch": 1.87, + "grad_norm": 15.331488525980054, + "learning_rate": 3.709942294391892e-06, + "loss": 0.7158, + "step": 33265 + }, + { + "epoch": 1.87, + "grad_norm": 5.552899225732661, + "learning_rate": 3.708358845664481e-06, + "loss": 0.7345, + "step": 33270 + }, + { + "epoch": 1.87, + "grad_norm": 4.969317872162896, + "learning_rate": 3.706775535729612e-06, + "loss": 0.727, + "step": 33275 + }, + { + "epoch": 1.88, + "grad_norm": 6.639046499207537, + "learning_rate": 3.7051923647574172e-06, + "loss": 0.7539, + "step": 33280 + }, + { + "epoch": 1.88, + "grad_norm": 5.504805829946549, + "learning_rate": 3.7036093329180206e-06, + "loss": 0.7676, + "step": 33285 + }, + { + "epoch": 1.88, + "grad_norm": 17.476568490832356, + "learning_rate": 3.70202644038152e-06, + "loss": 0.7336, + "step": 33290 + }, + { + "epoch": 1.88, + "grad_norm": 10.832739640367027, + "learning_rate": 3.7004436873180083e-06, + "loss": 0.7267, + "step": 33295 + }, + { + "epoch": 1.88, + "grad_norm": 4.767689201380548, + "learning_rate": 3.6988610738975585e-06, + "loss": 0.71, + "step": 33300 + }, + { + "epoch": 1.88, + "grad_norm": 7.826464465345845, + "learning_rate": 3.697278600290226e-06, + "loss": 0.7865, + "step": 33305 + }, + { + "epoch": 1.88, + "grad_norm": 6.330790330324903, + "learning_rate": 3.695696266666059e-06, + "loss": 0.7579, + "step": 33310 + }, + { + "epoch": 1.88, + "grad_norm": 5.028007907363844, + "learning_rate": 3.694114073195084e-06, + "loss": 0.658, + "step": 33315 + }, + { + "epoch": 1.88, + "grad_norm": 8.069186402198259, + "learning_rate": 3.692532020047316e-06, + "loss": 0.684, + "step": 33320 + }, + { + "epoch": 1.88, + "grad_norm": 8.097981862953665, + "learning_rate": 3.690950107392752e-06, + "loss": 0.7477, + "step": 33325 + }, + { + "epoch": 1.88, + "grad_norm": 4.876920740713685, + "learning_rate": 3.6893683354013752e-06, + "loss": 0.6985, + "step": 33330 + }, + { + "epoch": 1.88, + "grad_norm": 5.8937221390226355, + "learning_rate": 3.6877867042431575e-06, + "loss": 0.7423, + "step": 33335 + }, + { + "epoch": 1.88, + "grad_norm": 6.050457359324089, + "learning_rate": 3.6862052140880474e-06, + "loss": 0.7166, + "step": 33340 + }, + { + "epoch": 1.88, + "grad_norm": 8.19579683854763, + "learning_rate": 3.6846238651059874e-06, + "loss": 0.7302, + "step": 33345 + }, + { + "epoch": 1.88, + "grad_norm": 5.521345566681065, + "learning_rate": 3.6830426574668953e-06, + "loss": 0.7746, + "step": 33350 + }, + { + "epoch": 1.88, + "grad_norm": 5.151783416296471, + "learning_rate": 3.6814615913406846e-06, + "loss": 0.7431, + "step": 33355 + }, + { + "epoch": 1.88, + "grad_norm": 5.38410660950617, + "learning_rate": 3.6798806668972443e-06, + "loss": 0.7377, + "step": 33360 + }, + { + "epoch": 1.88, + "grad_norm": 4.897383699146823, + "learning_rate": 3.678299884306451e-06, + "loss": 0.6312, + "step": 33365 + }, + { + "epoch": 1.88, + "grad_norm": 5.248718368567392, + "learning_rate": 3.6767192437381705e-06, + "loss": 0.7071, + "step": 33370 + }, + { + "epoch": 1.88, + "grad_norm": 9.236465170126863, + "learning_rate": 3.675138745362245e-06, + "loss": 0.7521, + "step": 33375 + }, + { + "epoch": 1.88, + "grad_norm": 5.390162974629451, + "learning_rate": 3.6735583893485104e-06, + "loss": 0.7272, + "step": 33380 + }, + { + "epoch": 1.88, + "grad_norm": 5.016337995684026, + "learning_rate": 3.671978175866782e-06, + "loss": 0.7513, + "step": 33385 + }, + { + "epoch": 1.88, + "grad_norm": 5.835615259113764, + "learning_rate": 3.6703981050868563e-06, + "loss": 0.6902, + "step": 33390 + }, + { + "epoch": 1.88, + "grad_norm": 10.795694439318842, + "learning_rate": 3.6688181771785246e-06, + "loss": 0.7399, + "step": 33395 + }, + { + "epoch": 1.88, + "grad_norm": 6.1015435728597565, + "learning_rate": 3.667238392311553e-06, + "loss": 0.7502, + "step": 33400 + }, + { + "epoch": 1.88, + "grad_norm": 7.619985644047551, + "learning_rate": 3.6656587506556984e-06, + "loss": 0.7173, + "step": 33405 + }, + { + "epoch": 1.88, + "grad_norm": 8.303040301753041, + "learning_rate": 3.664079252380701e-06, + "loss": 0.7417, + "step": 33410 + }, + { + "epoch": 1.88, + "grad_norm": 7.9985413869010635, + "learning_rate": 3.6624998976562824e-06, + "loss": 0.7207, + "step": 33415 + }, + { + "epoch": 1.88, + "grad_norm": 4.825821084031015, + "learning_rate": 3.6609206866521546e-06, + "loss": 0.7142, + "step": 33420 + }, + { + "epoch": 1.88, + "grad_norm": 6.997454816895571, + "learning_rate": 3.659341619538006e-06, + "loss": 0.6869, + "step": 33425 + }, + { + "epoch": 1.88, + "grad_norm": 5.52407682161989, + "learning_rate": 3.65776269648352e-06, + "loss": 0.736, + "step": 33430 + }, + { + "epoch": 1.88, + "grad_norm": 6.977158052908027, + "learning_rate": 3.6561839176583536e-06, + "loss": 0.7837, + "step": 33435 + }, + { + "epoch": 1.88, + "grad_norm": 5.333562391025884, + "learning_rate": 3.654605283232159e-06, + "loss": 0.7257, + "step": 33440 + }, + { + "epoch": 1.88, + "grad_norm": 6.123224172007378, + "learning_rate": 3.6530267933745623e-06, + "loss": 0.7105, + "step": 33445 + }, + { + "epoch": 1.88, + "grad_norm": 21.807940846621342, + "learning_rate": 3.6514484482551817e-06, + "loss": 0.7555, + "step": 33450 + }, + { + "epoch": 1.89, + "grad_norm": 5.509095728610213, + "learning_rate": 3.6498702480436167e-06, + "loss": 0.7124, + "step": 33455 + }, + { + "epoch": 1.89, + "grad_norm": 6.424522271011536, + "learning_rate": 3.6482921929094507e-06, + "loss": 0.7087, + "step": 33460 + }, + { + "epoch": 1.89, + "grad_norm": 13.596304477963848, + "learning_rate": 3.6467142830222566e-06, + "loss": 0.759, + "step": 33465 + }, + { + "epoch": 1.89, + "grad_norm": 5.76604585438323, + "learning_rate": 3.645136518551585e-06, + "loss": 0.7096, + "step": 33470 + }, + { + "epoch": 1.89, + "grad_norm": 4.844500528948266, + "learning_rate": 3.6435588996669726e-06, + "loss": 0.7187, + "step": 33475 + }, + { + "epoch": 1.89, + "grad_norm": 8.83270439365789, + "learning_rate": 3.641981426537945e-06, + "loss": 0.6937, + "step": 33480 + }, + { + "epoch": 1.89, + "grad_norm": 6.993599292802973, + "learning_rate": 3.6404040993340038e-06, + "loss": 0.7443, + "step": 33485 + }, + { + "epoch": 1.89, + "grad_norm": 11.47810486396018, + "learning_rate": 3.6388269182246444e-06, + "loss": 0.7402, + "step": 33490 + }, + { + "epoch": 1.89, + "grad_norm": 5.301075576795128, + "learning_rate": 3.637249883379339e-06, + "loss": 0.6943, + "step": 33495 + }, + { + "epoch": 1.89, + "grad_norm": 5.864076173611569, + "learning_rate": 3.6356729949675474e-06, + "loss": 0.7046, + "step": 33500 + }, + { + "epoch": 1.89, + "grad_norm": 5.756086870112118, + "learning_rate": 3.6340962531587153e-06, + "loss": 0.7301, + "step": 33505 + }, + { + "epoch": 1.89, + "grad_norm": 5.617545638067697, + "learning_rate": 3.632519658122268e-06, + "loss": 0.7547, + "step": 33510 + }, + { + "epoch": 1.89, + "grad_norm": 5.277245488928964, + "learning_rate": 3.6309432100276193e-06, + "loss": 0.7378, + "step": 33515 + }, + { + "epoch": 1.89, + "grad_norm": 5.168528630142161, + "learning_rate": 3.629366909044163e-06, + "loss": 0.7512, + "step": 33520 + }, + { + "epoch": 1.89, + "grad_norm": 5.352948888128745, + "learning_rate": 3.627790755341284e-06, + "loss": 0.7119, + "step": 33525 + }, + { + "epoch": 1.89, + "grad_norm": 7.735412290404789, + "learning_rate": 3.6262147490883436e-06, + "loss": 0.7822, + "step": 33530 + }, + { + "epoch": 1.89, + "grad_norm": 4.92245499413314, + "learning_rate": 3.6246388904546905e-06, + "loss": 0.6771, + "step": 33535 + }, + { + "epoch": 1.89, + "grad_norm": 5.61480618008788, + "learning_rate": 3.6230631796096584e-06, + "loss": 0.7735, + "step": 33540 + }, + { + "epoch": 1.89, + "grad_norm": 5.943676178400943, + "learning_rate": 3.6214876167225648e-06, + "loss": 0.6994, + "step": 33545 + }, + { + "epoch": 1.89, + "grad_norm": 5.898413698136308, + "learning_rate": 3.619912201962711e-06, + "loss": 0.7006, + "step": 33550 + }, + { + "epoch": 1.89, + "grad_norm": 5.480076412106544, + "learning_rate": 3.6183369354993827e-06, + "loss": 0.7271, + "step": 33555 + }, + { + "epoch": 1.89, + "grad_norm": 9.202830880100374, + "learning_rate": 3.616761817501846e-06, + "loss": 0.7322, + "step": 33560 + }, + { + "epoch": 1.89, + "grad_norm": 5.678698441288536, + "learning_rate": 3.6151868481393592e-06, + "loss": 0.7178, + "step": 33565 + }, + { + "epoch": 1.89, + "grad_norm": 4.551095927024385, + "learning_rate": 3.6136120275811554e-06, + "loss": 0.6984, + "step": 33570 + }, + { + "epoch": 1.89, + "grad_norm": 5.91493372463631, + "learning_rate": 3.6120373559964593e-06, + "loss": 0.6605, + "step": 33575 + }, + { + "epoch": 1.89, + "grad_norm": 7.399261163381923, + "learning_rate": 3.6104628335544743e-06, + "loss": 0.7247, + "step": 33580 + }, + { + "epoch": 1.89, + "grad_norm": 5.399695530011383, + "learning_rate": 3.60888846042439e-06, + "loss": 0.7189, + "step": 33585 + }, + { + "epoch": 1.89, + "grad_norm": 7.4772390707745995, + "learning_rate": 3.6073142367753805e-06, + "loss": 0.701, + "step": 33590 + }, + { + "epoch": 1.89, + "grad_norm": 7.842214399711519, + "learning_rate": 3.6057401627766007e-06, + "loss": 0.7019, + "step": 33595 + }, + { + "epoch": 1.89, + "grad_norm": 5.611602914394154, + "learning_rate": 3.604166238597197e-06, + "loss": 0.6859, + "step": 33600 + }, + { + "epoch": 1.89, + "grad_norm": 6.802945363645624, + "learning_rate": 3.602592464406288e-06, + "loss": 0.7219, + "step": 33605 + }, + { + "epoch": 1.89, + "grad_norm": 4.753396496273493, + "learning_rate": 3.6010188403729893e-06, + "loss": 0.6851, + "step": 33610 + }, + { + "epoch": 1.89, + "grad_norm": 5.238619830630508, + "learning_rate": 3.59944536666639e-06, + "loss": 0.7184, + "step": 33615 + }, + { + "epoch": 1.89, + "grad_norm": 7.131524168937709, + "learning_rate": 3.5978720434555646e-06, + "loss": 0.7288, + "step": 33620 + }, + { + "epoch": 1.89, + "grad_norm": 5.224120621845438, + "learning_rate": 3.596298870909578e-06, + "loss": 0.7214, + "step": 33625 + }, + { + "epoch": 1.89, + "grad_norm": 8.805090207311748, + "learning_rate": 3.594725849197471e-06, + "loss": 0.7196, + "step": 33630 + }, + { + "epoch": 1.9, + "grad_norm": 6.394838594112325, + "learning_rate": 3.5931529784882748e-06, + "loss": 0.7105, + "step": 33635 + }, + { + "epoch": 1.9, + "grad_norm": 5.980071892581502, + "learning_rate": 3.5915802589510005e-06, + "loss": 0.7879, + "step": 33640 + }, + { + "epoch": 1.9, + "grad_norm": 6.32604819181313, + "learning_rate": 3.590007690754642e-06, + "loss": 0.7558, + "step": 33645 + }, + { + "epoch": 1.9, + "grad_norm": 14.763935024362741, + "learning_rate": 3.5884352740681816e-06, + "loss": 0.7389, + "step": 33650 + }, + { + "epoch": 1.9, + "grad_norm": 8.46121061875503, + "learning_rate": 3.5868630090605784e-06, + "loss": 0.7454, + "step": 33655 + }, + { + "epoch": 1.9, + "grad_norm": 5.305726612329147, + "learning_rate": 3.585290895900784e-06, + "loss": 0.6992, + "step": 33660 + }, + { + "epoch": 1.9, + "grad_norm": 7.174971837692179, + "learning_rate": 3.5837189347577234e-06, + "loss": 0.7343, + "step": 33665 + }, + { + "epoch": 1.9, + "grad_norm": 13.650068931154648, + "learning_rate": 3.582147125800316e-06, + "loss": 0.715, + "step": 33670 + }, + { + "epoch": 1.9, + "grad_norm": 4.887708001835577, + "learning_rate": 3.580575469197457e-06, + "loss": 0.7363, + "step": 33675 + }, + { + "epoch": 1.9, + "grad_norm": 4.662671837795487, + "learning_rate": 3.579003965118028e-06, + "loss": 0.7382, + "step": 33680 + }, + { + "epoch": 1.9, + "grad_norm": 7.762266468819768, + "learning_rate": 3.5774326137308944e-06, + "loss": 0.7154, + "step": 33685 + }, + { + "epoch": 1.9, + "grad_norm": 15.601980321054873, + "learning_rate": 3.575861415204904e-06, + "loss": 0.7213, + "step": 33690 + }, + { + "epoch": 1.9, + "grad_norm": 5.108175915047928, + "learning_rate": 3.574290369708891e-06, + "loss": 0.6971, + "step": 33695 + }, + { + "epoch": 1.9, + "grad_norm": 5.761560473533494, + "learning_rate": 3.5727194774116702e-06, + "loss": 0.7546, + "step": 33700 + }, + { + "epoch": 1.9, + "grad_norm": 30.499099568455062, + "learning_rate": 3.571148738482039e-06, + "loss": 0.7239, + "step": 33705 + }, + { + "epoch": 1.9, + "grad_norm": 8.189839583208315, + "learning_rate": 3.5695781530887834e-06, + "loss": 0.6914, + "step": 33710 + }, + { + "epoch": 1.9, + "grad_norm": 6.638723198655214, + "learning_rate": 3.5680077214006664e-06, + "loss": 0.7422, + "step": 33715 + }, + { + "epoch": 1.9, + "grad_norm": 4.994479886348189, + "learning_rate": 3.5664374435864414e-06, + "loss": 0.7155, + "step": 33720 + }, + { + "epoch": 1.9, + "grad_norm": 7.134008146783192, + "learning_rate": 3.5648673198148394e-06, + "loss": 0.7359, + "step": 33725 + }, + { + "epoch": 1.9, + "grad_norm": 6.486958398123869, + "learning_rate": 3.563297350254576e-06, + "loss": 0.7086, + "step": 33730 + }, + { + "epoch": 1.9, + "grad_norm": 8.670895683960037, + "learning_rate": 3.5617275350743553e-06, + "loss": 0.7719, + "step": 33735 + }, + { + "epoch": 1.9, + "grad_norm": 6.827079092448349, + "learning_rate": 3.5601578744428565e-06, + "loss": 0.6919, + "step": 33740 + }, + { + "epoch": 1.9, + "grad_norm": 5.4745233786034015, + "learning_rate": 3.5585883685287504e-06, + "loss": 0.72, + "step": 33745 + }, + { + "epoch": 1.9, + "grad_norm": 4.621025207694066, + "learning_rate": 3.5570190175006836e-06, + "loss": 0.6989, + "step": 33750 + }, + { + "epoch": 1.9, + "grad_norm": 5.806551040034962, + "learning_rate": 3.5554498215272938e-06, + "loss": 0.7461, + "step": 33755 + }, + { + "epoch": 1.9, + "grad_norm": 5.199203112108226, + "learning_rate": 3.5538807807771956e-06, + "loss": 0.663, + "step": 33760 + }, + { + "epoch": 1.9, + "grad_norm": 4.8222320361041895, + "learning_rate": 3.552311895418988e-06, + "loss": 0.6926, + "step": 33765 + }, + { + "epoch": 1.9, + "grad_norm": 12.32950681888661, + "learning_rate": 3.5507431656212575e-06, + "loss": 0.7439, + "step": 33770 + }, + { + "epoch": 1.9, + "grad_norm": 10.892683554255738, + "learning_rate": 3.54917459155257e-06, + "loss": 0.7089, + "step": 33775 + }, + { + "epoch": 1.9, + "grad_norm": 6.152434360621356, + "learning_rate": 3.547606173381476e-06, + "loss": 0.7351, + "step": 33780 + }, + { + "epoch": 1.9, + "grad_norm": 11.34090279185434, + "learning_rate": 3.546037911276509e-06, + "loss": 0.7556, + "step": 33785 + }, + { + "epoch": 1.9, + "grad_norm": 8.751353575444856, + "learning_rate": 3.5444698054061834e-06, + "loss": 0.7285, + "step": 33790 + }, + { + "epoch": 1.9, + "grad_norm": 6.224222411745411, + "learning_rate": 3.5429018559390026e-06, + "loss": 0.6832, + "step": 33795 + }, + { + "epoch": 1.9, + "grad_norm": 5.734189281023405, + "learning_rate": 3.5413340630434463e-06, + "loss": 0.6813, + "step": 33800 + }, + { + "epoch": 1.9, + "grad_norm": 6.424637752154393, + "learning_rate": 3.5397664268879844e-06, + "loss": 0.6999, + "step": 33805 + }, + { + "epoch": 1.91, + "grad_norm": 5.183836385427888, + "learning_rate": 3.5381989476410637e-06, + "loss": 0.7281, + "step": 33810 + }, + { + "epoch": 1.91, + "grad_norm": 5.462419754167492, + "learning_rate": 3.5366316254711176e-06, + "loss": 0.7442, + "step": 33815 + }, + { + "epoch": 1.91, + "grad_norm": 11.56039018391325, + "learning_rate": 3.5350644605465612e-06, + "loss": 0.6912, + "step": 33820 + }, + { + "epoch": 1.91, + "grad_norm": 5.727771553841209, + "learning_rate": 3.5334974530357928e-06, + "loss": 0.6866, + "step": 33825 + }, + { + "epoch": 1.91, + "grad_norm": 11.508507776083809, + "learning_rate": 3.5319306031071977e-06, + "loss": 0.7128, + "step": 33830 + }, + { + "epoch": 1.91, + "grad_norm": 9.069321571392466, + "learning_rate": 3.530363910929136e-06, + "loss": 0.7264, + "step": 33835 + }, + { + "epoch": 1.91, + "grad_norm": 7.145850859528512, + "learning_rate": 3.52879737666996e-06, + "loss": 0.7524, + "step": 33840 + }, + { + "epoch": 1.91, + "grad_norm": 5.3433397719816345, + "learning_rate": 3.527231000497999e-06, + "loss": 0.7089, + "step": 33845 + }, + { + "epoch": 1.91, + "grad_norm": 6.354804226800192, + "learning_rate": 3.5256647825815647e-06, + "loss": 0.7212, + "step": 33850 + }, + { + "epoch": 1.91, + "grad_norm": 4.674002308860486, + "learning_rate": 3.5240987230889588e-06, + "loss": 0.6799, + "step": 33855 + }, + { + "epoch": 1.91, + "grad_norm": 4.746560071597657, + "learning_rate": 3.522532822188456e-06, + "loss": 0.6992, + "step": 33860 + }, + { + "epoch": 1.91, + "grad_norm": 5.797984805419982, + "learning_rate": 3.520967080048323e-06, + "loss": 0.6963, + "step": 33865 + }, + { + "epoch": 1.91, + "grad_norm": 9.040668604292417, + "learning_rate": 3.5194014968368064e-06, + "loss": 0.7128, + "step": 33870 + }, + { + "epoch": 1.91, + "grad_norm": 6.759423601666029, + "learning_rate": 3.517836072722131e-06, + "loss": 0.7009, + "step": 33875 + }, + { + "epoch": 1.91, + "grad_norm": 4.9749233946751374, + "learning_rate": 3.5162708078725127e-06, + "loss": 0.7393, + "step": 33880 + }, + { + "epoch": 1.91, + "grad_norm": 5.402478597008374, + "learning_rate": 3.5147057024561426e-06, + "loss": 0.7264, + "step": 33885 + }, + { + "epoch": 1.91, + "grad_norm": 5.206926525704742, + "learning_rate": 3.5131407566412017e-06, + "loss": 0.7052, + "step": 33890 + }, + { + "epoch": 1.91, + "grad_norm": 4.9274820775913835, + "learning_rate": 3.5115759705958486e-06, + "loss": 0.7663, + "step": 33895 + }, + { + "epoch": 1.91, + "grad_norm": 6.039420048469241, + "learning_rate": 3.5100113444882246e-06, + "loss": 0.7327, + "step": 33900 + }, + { + "epoch": 1.91, + "grad_norm": 7.3161812038229606, + "learning_rate": 3.508446878486458e-06, + "loss": 0.6992, + "step": 33905 + }, + { + "epoch": 1.91, + "grad_norm": 4.615202691945004, + "learning_rate": 3.5068825727586575e-06, + "loss": 0.6852, + "step": 33910 + }, + { + "epoch": 1.91, + "grad_norm": 5.301251088519508, + "learning_rate": 3.5053184274729135e-06, + "loss": 0.7051, + "step": 33915 + }, + { + "epoch": 1.91, + "grad_norm": 5.2922900641527155, + "learning_rate": 3.5037544427973e-06, + "loss": 0.7436, + "step": 33920 + }, + { + "epoch": 1.91, + "grad_norm": 4.819673966870845, + "learning_rate": 3.502190618899878e-06, + "loss": 0.7124, + "step": 33925 + }, + { + "epoch": 1.91, + "grad_norm": 9.339869741268306, + "learning_rate": 3.5006269559486837e-06, + "loss": 0.7239, + "step": 33930 + }, + { + "epoch": 1.91, + "grad_norm": 5.903721571102292, + "learning_rate": 3.4990634541117383e-06, + "loss": 0.719, + "step": 33935 + }, + { + "epoch": 1.91, + "grad_norm": 5.3735729078079535, + "learning_rate": 3.4975001135570506e-06, + "loss": 0.7214, + "step": 33940 + }, + { + "epoch": 1.91, + "grad_norm": 6.223521645337661, + "learning_rate": 3.495936934452605e-06, + "loss": 0.6887, + "step": 33945 + }, + { + "epoch": 1.91, + "grad_norm": 5.04464718672468, + "learning_rate": 3.494373916966376e-06, + "loss": 0.6751, + "step": 33950 + }, + { + "epoch": 1.91, + "grad_norm": 5.491194395049443, + "learning_rate": 3.4928110612663135e-06, + "loss": 0.7298, + "step": 33955 + }, + { + "epoch": 1.91, + "grad_norm": 4.885557763092124, + "learning_rate": 3.491248367520352e-06, + "loss": 0.6642, + "step": 33960 + }, + { + "epoch": 1.91, + "grad_norm": 5.193732538609462, + "learning_rate": 3.489685835896416e-06, + "loss": 0.6913, + "step": 33965 + }, + { + "epoch": 1.91, + "grad_norm": 5.772620079382848, + "learning_rate": 3.4881234665623984e-06, + "loss": 0.7063, + "step": 33970 + }, + { + "epoch": 1.91, + "grad_norm": 5.207586008164692, + "learning_rate": 3.4865612596861903e-06, + "loss": 0.7483, + "step": 33975 + }, + { + "epoch": 1.91, + "grad_norm": 7.241460421403689, + "learning_rate": 3.484999215435654e-06, + "loss": 0.7176, + "step": 33980 + }, + { + "epoch": 1.91, + "grad_norm": 6.130022812908069, + "learning_rate": 3.4834373339786354e-06, + "loss": 0.6604, + "step": 33985 + }, + { + "epoch": 1.92, + "grad_norm": 6.902386369940835, + "learning_rate": 3.481875615482971e-06, + "loss": 0.7335, + "step": 33990 + }, + { + "epoch": 1.92, + "grad_norm": 7.211343008779445, + "learning_rate": 3.4803140601164697e-06, + "loss": 0.7406, + "step": 33995 + }, + { + "epoch": 1.92, + "grad_norm": 4.992851876311853, + "learning_rate": 3.4787526680469297e-06, + "loss": 0.7392, + "step": 34000 + }, + { + "epoch": 1.92, + "grad_norm": 6.987220985759034, + "learning_rate": 3.4771914394421303e-06, + "loss": 0.716, + "step": 34005 + }, + { + "epoch": 1.92, + "grad_norm": 5.046684606330692, + "learning_rate": 3.4756303744698306e-06, + "loss": 0.7175, + "step": 34010 + }, + { + "epoch": 1.92, + "grad_norm": 4.7129756591946945, + "learning_rate": 3.4740694732977766e-06, + "loss": 0.6947, + "step": 34015 + }, + { + "epoch": 1.92, + "grad_norm": 6.1962218242675675, + "learning_rate": 3.472508736093689e-06, + "loss": 0.6953, + "step": 34020 + }, + { + "epoch": 1.92, + "grad_norm": 7.007780165649483, + "learning_rate": 3.4709481630252822e-06, + "loss": 0.6763, + "step": 34025 + }, + { + "epoch": 1.92, + "grad_norm": 5.4200840203855805, + "learning_rate": 3.469387754260241e-06, + "loss": 0.7273, + "step": 34030 + }, + { + "epoch": 1.92, + "grad_norm": 17.22927267066624, + "learning_rate": 3.467827509966243e-06, + "loss": 0.6916, + "step": 34035 + }, + { + "epoch": 1.92, + "grad_norm": 5.363394374762744, + "learning_rate": 3.4662674303109394e-06, + "loss": 0.7153, + "step": 34040 + }, + { + "epoch": 1.92, + "grad_norm": 5.122240000977692, + "learning_rate": 3.4647075154619695e-06, + "loss": 0.7313, + "step": 34045 + }, + { + "epoch": 1.92, + "grad_norm": 6.417056892609173, + "learning_rate": 3.4631477655869535e-06, + "loss": 0.6756, + "step": 34050 + }, + { + "epoch": 1.92, + "grad_norm": 4.9046395257729305, + "learning_rate": 3.461588180853491e-06, + "loss": 0.7144, + "step": 34055 + }, + { + "epoch": 1.92, + "grad_norm": 4.948375831133929, + "learning_rate": 3.4600287614291704e-06, + "loss": 0.7268, + "step": 34060 + }, + { + "epoch": 1.92, + "grad_norm": 10.81818192888927, + "learning_rate": 3.4584695074815556e-06, + "loss": 0.6954, + "step": 34065 + }, + { + "epoch": 1.92, + "grad_norm": 7.056126713134114, + "learning_rate": 3.4569104191781934e-06, + "loss": 0.6938, + "step": 34070 + }, + { + "epoch": 1.92, + "grad_norm": 6.97982971155647, + "learning_rate": 3.4553514966866187e-06, + "loss": 0.7434, + "step": 34075 + }, + { + "epoch": 1.92, + "grad_norm": 5.0804405657430225, + "learning_rate": 3.4537927401743408e-06, + "loss": 0.6989, + "step": 34080 + }, + { + "epoch": 1.92, + "grad_norm": 6.232370335315725, + "learning_rate": 3.4522341498088596e-06, + "loss": 0.7343, + "step": 34085 + }, + { + "epoch": 1.92, + "grad_norm": 4.6929324607892315, + "learning_rate": 3.4506757257576474e-06, + "loss": 0.6838, + "step": 34090 + }, + { + "epoch": 1.92, + "grad_norm": 13.847684941651293, + "learning_rate": 3.4491174681881667e-06, + "loss": 0.7998, + "step": 34095 + }, + { + "epoch": 1.92, + "grad_norm": 13.044830561057324, + "learning_rate": 3.4475593772678608e-06, + "loss": 0.6488, + "step": 34100 + }, + { + "epoch": 1.92, + "grad_norm": 10.711354549085556, + "learning_rate": 3.446001453164148e-06, + "loss": 0.706, + "step": 34105 + }, + { + "epoch": 1.92, + "grad_norm": 7.048989575698363, + "learning_rate": 3.4444436960444393e-06, + "loss": 0.6854, + "step": 34110 + }, + { + "epoch": 1.92, + "grad_norm": 4.257284662689208, + "learning_rate": 3.4428861060761198e-06, + "loss": 0.6976, + "step": 34115 + }, + { + "epoch": 1.92, + "grad_norm": 21.04206986966952, + "learning_rate": 3.4413286834265615e-06, + "loss": 0.7153, + "step": 34120 + }, + { + "epoch": 1.92, + "grad_norm": 10.270449177250299, + "learning_rate": 3.4397714282631157e-06, + "loss": 0.7192, + "step": 34125 + }, + { + "epoch": 1.92, + "grad_norm": 5.781081714048268, + "learning_rate": 3.4382143407531126e-06, + "loss": 0.7171, + "step": 34130 + }, + { + "epoch": 1.92, + "grad_norm": 7.091730267944795, + "learning_rate": 3.436657421063873e-06, + "loss": 0.7316, + "step": 34135 + }, + { + "epoch": 1.92, + "grad_norm": 7.933812731737008, + "learning_rate": 3.4351006693626933e-06, + "loss": 0.702, + "step": 34140 + }, + { + "epoch": 1.92, + "grad_norm": 4.372362762285348, + "learning_rate": 3.4335440858168523e-06, + "loss": 0.7398, + "step": 34145 + }, + { + "epoch": 1.92, + "grad_norm": 5.8455483710133995, + "learning_rate": 3.431987670593613e-06, + "loss": 0.7201, + "step": 34150 + }, + { + "epoch": 1.92, + "grad_norm": 6.572167621918964, + "learning_rate": 3.4304314238602195e-06, + "loss": 0.7274, + "step": 34155 + }, + { + "epoch": 1.92, + "grad_norm": 5.318767412363042, + "learning_rate": 3.4288753457838965e-06, + "loss": 0.7392, + "step": 34160 + }, + { + "epoch": 1.93, + "grad_norm": 5.545359597527433, + "learning_rate": 3.4273194365318494e-06, + "loss": 0.7101, + "step": 34165 + }, + { + "epoch": 1.93, + "grad_norm": 7.267444881515774, + "learning_rate": 3.425763696271273e-06, + "loss": 0.7128, + "step": 34170 + }, + { + "epoch": 1.93, + "grad_norm": 8.28827760130778, + "learning_rate": 3.4242081251693315e-06, + "loss": 0.6922, + "step": 34175 + }, + { + "epoch": 1.93, + "grad_norm": 7.857576972315481, + "learning_rate": 3.422652723393184e-06, + "loss": 0.7162, + "step": 34180 + }, + { + "epoch": 1.93, + "grad_norm": 8.239104105271066, + "learning_rate": 3.421097491109962e-06, + "loss": 0.7343, + "step": 34185 + }, + { + "epoch": 1.93, + "grad_norm": 6.458692609742278, + "learning_rate": 3.4195424284867816e-06, + "loss": 0.7026, + "step": 34190 + }, + { + "epoch": 1.93, + "grad_norm": 4.642379913819716, + "learning_rate": 3.4179875356907443e-06, + "loss": 0.7026, + "step": 34195 + }, + { + "epoch": 1.93, + "grad_norm": 5.044826048221263, + "learning_rate": 3.4164328128889264e-06, + "loss": 0.6843, + "step": 34200 + }, + { + "epoch": 1.93, + "grad_norm": 6.799607512515753, + "learning_rate": 3.4148782602483933e-06, + "loss": 0.6681, + "step": 34205 + }, + { + "epoch": 1.93, + "grad_norm": 4.921927178432561, + "learning_rate": 3.4133238779361867e-06, + "loss": 0.6966, + "step": 34210 + }, + { + "epoch": 1.93, + "grad_norm": 5.056524593418422, + "learning_rate": 3.41176966611933e-06, + "loss": 0.7182, + "step": 34215 + }, + { + "epoch": 1.93, + "grad_norm": 6.086715413497012, + "learning_rate": 3.4102156249648344e-06, + "loss": 0.6893, + "step": 34220 + }, + { + "epoch": 1.93, + "grad_norm": 9.818002941235232, + "learning_rate": 3.4086617546396844e-06, + "loss": 0.7129, + "step": 34225 + }, + { + "epoch": 1.93, + "grad_norm": 5.5628820242695225, + "learning_rate": 3.4071080553108527e-06, + "loss": 0.7204, + "step": 34230 + }, + { + "epoch": 1.93, + "grad_norm": 4.692911681535861, + "learning_rate": 3.4055545271452907e-06, + "loss": 0.7289, + "step": 34235 + }, + { + "epoch": 1.93, + "grad_norm": 5.05193984747011, + "learning_rate": 3.4040011703099308e-06, + "loss": 0.6832, + "step": 34240 + }, + { + "epoch": 1.93, + "grad_norm": 12.04610368711286, + "learning_rate": 3.40244798497169e-06, + "loss": 0.7035, + "step": 34245 + }, + { + "epoch": 1.93, + "grad_norm": 4.981004110026776, + "learning_rate": 3.400894971297462e-06, + "loss": 0.7044, + "step": 34250 + }, + { + "epoch": 1.93, + "grad_norm": 5.02871742649193, + "learning_rate": 3.399342129454129e-06, + "loss": 0.673, + "step": 34255 + }, + { + "epoch": 1.93, + "grad_norm": 5.0891710187106565, + "learning_rate": 3.397789459608546e-06, + "loss": 0.6471, + "step": 34260 + }, + { + "epoch": 1.93, + "grad_norm": 10.509849143862967, + "learning_rate": 3.3962369619275594e-06, + "loss": 0.7028, + "step": 34265 + }, + { + "epoch": 1.93, + "grad_norm": 5.21109221125414, + "learning_rate": 3.3946846365779874e-06, + "loss": 0.7047, + "step": 34270 + }, + { + "epoch": 1.93, + "grad_norm": 7.069199609629398, + "learning_rate": 3.3931324837266356e-06, + "loss": 0.6802, + "step": 34275 + }, + { + "epoch": 1.93, + "grad_norm": 4.995862586321246, + "learning_rate": 3.391580503540291e-06, + "loss": 0.6839, + "step": 34280 + }, + { + "epoch": 1.93, + "grad_norm": 5.225962015890741, + "learning_rate": 3.390028696185718e-06, + "loss": 0.7228, + "step": 34285 + }, + { + "epoch": 1.93, + "grad_norm": 4.7826968736433955, + "learning_rate": 3.3884770618296696e-06, + "loss": 0.6992, + "step": 34290 + }, + { + "epoch": 1.93, + "grad_norm": 7.215579942389127, + "learning_rate": 3.3869256006388727e-06, + "loss": 0.6615, + "step": 34295 + }, + { + "epoch": 1.93, + "grad_norm": 5.8171014225910485, + "learning_rate": 3.3853743127800375e-06, + "loss": 0.6712, + "step": 34300 + }, + { + "epoch": 1.93, + "grad_norm": 5.154691343420838, + "learning_rate": 3.38382319841986e-06, + "loss": 0.712, + "step": 34305 + }, + { + "epoch": 1.93, + "grad_norm": 7.728149003069718, + "learning_rate": 3.382272257725011e-06, + "loss": 0.6618, + "step": 34310 + }, + { + "epoch": 1.93, + "grad_norm": 6.180089532624329, + "learning_rate": 3.380721490862149e-06, + "loss": 0.7202, + "step": 34315 + }, + { + "epoch": 1.93, + "grad_norm": 5.852505187684544, + "learning_rate": 3.3791708979979073e-06, + "loss": 0.7027, + "step": 34320 + }, + { + "epoch": 1.93, + "grad_norm": 5.424161113556859, + "learning_rate": 3.3776204792989064e-06, + "loss": 0.7438, + "step": 34325 + }, + { + "epoch": 1.93, + "grad_norm": 6.969015351711339, + "learning_rate": 3.3760702349317464e-06, + "loss": 0.6872, + "step": 34330 + }, + { + "epoch": 1.93, + "grad_norm": 15.092021792098919, + "learning_rate": 3.3745201650630055e-06, + "loss": 0.7189, + "step": 34335 + }, + { + "epoch": 1.93, + "grad_norm": 4.83639774476023, + "learning_rate": 3.3729702698592478e-06, + "loss": 0.7595, + "step": 34340 + }, + { + "epoch": 1.94, + "grad_norm": 9.044030685192272, + "learning_rate": 3.3714205494870134e-06, + "loss": 0.7027, + "step": 34345 + }, + { + "epoch": 1.94, + "grad_norm": 5.060222704112878, + "learning_rate": 3.3698710041128303e-06, + "loss": 0.6926, + "step": 34350 + }, + { + "epoch": 1.94, + "grad_norm": 5.331748524456981, + "learning_rate": 3.3683216339032023e-06, + "loss": 0.6764, + "step": 34355 + }, + { + "epoch": 1.94, + "grad_norm": 7.327864521187992, + "learning_rate": 3.3667724390246137e-06, + "loss": 0.7165, + "step": 34360 + }, + { + "epoch": 1.94, + "grad_norm": 9.99815157543103, + "learning_rate": 3.3652234196435357e-06, + "loss": 0.7129, + "step": 34365 + }, + { + "epoch": 1.94, + "grad_norm": 6.466694965390953, + "learning_rate": 3.3636745759264155e-06, + "loss": 0.7277, + "step": 34370 + }, + { + "epoch": 1.94, + "grad_norm": 4.379631250627289, + "learning_rate": 3.3621259080396847e-06, + "loss": 0.7197, + "step": 34375 + }, + { + "epoch": 1.94, + "grad_norm": 5.017399773871821, + "learning_rate": 3.360577416149754e-06, + "loss": 0.7095, + "step": 34380 + }, + { + "epoch": 1.94, + "grad_norm": 11.37714353547322, + "learning_rate": 3.3590291004230134e-06, + "loss": 0.7209, + "step": 34385 + }, + { + "epoch": 1.94, + "grad_norm": 5.21106889210857, + "learning_rate": 3.3574809610258403e-06, + "loss": 0.7217, + "step": 34390 + }, + { + "epoch": 1.94, + "grad_norm": 5.020769066940849, + "learning_rate": 3.3559329981245845e-06, + "loss": 0.6942, + "step": 34395 + }, + { + "epoch": 1.94, + "grad_norm": 10.535013418423105, + "learning_rate": 3.3543852118855863e-06, + "loss": 0.7216, + "step": 34400 + }, + { + "epoch": 1.94, + "grad_norm": 11.833457039317395, + "learning_rate": 3.3528376024751572e-06, + "loss": 0.6834, + "step": 34405 + }, + { + "epoch": 1.94, + "grad_norm": 16.466722213019253, + "learning_rate": 3.3512901700596e-06, + "loss": 0.687, + "step": 34410 + }, + { + "epoch": 1.94, + "grad_norm": 4.699603331606502, + "learning_rate": 3.3497429148051886e-06, + "loss": 0.7176, + "step": 34415 + }, + { + "epoch": 1.94, + "grad_norm": 14.078155850945397, + "learning_rate": 3.348195836878183e-06, + "loss": 0.7339, + "step": 34420 + }, + { + "epoch": 1.94, + "grad_norm": 8.371680160155005, + "learning_rate": 3.3466489364448273e-06, + "loss": 0.7319, + "step": 34425 + }, + { + "epoch": 1.94, + "grad_norm": 6.967862303147182, + "learning_rate": 3.3451022136713387e-06, + "loss": 0.7241, + "step": 34430 + }, + { + "epoch": 1.94, + "grad_norm": 10.160505088786074, + "learning_rate": 3.343555668723922e-06, + "loss": 0.7006, + "step": 34435 + }, + { + "epoch": 1.94, + "grad_norm": 6.370597144765361, + "learning_rate": 3.342009301768759e-06, + "loss": 0.6614, + "step": 34440 + }, + { + "epoch": 1.94, + "grad_norm": 4.908732261224134, + "learning_rate": 3.3404631129720123e-06, + "loss": 0.728, + "step": 34445 + }, + { + "epoch": 1.94, + "grad_norm": 8.464322679582324, + "learning_rate": 3.3389171024998302e-06, + "loss": 0.7296, + "step": 34450 + }, + { + "epoch": 1.94, + "grad_norm": 8.446735218347529, + "learning_rate": 3.3373712705183335e-06, + "loss": 0.719, + "step": 34455 + }, + { + "epoch": 1.94, + "grad_norm": 12.430665155686357, + "learning_rate": 3.3358256171936334e-06, + "loss": 0.7079, + "step": 34460 + }, + { + "epoch": 1.94, + "grad_norm": 8.07094148537387, + "learning_rate": 3.334280142691816e-06, + "loss": 0.7402, + "step": 34465 + }, + { + "epoch": 1.94, + "grad_norm": 4.958396301774057, + "learning_rate": 3.3327348471789465e-06, + "loss": 0.669, + "step": 34470 + }, + { + "epoch": 1.94, + "grad_norm": 7.3002737488699205, + "learning_rate": 3.3311897308210784e-06, + "loss": 0.6967, + "step": 34475 + }, + { + "epoch": 1.94, + "grad_norm": 6.901253329894225, + "learning_rate": 3.3296447937842365e-06, + "loss": 0.7611, + "step": 34480 + }, + { + "epoch": 1.94, + "grad_norm": 5.409048107446193, + "learning_rate": 3.328100036234435e-06, + "loss": 0.6883, + "step": 34485 + }, + { + "epoch": 1.94, + "grad_norm": 5.825450357177326, + "learning_rate": 3.3265554583376613e-06, + "loss": 0.7539, + "step": 34490 + }, + { + "epoch": 1.94, + "grad_norm": 8.540915432854963, + "learning_rate": 3.3250110602598915e-06, + "loss": 0.7062, + "step": 34495 + }, + { + "epoch": 1.94, + "grad_norm": 4.981442890497445, + "learning_rate": 3.3234668421670745e-06, + "loss": 0.6951, + "step": 34500 + }, + { + "epoch": 1.94, + "grad_norm": 4.436484307213503, + "learning_rate": 3.3219228042251444e-06, + "loss": 0.6662, + "step": 34505 + }, + { + "epoch": 1.94, + "grad_norm": 4.757233251489815, + "learning_rate": 3.3203789466000145e-06, + "loss": 0.6971, + "step": 34510 + }, + { + "epoch": 1.94, + "grad_norm": 5.516152299769269, + "learning_rate": 3.318835269457579e-06, + "loss": 0.701, + "step": 34515 + }, + { + "epoch": 1.95, + "grad_norm": 5.311327387586023, + "learning_rate": 3.3172917729637153e-06, + "loss": 0.6577, + "step": 34520 + }, + { + "epoch": 1.95, + "grad_norm": 5.142789565685974, + "learning_rate": 3.3157484572842767e-06, + "loss": 0.6826, + "step": 34525 + }, + { + "epoch": 1.95, + "grad_norm": 16.767767972624377, + "learning_rate": 3.3142053225850977e-06, + "loss": 0.6983, + "step": 34530 + }, + { + "epoch": 1.95, + "grad_norm": 8.93963829978935, + "learning_rate": 3.312662369031999e-06, + "loss": 0.7298, + "step": 34535 + }, + { + "epoch": 1.95, + "grad_norm": 5.955742890723362, + "learning_rate": 3.311119596790773e-06, + "loss": 0.7179, + "step": 34540 + }, + { + "epoch": 1.95, + "grad_norm": 10.4240534447418, + "learning_rate": 3.3095770060272027e-06, + "loss": 0.6544, + "step": 34545 + }, + { + "epoch": 1.95, + "grad_norm": 11.177569405537723, + "learning_rate": 3.3080345969070406e-06, + "loss": 0.7166, + "step": 34550 + }, + { + "epoch": 1.95, + "grad_norm": 10.40433047937323, + "learning_rate": 3.3064923695960293e-06, + "loss": 0.6646, + "step": 34555 + }, + { + "epoch": 1.95, + "grad_norm": 11.958549895696814, + "learning_rate": 3.304950324259889e-06, + "loss": 0.6739, + "step": 34560 + }, + { + "epoch": 1.95, + "grad_norm": 8.718905234860555, + "learning_rate": 3.303408461064314e-06, + "loss": 0.6344, + "step": 34565 + }, + { + "epoch": 1.95, + "grad_norm": 5.886058161844482, + "learning_rate": 3.3018667801749905e-06, + "loss": 0.7017, + "step": 34570 + }, + { + "epoch": 1.95, + "grad_norm": 7.861357601286246, + "learning_rate": 3.300325281757573e-06, + "loss": 0.7016, + "step": 34575 + }, + { + "epoch": 1.95, + "grad_norm": 4.902415222533037, + "learning_rate": 3.2987839659777075e-06, + "loss": 0.7254, + "step": 34580 + }, + { + "epoch": 1.95, + "grad_norm": 5.397106547037049, + "learning_rate": 3.2972428330010133e-06, + "loss": 0.7209, + "step": 34585 + }, + { + "epoch": 1.95, + "grad_norm": 5.233412595583847, + "learning_rate": 3.295701882993089e-06, + "loss": 0.7159, + "step": 34590 + }, + { + "epoch": 1.95, + "grad_norm": 4.898698516838979, + "learning_rate": 3.29416111611952e-06, + "loss": 0.7264, + "step": 34595 + }, + { + "epoch": 1.95, + "grad_norm": 11.151982890610785, + "learning_rate": 3.292620532545868e-06, + "loss": 0.678, + "step": 34600 + }, + { + "epoch": 1.95, + "grad_norm": 8.847001237003898, + "learning_rate": 3.291080132437674e-06, + "loss": 0.7123, + "step": 34605 + }, + { + "epoch": 1.95, + "grad_norm": 6.519189860016127, + "learning_rate": 3.2895399159604643e-06, + "loss": 0.7116, + "step": 34610 + }, + { + "epoch": 1.95, + "grad_norm": 5.0700122522342514, + "learning_rate": 3.2879998832797367e-06, + "loss": 0.6714, + "step": 34615 + }, + { + "epoch": 1.95, + "grad_norm": 10.656954353614141, + "learning_rate": 3.2864600345609788e-06, + "loss": 0.7002, + "step": 34620 + }, + { + "epoch": 1.95, + "grad_norm": 8.060961806551473, + "learning_rate": 3.284920369969651e-06, + "loss": 0.6637, + "step": 34625 + }, + { + "epoch": 1.95, + "grad_norm": 27.81425414714497, + "learning_rate": 3.283380889671201e-06, + "loss": 0.7247, + "step": 34630 + }, + { + "epoch": 1.95, + "grad_norm": 5.35180489889317, + "learning_rate": 3.2818415938310482e-06, + "loss": 0.6577, + "step": 34635 + }, + { + "epoch": 1.95, + "grad_norm": 8.20720735762461, + "learning_rate": 3.2803024826146012e-06, + "loss": 0.7122, + "step": 34640 + }, + { + "epoch": 1.95, + "grad_norm": 9.968011892621076, + "learning_rate": 3.2787635561872407e-06, + "loss": 0.6988, + "step": 34645 + }, + { + "epoch": 1.95, + "grad_norm": 8.627838913286878, + "learning_rate": 3.2772248147143314e-06, + "loss": 0.6984, + "step": 34650 + }, + { + "epoch": 1.95, + "grad_norm": 20.29426359885549, + "learning_rate": 3.2756862583612216e-06, + "loss": 0.6838, + "step": 34655 + }, + { + "epoch": 1.95, + "grad_norm": 5.206233227106853, + "learning_rate": 3.2741478872932304e-06, + "loss": 0.6747, + "step": 34660 + }, + { + "epoch": 1.95, + "grad_norm": 5.49570254513729, + "learning_rate": 3.2726097016756673e-06, + "loss": 0.6961, + "step": 34665 + }, + { + "epoch": 1.95, + "grad_norm": 7.765673366068342, + "learning_rate": 3.271071701673817e-06, + "loss": 0.6956, + "step": 34670 + }, + { + "epoch": 1.95, + "grad_norm": 31.278675608315666, + "learning_rate": 3.269533887452939e-06, + "loss": 0.6775, + "step": 34675 + }, + { + "epoch": 1.95, + "grad_norm": 6.296567576866515, + "learning_rate": 3.2679962591782843e-06, + "loss": 0.7264, + "step": 34680 + }, + { + "epoch": 1.95, + "grad_norm": 5.480026200208731, + "learning_rate": 3.266458817015074e-06, + "loss": 0.7041, + "step": 34685 + }, + { + "epoch": 1.95, + "grad_norm": 6.8713397591813, + "learning_rate": 3.2649215611285157e-06, + "loss": 0.7449, + "step": 34690 + }, + { + "epoch": 1.95, + "grad_norm": 4.996493401901696, + "learning_rate": 3.2633844916837933e-06, + "loss": 0.7304, + "step": 34695 + }, + { + "epoch": 1.96, + "grad_norm": 5.3405845221110715, + "learning_rate": 3.26184760884607e-06, + "loss": 0.7047, + "step": 34700 + }, + { + "epoch": 1.96, + "grad_norm": 9.21416176409067, + "learning_rate": 3.260310912780494e-06, + "loss": 0.7197, + "step": 34705 + }, + { + "epoch": 1.96, + "grad_norm": 5.478343052250641, + "learning_rate": 3.258774403652186e-06, + "loss": 0.6853, + "step": 34710 + }, + { + "epoch": 1.96, + "grad_norm": 4.736976102011249, + "learning_rate": 3.2572380816262554e-06, + "loss": 0.6808, + "step": 34715 + }, + { + "epoch": 1.96, + "grad_norm": 5.847908944633629, + "learning_rate": 3.255701946867782e-06, + "loss": 0.6935, + "step": 34720 + }, + { + "epoch": 1.96, + "grad_norm": 7.995112079997536, + "learning_rate": 3.2541659995418352e-06, + "loss": 0.654, + "step": 34725 + }, + { + "epoch": 1.96, + "grad_norm": 13.432544965454138, + "learning_rate": 3.252630239813455e-06, + "loss": 0.6921, + "step": 34730 + }, + { + "epoch": 1.96, + "grad_norm": 5.41514470068817, + "learning_rate": 3.2510946678476677e-06, + "loss": 0.6501, + "step": 34735 + }, + { + "epoch": 1.96, + "grad_norm": 6.04100676135012, + "learning_rate": 3.2495592838094768e-06, + "loss": 0.6964, + "step": 34740 + }, + { + "epoch": 1.96, + "grad_norm": 5.132702607976753, + "learning_rate": 3.2480240878638646e-06, + "loss": 0.6656, + "step": 34745 + }, + { + "epoch": 1.96, + "grad_norm": 4.984308471395036, + "learning_rate": 3.2464890801757987e-06, + "loss": 0.7329, + "step": 34750 + }, + { + "epoch": 1.96, + "grad_norm": 4.37083283436118, + "learning_rate": 3.2449542609102203e-06, + "loss": 0.6692, + "step": 34755 + }, + { + "epoch": 1.96, + "grad_norm": 6.065891822018525, + "learning_rate": 3.2434196302320498e-06, + "loss": 0.6915, + "step": 34760 + }, + { + "epoch": 1.96, + "grad_norm": 4.506466007238628, + "learning_rate": 3.2418851883061942e-06, + "loss": 0.6818, + "step": 34765 + }, + { + "epoch": 1.96, + "grad_norm": 4.540527651253667, + "learning_rate": 3.2403509352975325e-06, + "loss": 0.7146, + "step": 34770 + }, + { + "epoch": 1.96, + "grad_norm": 5.253856929743469, + "learning_rate": 3.238816871370931e-06, + "loss": 0.6754, + "step": 34775 + }, + { + "epoch": 1.96, + "grad_norm": 5.973745054767074, + "learning_rate": 3.2372829966912276e-06, + "loss": 0.7016, + "step": 34780 + }, + { + "epoch": 1.96, + "grad_norm": 6.4143701951065, + "learning_rate": 3.235749311423245e-06, + "loss": 0.7051, + "step": 34785 + }, + { + "epoch": 1.96, + "grad_norm": 7.256718611345136, + "learning_rate": 3.2342158157317875e-06, + "loss": 0.7005, + "step": 34790 + }, + { + "epoch": 1.96, + "grad_norm": 6.430052768862708, + "learning_rate": 3.232682509781632e-06, + "loss": 0.6717, + "step": 34795 + }, + { + "epoch": 1.96, + "grad_norm": 7.122382147228777, + "learning_rate": 3.2311493937375414e-06, + "loss": 0.7403, + "step": 34800 + }, + { + "epoch": 1.96, + "grad_norm": 4.850098024675329, + "learning_rate": 3.2296164677642538e-06, + "loss": 0.7054, + "step": 34805 + }, + { + "epoch": 1.96, + "grad_norm": 6.928735150390296, + "learning_rate": 3.2280837320264913e-06, + "loss": 0.7012, + "step": 34810 + }, + { + "epoch": 1.96, + "grad_norm": 7.971053368580964, + "learning_rate": 3.2265511866889528e-06, + "loss": 0.7092, + "step": 34815 + }, + { + "epoch": 1.96, + "grad_norm": 9.383201044098856, + "learning_rate": 3.2250188319163133e-06, + "loss": 0.6976, + "step": 34820 + }, + { + "epoch": 1.96, + "grad_norm": 4.725235772265117, + "learning_rate": 3.2234866678732356e-06, + "loss": 0.6857, + "step": 34825 + }, + { + "epoch": 1.96, + "grad_norm": 8.891194956713214, + "learning_rate": 3.221954694724355e-06, + "loss": 0.6958, + "step": 34830 + }, + { + "epoch": 1.96, + "grad_norm": 8.258838383391693, + "learning_rate": 3.2204229126342903e-06, + "loss": 0.6743, + "step": 34835 + }, + { + "epoch": 1.96, + "grad_norm": 16.058831689741183, + "learning_rate": 3.2188913217676384e-06, + "loss": 0.7022, + "step": 34840 + }, + { + "epoch": 1.96, + "grad_norm": 13.434261438384548, + "learning_rate": 3.2173599222889716e-06, + "loss": 0.7183, + "step": 34845 + }, + { + "epoch": 1.96, + "grad_norm": 5.054370650282337, + "learning_rate": 3.215828714362852e-06, + "loss": 0.7057, + "step": 34850 + }, + { + "epoch": 1.96, + "grad_norm": 4.774936223876148, + "learning_rate": 3.2142976981538087e-06, + "loss": 0.6832, + "step": 34855 + }, + { + "epoch": 1.96, + "grad_norm": 5.153082902012476, + "learning_rate": 3.2127668738263607e-06, + "loss": 0.7129, + "step": 34860 + }, + { + "epoch": 1.96, + "grad_norm": 8.438502406162897, + "learning_rate": 3.2112362415449983e-06, + "loss": 0.7127, + "step": 34865 + }, + { + "epoch": 1.96, + "grad_norm": 5.687628359593135, + "learning_rate": 3.2097058014741963e-06, + "loss": 0.6941, + "step": 34870 + }, + { + "epoch": 1.97, + "grad_norm": 5.208079044546652, + "learning_rate": 3.208175553778407e-06, + "loss": 0.6832, + "step": 34875 + }, + { + "epoch": 1.97, + "grad_norm": 5.083614473178835, + "learning_rate": 3.206645498622062e-06, + "loss": 0.735, + "step": 34880 + }, + { + "epoch": 1.97, + "grad_norm": 5.662208474418582, + "learning_rate": 3.205115636169574e-06, + "loss": 0.6765, + "step": 34885 + }, + { + "epoch": 1.97, + "grad_norm": 9.730622620037803, + "learning_rate": 3.203585966585331e-06, + "loss": 0.7013, + "step": 34890 + }, + { + "epoch": 1.97, + "grad_norm": 9.724229039966694, + "learning_rate": 3.2020564900337057e-06, + "loss": 0.7409, + "step": 34895 + }, + { + "epoch": 1.97, + "grad_norm": 4.749165581230447, + "learning_rate": 3.200527206679046e-06, + "loss": 0.6501, + "step": 34900 + }, + { + "epoch": 1.97, + "grad_norm": 5.376249727339917, + "learning_rate": 3.1989981166856767e-06, + "loss": 0.6926, + "step": 34905 + }, + { + "epoch": 1.97, + "grad_norm": 8.746816413591672, + "learning_rate": 3.1974692202179107e-06, + "loss": 0.7186, + "step": 34910 + }, + { + "epoch": 1.97, + "grad_norm": 6.3990200178856345, + "learning_rate": 3.1959405174400303e-06, + "loss": 0.7007, + "step": 34915 + }, + { + "epoch": 1.97, + "grad_norm": 5.916186341656027, + "learning_rate": 3.1944120085163045e-06, + "loss": 0.7204, + "step": 34920 + }, + { + "epoch": 1.97, + "grad_norm": 5.14631137566495, + "learning_rate": 3.1928836936109776e-06, + "loss": 0.7184, + "step": 34925 + }, + { + "epoch": 1.97, + "grad_norm": 6.220870589885455, + "learning_rate": 3.1913555728882706e-06, + "loss": 0.6801, + "step": 34930 + }, + { + "epoch": 1.97, + "grad_norm": 4.683925784817751, + "learning_rate": 3.1898276465123923e-06, + "loss": 0.6963, + "step": 34935 + }, + { + "epoch": 1.97, + "grad_norm": 5.498276001612888, + "learning_rate": 3.18829991464752e-06, + "loss": 0.7236, + "step": 34940 + }, + { + "epoch": 1.97, + "grad_norm": 6.310380146467167, + "learning_rate": 3.18677237745782e-06, + "loss": 0.643, + "step": 34945 + }, + { + "epoch": 1.97, + "grad_norm": 5.183976333866489, + "learning_rate": 3.18524503510743e-06, + "loss": 0.6871, + "step": 34950 + }, + { + "epoch": 1.97, + "grad_norm": 4.843396474345437, + "learning_rate": 3.183717887760468e-06, + "loss": 0.7395, + "step": 34955 + }, + { + "epoch": 1.97, + "grad_norm": 10.8160977151335, + "learning_rate": 3.182190935581036e-06, + "loss": 0.7199, + "step": 34960 + }, + { + "epoch": 1.97, + "grad_norm": 4.728666747082901, + "learning_rate": 3.1806641787332105e-06, + "loss": 0.6712, + "step": 34965 + }, + { + "epoch": 1.97, + "grad_norm": 7.3161951589156775, + "learning_rate": 3.1791376173810485e-06, + "loss": 0.7148, + "step": 34970 + }, + { + "epoch": 1.97, + "grad_norm": 4.649087116983282, + "learning_rate": 3.177611251688585e-06, + "loss": 0.6711, + "step": 34975 + }, + { + "epoch": 1.97, + "grad_norm": 5.358917128358095, + "learning_rate": 3.1760850818198364e-06, + "loss": 0.7179, + "step": 34980 + }, + { + "epoch": 1.97, + "grad_norm": 7.315739908454177, + "learning_rate": 3.174559107938796e-06, + "loss": 0.6969, + "step": 34985 + }, + { + "epoch": 1.97, + "grad_norm": 5.721395234434675, + "learning_rate": 3.173033330209433e-06, + "loss": 0.6818, + "step": 34990 + }, + { + "epoch": 1.97, + "grad_norm": 4.9793097019291315, + "learning_rate": 3.1715077487957046e-06, + "loss": 0.6921, + "step": 34995 + }, + { + "epoch": 1.97, + "grad_norm": 7.932958413519551, + "learning_rate": 3.1699823638615356e-06, + "loss": 0.7227, + "step": 35000 + }, + { + "epoch": 1.97, + "grad_norm": 6.266642265568164, + "learning_rate": 3.1684571755708405e-06, + "loss": 0.6626, + "step": 35005 + }, + { + "epoch": 1.97, + "grad_norm": 5.923864184283424, + "learning_rate": 3.1669321840875036e-06, + "loss": 0.6955, + "step": 35010 + }, + { + "epoch": 1.97, + "grad_norm": 4.773322611824129, + "learning_rate": 3.1654073895753927e-06, + "loss": 0.6571, + "step": 35015 + }, + { + "epoch": 1.97, + "grad_norm": 5.894107093078364, + "learning_rate": 3.163882792198357e-06, + "loss": 0.7053, + "step": 35020 + }, + { + "epoch": 1.97, + "grad_norm": 5.90151505620469, + "learning_rate": 3.1623583921202163e-06, + "loss": 0.6834, + "step": 35025 + }, + { + "epoch": 1.97, + "grad_norm": 5.962497029735831, + "learning_rate": 3.1608341895047784e-06, + "loss": 0.6902, + "step": 35030 + }, + { + "epoch": 1.97, + "grad_norm": 5.26088204113902, + "learning_rate": 3.1593101845158215e-06, + "loss": 0.718, + "step": 35035 + }, + { + "epoch": 1.97, + "grad_norm": 5.144310418494661, + "learning_rate": 3.1577863773171114e-06, + "loss": 0.6717, + "step": 35040 + }, + { + "epoch": 1.97, + "grad_norm": 7.875648665970789, + "learning_rate": 3.156262768072385e-06, + "loss": 0.6907, + "step": 35045 + }, + { + "epoch": 1.97, + "grad_norm": 5.664963596240274, + "learning_rate": 3.1547393569453596e-06, + "loss": 0.6864, + "step": 35050 + }, + { + "epoch": 1.98, + "grad_norm": 5.27004589472345, + "learning_rate": 3.1532161440997346e-06, + "loss": 0.7028, + "step": 35055 + }, + { + "epoch": 1.98, + "grad_norm": 6.323651522139277, + "learning_rate": 3.1516931296991852e-06, + "loss": 0.682, + "step": 35060 + }, + { + "epoch": 1.98, + "grad_norm": 4.790824661300352, + "learning_rate": 3.150170313907367e-06, + "loss": 0.677, + "step": 35065 + }, + { + "epoch": 1.98, + "grad_norm": 4.8124891660377775, + "learning_rate": 3.1486476968879133e-06, + "loss": 0.648, + "step": 35070 + }, + { + "epoch": 1.98, + "grad_norm": 5.709249276514523, + "learning_rate": 3.1471252788044326e-06, + "loss": 0.7221, + "step": 35075 + }, + { + "epoch": 1.98, + "grad_norm": 5.5922070158533845, + "learning_rate": 3.14560305982052e-06, + "loss": 0.6912, + "step": 35080 + }, + { + "epoch": 1.98, + "grad_norm": 5.052275705629103, + "learning_rate": 3.1440810400997415e-06, + "loss": 0.7063, + "step": 35085 + }, + { + "epoch": 1.98, + "grad_norm": 4.614530670321845, + "learning_rate": 3.1425592198056475e-06, + "loss": 0.6681, + "step": 35090 + }, + { + "epoch": 1.98, + "grad_norm": 5.213499876329178, + "learning_rate": 3.1410375991017623e-06, + "loss": 0.6812, + "step": 35095 + }, + { + "epoch": 1.98, + "grad_norm": 5.39707774006819, + "learning_rate": 3.1395161781515907e-06, + "loss": 0.6478, + "step": 35100 + }, + { + "epoch": 1.98, + "grad_norm": 5.183880018678881, + "learning_rate": 3.1379949571186173e-06, + "loss": 0.7054, + "step": 35105 + }, + { + "epoch": 1.98, + "grad_norm": 4.790529024945295, + "learning_rate": 3.136473936166302e-06, + "loss": 0.7118, + "step": 35110 + }, + { + "epoch": 1.98, + "grad_norm": 5.790714271544594, + "learning_rate": 3.1349531154580885e-06, + "loss": 0.6785, + "step": 35115 + }, + { + "epoch": 1.98, + "grad_norm": 5.363396283304108, + "learning_rate": 3.1334324951573925e-06, + "loss": 0.6911, + "step": 35120 + }, + { + "epoch": 1.98, + "grad_norm": 4.67172662481413, + "learning_rate": 3.131912075427614e-06, + "loss": 0.7088, + "step": 35125 + }, + { + "epoch": 1.98, + "grad_norm": 5.1825079244181635, + "learning_rate": 3.1303918564321276e-06, + "loss": 0.6561, + "step": 35130 + }, + { + "epoch": 1.98, + "grad_norm": 5.127097794226152, + "learning_rate": 3.1288718383342853e-06, + "loss": 0.6708, + "step": 35135 + }, + { + "epoch": 1.98, + "grad_norm": 8.517761463954615, + "learning_rate": 3.1273520212974244e-06, + "loss": 0.6742, + "step": 35140 + }, + { + "epoch": 1.98, + "grad_norm": 6.768996000200358, + "learning_rate": 3.125832405484851e-06, + "loss": 0.7208, + "step": 35145 + }, + { + "epoch": 1.98, + "grad_norm": 4.54907973874482, + "learning_rate": 3.1243129910598587e-06, + "loss": 0.6833, + "step": 35150 + }, + { + "epoch": 1.98, + "grad_norm": 6.040848945756802, + "learning_rate": 3.1227937781857137e-06, + "loss": 0.6692, + "step": 35155 + }, + { + "epoch": 1.98, + "grad_norm": 7.359439672587436, + "learning_rate": 3.1212747670256606e-06, + "loss": 0.6849, + "step": 35160 + }, + { + "epoch": 1.98, + "grad_norm": 8.601260479168754, + "learning_rate": 3.119755957742927e-06, + "loss": 0.666, + "step": 35165 + }, + { + "epoch": 1.98, + "grad_norm": 5.818423116929544, + "learning_rate": 3.1182373505007115e-06, + "loss": 0.7284, + "step": 35170 + }, + { + "epoch": 1.98, + "grad_norm": 11.28760642783991, + "learning_rate": 3.1167189454622e-06, + "loss": 0.6949, + "step": 35175 + }, + { + "epoch": 1.98, + "grad_norm": 6.919285582455202, + "learning_rate": 3.1152007427905496e-06, + "loss": 0.6965, + "step": 35180 + }, + { + "epoch": 1.98, + "grad_norm": 7.602421231140983, + "learning_rate": 3.113682742648895e-06, + "loss": 0.6468, + "step": 35185 + }, + { + "epoch": 1.98, + "grad_norm": 5.18047545646941, + "learning_rate": 3.1121649452003553e-06, + "loss": 0.6905, + "step": 35190 + }, + { + "epoch": 1.98, + "grad_norm": 5.692723737245233, + "learning_rate": 3.1106473506080248e-06, + "loss": 0.6641, + "step": 35195 + }, + { + "epoch": 1.98, + "grad_norm": 6.089993644237553, + "learning_rate": 3.1091299590349744e-06, + "loss": 0.7022, + "step": 35200 + }, + { + "epoch": 1.98, + "grad_norm": 18.464348102001022, + "learning_rate": 3.107612770644254e-06, + "loss": 0.7242, + "step": 35205 + }, + { + "epoch": 1.98, + "grad_norm": 9.417910039588145, + "learning_rate": 3.1060957855988956e-06, + "loss": 0.6852, + "step": 35210 + }, + { + "epoch": 1.98, + "grad_norm": 10.911349146203593, + "learning_rate": 3.104579004061903e-06, + "loss": 0.6824, + "step": 35215 + }, + { + "epoch": 1.98, + "grad_norm": 5.9585363784038154, + "learning_rate": 3.1030624261962595e-06, + "loss": 0.7463, + "step": 35220 + }, + { + "epoch": 1.98, + "grad_norm": 10.927299731693227, + "learning_rate": 3.101546052164932e-06, + "loss": 0.7165, + "step": 35225 + }, + { + "epoch": 1.99, + "grad_norm": 18.82840268503806, + "learning_rate": 3.1000298821308583e-06, + "loss": 0.6777, + "step": 35230 + }, + { + "epoch": 1.99, + "grad_norm": 26.543236627565317, + "learning_rate": 3.098513916256961e-06, + "loss": 0.7074, + "step": 35235 + }, + { + "epoch": 1.99, + "grad_norm": 18.053590209899767, + "learning_rate": 3.0969981547061333e-06, + "loss": 0.7189, + "step": 35240 + }, + { + "epoch": 1.99, + "grad_norm": 17.09854583103734, + "learning_rate": 3.0954825976412528e-06, + "loss": 0.7315, + "step": 35245 + }, + { + "epoch": 1.99, + "grad_norm": 17.441055497384916, + "learning_rate": 3.0939672452251736e-06, + "loss": 0.7075, + "step": 35250 + }, + { + "epoch": 1.99, + "grad_norm": 4.874551333299127, + "learning_rate": 3.0924520976207238e-06, + "loss": 0.7354, + "step": 35255 + }, + { + "epoch": 1.99, + "grad_norm": 17.548203979119037, + "learning_rate": 3.0909371549907175e-06, + "loss": 0.7256, + "step": 35260 + }, + { + "epoch": 1.99, + "grad_norm": 16.1006428562444, + "learning_rate": 3.089422417497938e-06, + "loss": 0.7272, + "step": 35265 + }, + { + "epoch": 1.99, + "grad_norm": 13.063491743642588, + "learning_rate": 3.0879078853051506e-06, + "loss": 0.6953, + "step": 35270 + }, + { + "epoch": 1.99, + "grad_norm": 5.957301907855168, + "learning_rate": 3.0863935585751014e-06, + "loss": 0.6526, + "step": 35275 + }, + { + "epoch": 1.99, + "grad_norm": 5.94978234221093, + "learning_rate": 3.0848794374705083e-06, + "loss": 0.7051, + "step": 35280 + }, + { + "epoch": 1.99, + "grad_norm": 5.441672951653837, + "learning_rate": 3.083365522154072e-06, + "loss": 0.6943, + "step": 35285 + }, + { + "epoch": 1.99, + "grad_norm": 4.792613427444279, + "learning_rate": 3.0818518127884693e-06, + "loss": 0.6771, + "step": 35290 + }, + { + "epoch": 1.99, + "grad_norm": 8.209539089258062, + "learning_rate": 3.0803383095363547e-06, + "loss": 0.6916, + "step": 35295 + }, + { + "epoch": 1.99, + "grad_norm": 5.5721290185461605, + "learning_rate": 3.0788250125603624e-06, + "loss": 0.6846, + "step": 35300 + }, + { + "epoch": 1.99, + "grad_norm": 7.630528358446945, + "learning_rate": 3.0773119220230986e-06, + "loss": 0.6525, + "step": 35305 + }, + { + "epoch": 1.99, + "grad_norm": 4.930745118257468, + "learning_rate": 3.0757990380871567e-06, + "loss": 0.6725, + "step": 35310 + }, + { + "epoch": 1.99, + "grad_norm": 9.407960926449672, + "learning_rate": 3.074286360915098e-06, + "loss": 0.6855, + "step": 35315 + }, + { + "epoch": 1.99, + "grad_norm": 6.699747509500149, + "learning_rate": 3.0727738906694703e-06, + "loss": 0.6812, + "step": 35320 + }, + { + "epoch": 1.99, + "grad_norm": 12.908081523308288, + "learning_rate": 3.0712616275127937e-06, + "loss": 0.6618, + "step": 35325 + }, + { + "epoch": 1.99, + "grad_norm": 5.064774130169687, + "learning_rate": 3.0697495716075665e-06, + "loss": 0.6746, + "step": 35330 + }, + { + "epoch": 1.99, + "grad_norm": 6.04715897242109, + "learning_rate": 3.068237723116266e-06, + "loss": 0.722, + "step": 35335 + }, + { + "epoch": 1.99, + "grad_norm": 6.107756854479823, + "learning_rate": 3.066726082201348e-06, + "loss": 0.6685, + "step": 35340 + }, + { + "epoch": 1.99, + "grad_norm": 4.940410971828386, + "learning_rate": 3.0652146490252466e-06, + "loss": 0.6744, + "step": 35345 + }, + { + "epoch": 1.99, + "grad_norm": 10.438299431843944, + "learning_rate": 3.0637034237503694e-06, + "loss": 0.6887, + "step": 35350 + }, + { + "epoch": 1.99, + "grad_norm": 5.286761988035629, + "learning_rate": 3.062192406539103e-06, + "loss": 0.6548, + "step": 35355 + }, + { + "epoch": 1.99, + "grad_norm": 5.1315447819054265, + "learning_rate": 3.0606815975538166e-06, + "loss": 0.6672, + "step": 35360 + }, + { + "epoch": 1.99, + "grad_norm": 5.546797293429852, + "learning_rate": 3.0591709969568496e-06, + "loss": 0.6797, + "step": 35365 + }, + { + "epoch": 1.99, + "grad_norm": 4.893488072686185, + "learning_rate": 3.0576606049105267e-06, + "loss": 0.711, + "step": 35370 + }, + { + "epoch": 1.99, + "grad_norm": 5.333598157410917, + "learning_rate": 3.056150421577142e-06, + "loss": 0.7055, + "step": 35375 + }, + { + "epoch": 1.99, + "grad_norm": 4.630504334203577, + "learning_rate": 3.054640447118974e-06, + "loss": 0.6762, + "step": 35380 + }, + { + "epoch": 1.99, + "grad_norm": 10.818021257760977, + "learning_rate": 3.0531306816982764e-06, + "loss": 0.6943, + "step": 35385 + }, + { + "epoch": 1.99, + "grad_norm": 9.459984953599493, + "learning_rate": 3.0516211254772777e-06, + "loss": 0.6862, + "step": 35390 + }, + { + "epoch": 1.99, + "grad_norm": 5.05717280742564, + "learning_rate": 3.05011177861819e-06, + "loss": 0.6932, + "step": 35395 + }, + { + "epoch": 1.99, + "grad_norm": 5.159867206313164, + "learning_rate": 3.048602641283195e-06, + "loss": 0.6862, + "step": 35400 + }, + { + "epoch": 1.99, + "grad_norm": 8.700555157614799, + "learning_rate": 3.04709371363446e-06, + "loss": 0.6797, + "step": 35405 + }, + { + "epoch": 2.0, + "grad_norm": 4.504531105194354, + "learning_rate": 3.045584995834124e-06, + "loss": 0.7213, + "step": 35410 + }, + { + "epoch": 2.0, + "grad_norm": 5.583940979099118, + "learning_rate": 3.0440764880443045e-06, + "loss": 0.6655, + "step": 35415 + }, + { + "epoch": 2.0, + "grad_norm": 7.2446812458027345, + "learning_rate": 3.0425681904270993e-06, + "loss": 0.6685, + "step": 35420 + }, + { + "epoch": 2.0, + "grad_norm": 5.98702038239393, + "learning_rate": 3.0410601031445807e-06, + "loss": 0.6659, + "step": 35425 + }, + { + "epoch": 2.0, + "grad_norm": 5.30765691435782, + "learning_rate": 3.0395522263587996e-06, + "loss": 0.6463, + "step": 35430 + }, + { + "epoch": 2.0, + "grad_norm": 10.507169948067107, + "learning_rate": 3.038044560231785e-06, + "loss": 0.697, + "step": 35435 + }, + { + "epoch": 2.0, + "grad_norm": 8.80795821968365, + "learning_rate": 3.036537104925539e-06, + "loss": 0.6565, + "step": 35440 + }, + { + "epoch": 2.0, + "grad_norm": 6.580353059297831, + "learning_rate": 3.035029860602049e-06, + "loss": 0.6895, + "step": 35445 + }, + { + "epoch": 2.0, + "grad_norm": 4.90561869904271, + "learning_rate": 3.0335228274232697e-06, + "loss": 0.7017, + "step": 35450 + }, + { + "epoch": 2.0, + "grad_norm": 5.256834525834763, + "learning_rate": 3.032016005551144e-06, + "loss": 0.6792, + "step": 35455 + }, + { + "epoch": 2.0, + "grad_norm": 7.568160748735181, + "learning_rate": 3.030509395147581e-06, + "loss": 0.6821, + "step": 35460 + }, + { + "epoch": 2.0, + "grad_norm": 9.40151804531815, + "learning_rate": 3.0290029963744783e-06, + "loss": 0.6535, + "step": 35465 + }, + { + "epoch": 2.0, + "grad_norm": 9.454250046103105, + "learning_rate": 3.0274968093937013e-06, + "loss": 0.6958, + "step": 35470 + }, + { + "epoch": 2.0, + "grad_norm": 6.2595836898954405, + "learning_rate": 3.0259908343670964e-06, + "loss": 0.6589, + "step": 35475 + }, + { + "epoch": 2.0, + "grad_norm": 6.171782526094596, + "learning_rate": 3.02448507145649e-06, + "loss": 0.6818, + "step": 35480 + }, + { + "epoch": 2.0, + "grad_norm": 5.538920031530736, + "learning_rate": 3.0229795208236793e-06, + "loss": 0.6603, + "step": 35485 + }, + { + "epoch": 2.0, + "grad_norm": 7.316265528218628, + "learning_rate": 3.0214741826304466e-06, + "loss": 0.6717, + "step": 35490 + }, + { + "epoch": 2.0, + "grad_norm": 20.807201792994373, + "learning_rate": 3.019969057038545e-06, + "loss": 0.6614, + "step": 35495 + }, + { + "epoch": 2.0, + "eval_loss": 1.1088519096374512, + "eval_runtime": 25.3868, + "eval_samples_per_second": 31.749, + "eval_steps_per_second": 3.978, + "step": 35496 + }, + { + "epoch": 2.0, + "grad_norm": 6.624162199485662, + "learning_rate": 3.018464144209704e-06, + "loss": 0.4827, + "step": 35500 + }, + { + "epoch": 2.0, + "grad_norm": 5.251439545333477, + "learning_rate": 3.0169594443056386e-06, + "loss": 0.4616, + "step": 35505 + }, + { + "epoch": 2.0, + "grad_norm": 5.592487760809804, + "learning_rate": 3.015454957488031e-06, + "loss": 0.422, + "step": 35510 + }, + { + "epoch": 2.0, + "grad_norm": 5.445055475848834, + "learning_rate": 3.0139506839185473e-06, + "loss": 0.4291, + "step": 35515 + }, + { + "epoch": 2.0, + "grad_norm": 4.73350203023127, + "learning_rate": 3.012446623758827e-06, + "loss": 0.4073, + "step": 35520 + }, + { + "epoch": 2.0, + "grad_norm": 4.610318191731628, + "learning_rate": 3.0109427771704903e-06, + "loss": 0.4327, + "step": 35525 + }, + { + "epoch": 2.0, + "grad_norm": 4.798895152834253, + "learning_rate": 3.0094391443151306e-06, + "loss": 0.414, + "step": 35530 + }, + { + "epoch": 2.0, + "grad_norm": 4.8561901802041305, + "learning_rate": 3.007935725354317e-06, + "loss": 0.4212, + "step": 35535 + }, + { + "epoch": 2.0, + "grad_norm": 5.573398710816627, + "learning_rate": 3.0064325204496046e-06, + "loss": 0.418, + "step": 35540 + }, + { + "epoch": 2.0, + "grad_norm": 13.95415125209879, + "learning_rate": 3.004929529762514e-06, + "loss": 0.4089, + "step": 35545 + }, + { + "epoch": 2.0, + "grad_norm": 5.18703664328013, + "learning_rate": 3.0034267534545525e-06, + "loss": 0.4446, + "step": 35550 + }, + { + "epoch": 2.0, + "grad_norm": 4.877676239718891, + "learning_rate": 3.0019241916871968e-06, + "loss": 0.438, + "step": 35555 + }, + { + "epoch": 2.0, + "grad_norm": 4.423217455654268, + "learning_rate": 3.000421844621905e-06, + "loss": 0.425, + "step": 35560 + }, + { + "epoch": 2.0, + "grad_norm": 5.086485665867204, + "learning_rate": 2.99891971242011e-06, + "loss": 0.4715, + "step": 35565 + }, + { + "epoch": 2.0, + "grad_norm": 5.471798415265517, + "learning_rate": 2.9974177952432227e-06, + "loss": 0.4301, + "step": 35570 + }, + { + "epoch": 2.0, + "grad_norm": 5.310256810777645, + "learning_rate": 2.9959160932526333e-06, + "loss": 0.437, + "step": 35575 + }, + { + "epoch": 2.0, + "grad_norm": 4.945012475712052, + "learning_rate": 2.994414606609705e-06, + "loss": 0.4287, + "step": 35580 + }, + { + "epoch": 2.01, + "grad_norm": 4.799289907826112, + "learning_rate": 2.9929133354757754e-06, + "loss": 0.4317, + "step": 35585 + }, + { + "epoch": 2.01, + "grad_norm": 4.670616354730726, + "learning_rate": 2.9914122800121693e-06, + "loss": 0.436, + "step": 35590 + }, + { + "epoch": 2.01, + "grad_norm": 6.938748044740877, + "learning_rate": 2.9899114403801754e-06, + "loss": 0.4419, + "step": 35595 + }, + { + "epoch": 2.01, + "grad_norm": 4.820127001934945, + "learning_rate": 2.9884108167410703e-06, + "loss": 0.4261, + "step": 35600 + }, + { + "epoch": 2.01, + "grad_norm": 4.644965024100297, + "learning_rate": 2.9869104092560993e-06, + "loss": 0.4271, + "step": 35605 + }, + { + "epoch": 2.01, + "grad_norm": 5.087396027260219, + "learning_rate": 2.9854102180864895e-06, + "loss": 0.399, + "step": 35610 + }, + { + "epoch": 2.01, + "grad_norm": 5.92111872490377, + "learning_rate": 2.9839102433934446e-06, + "loss": 0.4188, + "step": 35615 + }, + { + "epoch": 2.01, + "grad_norm": 4.95032930360502, + "learning_rate": 2.9824104853381396e-06, + "loss": 0.4215, + "step": 35620 + }, + { + "epoch": 2.01, + "grad_norm": 4.565444835470105, + "learning_rate": 2.980910944081734e-06, + "loss": 0.4269, + "step": 35625 + }, + { + "epoch": 2.01, + "grad_norm": 5.299176971333008, + "learning_rate": 2.9794116197853564e-06, + "loss": 0.4459, + "step": 35630 + }, + { + "epoch": 2.01, + "grad_norm": 4.964758881245675, + "learning_rate": 2.97791251261012e-06, + "loss": 0.4502, + "step": 35635 + }, + { + "epoch": 2.01, + "grad_norm": 4.883842627489473, + "learning_rate": 2.976413622717108e-06, + "loss": 0.4043, + "step": 35640 + }, + { + "epoch": 2.01, + "grad_norm": 6.6760575472965185, + "learning_rate": 2.974914950267381e-06, + "loss": 0.4433, + "step": 35645 + }, + { + "epoch": 2.01, + "grad_norm": 4.589695042834639, + "learning_rate": 2.9734164954219812e-06, + "loss": 0.4467, + "step": 35650 + }, + { + "epoch": 2.01, + "grad_norm": 5.199176387428726, + "learning_rate": 2.9719182583419233e-06, + "loss": 0.4263, + "step": 35655 + }, + { + "epoch": 2.01, + "grad_norm": 5.4164360665117925, + "learning_rate": 2.9704202391881987e-06, + "loss": 0.4137, + "step": 35660 + }, + { + "epoch": 2.01, + "grad_norm": 4.9496041243339395, + "learning_rate": 2.9689224381217774e-06, + "loss": 0.427, + "step": 35665 + }, + { + "epoch": 2.01, + "grad_norm": 4.413073547077309, + "learning_rate": 2.9674248553036022e-06, + "loss": 0.4254, + "step": 35670 + }, + { + "epoch": 2.01, + "grad_norm": 4.632442675004174, + "learning_rate": 2.965927490894599e-06, + "loss": 0.4133, + "step": 35675 + }, + { + "epoch": 2.01, + "grad_norm": 7.153190155452668, + "learning_rate": 2.9644303450556617e-06, + "loss": 0.4454, + "step": 35680 + }, + { + "epoch": 2.01, + "grad_norm": 4.114342055733055, + "learning_rate": 2.96293341794767e-06, + "loss": 0.4234, + "step": 35685 + }, + { + "epoch": 2.01, + "grad_norm": 4.495183768494921, + "learning_rate": 2.96143670973147e-06, + "loss": 0.4171, + "step": 35690 + }, + { + "epoch": 2.01, + "grad_norm": 5.216950966436851, + "learning_rate": 2.9599402205678945e-06, + "loss": 0.3834, + "step": 35695 + }, + { + "epoch": 2.01, + "grad_norm": 4.836099168905307, + "learning_rate": 2.9584439506177453e-06, + "loss": 0.4115, + "step": 35700 + }, + { + "epoch": 2.01, + "grad_norm": 4.9788850849651105, + "learning_rate": 2.956947900041802e-06, + "loss": 0.4315, + "step": 35705 + }, + { + "epoch": 2.01, + "grad_norm": 5.335694047766549, + "learning_rate": 2.955452069000826e-06, + "loss": 0.4046, + "step": 35710 + }, + { + "epoch": 2.01, + "grad_norm": 5.659384034868762, + "learning_rate": 2.9539564576555462e-06, + "loss": 0.4144, + "step": 35715 + }, + { + "epoch": 2.01, + "grad_norm": 4.337211105652029, + "learning_rate": 2.952461066166677e-06, + "loss": 0.4186, + "step": 35720 + }, + { + "epoch": 2.01, + "grad_norm": 6.139699583710478, + "learning_rate": 2.950965894694903e-06, + "loss": 0.451, + "step": 35725 + }, + { + "epoch": 2.01, + "grad_norm": 5.061656896769589, + "learning_rate": 2.949470943400884e-06, + "loss": 0.41, + "step": 35730 + }, + { + "epoch": 2.01, + "grad_norm": 6.797408797001112, + "learning_rate": 2.947976212445265e-06, + "loss": 0.4397, + "step": 35735 + }, + { + "epoch": 2.01, + "grad_norm": 7.272477965943136, + "learning_rate": 2.946481701988655e-06, + "loss": 0.4468, + "step": 35740 + }, + { + "epoch": 2.01, + "grad_norm": 10.313697508432313, + "learning_rate": 2.9449874121916504e-06, + "loss": 0.4348, + "step": 35745 + }, + { + "epoch": 2.01, + "grad_norm": 6.078313122337126, + "learning_rate": 2.9434933432148194e-06, + "loss": 0.4323, + "step": 35750 + }, + { + "epoch": 2.01, + "grad_norm": 15.740935713715533, + "learning_rate": 2.9419994952187013e-06, + "loss": 0.4118, + "step": 35755 + }, + { + "epoch": 2.01, + "grad_norm": 6.6839709542325405, + "learning_rate": 2.9405058683638233e-06, + "loss": 0.4438, + "step": 35760 + }, + { + "epoch": 2.02, + "grad_norm": 6.198213094334123, + "learning_rate": 2.939012462810677e-06, + "loss": 0.4368, + "step": 35765 + }, + { + "epoch": 2.02, + "grad_norm": 5.0077983133807695, + "learning_rate": 2.9375192787197392e-06, + "loss": 0.427, + "step": 35770 + }, + { + "epoch": 2.02, + "grad_norm": 4.8076979963613695, + "learning_rate": 2.9360263162514554e-06, + "loss": 0.3971, + "step": 35775 + }, + { + "epoch": 2.02, + "grad_norm": 4.9645393002102365, + "learning_rate": 2.934533575566254e-06, + "loss": 0.4248, + "step": 35780 + }, + { + "epoch": 2.02, + "grad_norm": 4.598701108429405, + "learning_rate": 2.933041056824536e-06, + "loss": 0.44, + "step": 35785 + }, + { + "epoch": 2.02, + "grad_norm": 5.3368283017412175, + "learning_rate": 2.9315487601866777e-06, + "loss": 0.4002, + "step": 35790 + }, + { + "epoch": 2.02, + "grad_norm": 6.185065483388467, + "learning_rate": 2.9300566858130343e-06, + "loss": 0.4707, + "step": 35795 + }, + { + "epoch": 2.02, + "grad_norm": 5.604580349140249, + "learning_rate": 2.928564833863934e-06, + "loss": 0.4243, + "step": 35800 + }, + { + "epoch": 2.02, + "grad_norm": 5.5612185838977135, + "learning_rate": 2.9270732044996864e-06, + "loss": 0.4131, + "step": 35805 + }, + { + "epoch": 2.02, + "grad_norm": 4.813337345210239, + "learning_rate": 2.925581797880571e-06, + "loss": 0.4174, + "step": 35810 + }, + { + "epoch": 2.02, + "grad_norm": 5.592818903498818, + "learning_rate": 2.9240906141668457e-06, + "loss": 0.4526, + "step": 35815 + }, + { + "epoch": 2.02, + "grad_norm": 4.463786035143501, + "learning_rate": 2.9225996535187464e-06, + "loss": 0.3919, + "step": 35820 + }, + { + "epoch": 2.02, + "grad_norm": 4.827176495670314, + "learning_rate": 2.9211089160964813e-06, + "loss": 0.4226, + "step": 35825 + }, + { + "epoch": 2.02, + "grad_norm": 5.077025005993725, + "learning_rate": 2.9196184020602397e-06, + "loss": 0.4139, + "step": 35830 + }, + { + "epoch": 2.02, + "grad_norm": 4.551503871376388, + "learning_rate": 2.9181281115701816e-06, + "loss": 0.4059, + "step": 35835 + }, + { + "epoch": 2.02, + "grad_norm": 10.107936959906947, + "learning_rate": 2.9166380447864444e-06, + "loss": 0.4717, + "step": 35840 + }, + { + "epoch": 2.02, + "grad_norm": 4.368268058379852, + "learning_rate": 2.915148201869146e-06, + "loss": 0.3893, + "step": 35845 + }, + { + "epoch": 2.02, + "grad_norm": 5.909165442524433, + "learning_rate": 2.9136585829783727e-06, + "loss": 0.44, + "step": 35850 + }, + { + "epoch": 2.02, + "grad_norm": 4.7662995337880565, + "learning_rate": 2.912169188274194e-06, + "loss": 0.4275, + "step": 35855 + }, + { + "epoch": 2.02, + "grad_norm": 4.6140968907568825, + "learning_rate": 2.910680017916649e-06, + "loss": 0.419, + "step": 35860 + }, + { + "epoch": 2.02, + "grad_norm": 4.5432678858739575, + "learning_rate": 2.9091910720657584e-06, + "loss": 0.4251, + "step": 35865 + }, + { + "epoch": 2.02, + "grad_norm": 4.89283026920533, + "learning_rate": 2.9077023508815148e-06, + "loss": 0.4366, + "step": 35870 + }, + { + "epoch": 2.02, + "grad_norm": 5.060170416042779, + "learning_rate": 2.9062138545238855e-06, + "loss": 0.4583, + "step": 35875 + }, + { + "epoch": 2.02, + "grad_norm": 4.644859363623836, + "learning_rate": 2.9047255831528196e-06, + "loss": 0.4277, + "step": 35880 + }, + { + "epoch": 2.02, + "grad_norm": 4.839867757789757, + "learning_rate": 2.903237536928236e-06, + "loss": 0.4145, + "step": 35885 + }, + { + "epoch": 2.02, + "grad_norm": 5.422980378197918, + "learning_rate": 2.901749716010034e-06, + "loss": 0.4215, + "step": 35890 + }, + { + "epoch": 2.02, + "grad_norm": 8.076494068953284, + "learning_rate": 2.9002621205580857e-06, + "loss": 0.4491, + "step": 35895 + }, + { + "epoch": 2.02, + "grad_norm": 4.869737017386848, + "learning_rate": 2.8987747507322373e-06, + "loss": 0.4449, + "step": 35900 + }, + { + "epoch": 2.02, + "grad_norm": 4.5067336712799815, + "learning_rate": 2.897287606692315e-06, + "loss": 0.461, + "step": 35905 + }, + { + "epoch": 2.02, + "grad_norm": 4.730046760882599, + "learning_rate": 2.8958006885981216e-06, + "loss": 0.456, + "step": 35910 + }, + { + "epoch": 2.02, + "grad_norm": 5.114492849485682, + "learning_rate": 2.894313996609429e-06, + "loss": 0.4181, + "step": 35915 + }, + { + "epoch": 2.02, + "grad_norm": 4.616343883285859, + "learning_rate": 2.89282753088599e-06, + "loss": 0.4385, + "step": 35920 + }, + { + "epoch": 2.02, + "grad_norm": 5.164659013183382, + "learning_rate": 2.891341291587535e-06, + "loss": 0.4289, + "step": 35925 + }, + { + "epoch": 2.02, + "grad_norm": 4.5568870158173365, + "learning_rate": 2.8898552788737643e-06, + "loss": 0.4288, + "step": 35930 + }, + { + "epoch": 2.02, + "grad_norm": 4.892143230641789, + "learning_rate": 2.8883694929043547e-06, + "loss": 0.4195, + "step": 35935 + }, + { + "epoch": 2.03, + "grad_norm": 4.995254126052123, + "learning_rate": 2.8868839338389644e-06, + "loss": 0.4462, + "step": 35940 + }, + { + "epoch": 2.03, + "grad_norm": 4.66834365046065, + "learning_rate": 2.8853986018372195e-06, + "loss": 0.427, + "step": 35945 + }, + { + "epoch": 2.03, + "grad_norm": 4.984404586363791, + "learning_rate": 2.8839134970587284e-06, + "loss": 0.4164, + "step": 35950 + }, + { + "epoch": 2.03, + "grad_norm": 5.299787020257302, + "learning_rate": 2.8824286196630712e-06, + "loss": 0.4321, + "step": 35955 + }, + { + "epoch": 2.03, + "grad_norm": 5.275047001853112, + "learning_rate": 2.880943969809803e-06, + "loss": 0.4337, + "step": 35960 + }, + { + "epoch": 2.03, + "grad_norm": 12.162259568469418, + "learning_rate": 2.8794595476584597e-06, + "loss": 0.4329, + "step": 35965 + }, + { + "epoch": 2.03, + "grad_norm": 8.81831957712956, + "learning_rate": 2.877975353368544e-06, + "loss": 0.4145, + "step": 35970 + }, + { + "epoch": 2.03, + "grad_norm": 6.528143333174243, + "learning_rate": 2.876491387099544e-06, + "loss": 0.3957, + "step": 35975 + }, + { + "epoch": 2.03, + "grad_norm": 5.231655154828122, + "learning_rate": 2.875007649010916e-06, + "loss": 0.4281, + "step": 35980 + }, + { + "epoch": 2.03, + "grad_norm": 7.461544004566585, + "learning_rate": 2.873524139262093e-06, + "loss": 0.4101, + "step": 35985 + }, + { + "epoch": 2.03, + "grad_norm": 5.135560679089954, + "learning_rate": 2.872040858012487e-06, + "loss": 0.4174, + "step": 35990 + }, + { + "epoch": 2.03, + "grad_norm": 10.469069434737028, + "learning_rate": 2.8705578054214813e-06, + "loss": 0.41, + "step": 35995 + }, + { + "epoch": 2.03, + "grad_norm": 6.668290524868912, + "learning_rate": 2.8690749816484363e-06, + "loss": 0.4635, + "step": 36000 + }, + { + "epoch": 2.03, + "grad_norm": 4.427870015516103, + "learning_rate": 2.8675923868526924e-06, + "loss": 0.4084, + "step": 36005 + }, + { + "epoch": 2.03, + "grad_norm": 5.1801009084144525, + "learning_rate": 2.8661100211935544e-06, + "loss": 0.4329, + "step": 36010 + }, + { + "epoch": 2.03, + "grad_norm": 5.8453693226143, + "learning_rate": 2.864627884830314e-06, + "loss": 0.4363, + "step": 36015 + }, + { + "epoch": 2.03, + "grad_norm": 6.703891728875428, + "learning_rate": 2.8631459779222304e-06, + "loss": 0.3968, + "step": 36020 + }, + { + "epoch": 2.03, + "grad_norm": 6.425389133222261, + "learning_rate": 2.861664300628544e-06, + "loss": 0.4379, + "step": 36025 + }, + { + "epoch": 2.03, + "grad_norm": 4.521892542208789, + "learning_rate": 2.8601828531084635e-06, + "loss": 0.3961, + "step": 36030 + }, + { + "epoch": 2.03, + "grad_norm": 4.841144524461962, + "learning_rate": 2.858701635521181e-06, + "loss": 0.4389, + "step": 36035 + }, + { + "epoch": 2.03, + "grad_norm": 4.51812169464078, + "learning_rate": 2.857220648025858e-06, + "loss": 0.4123, + "step": 36040 + }, + { + "epoch": 2.03, + "grad_norm": 4.435356632303837, + "learning_rate": 2.8557398907816315e-06, + "loss": 0.3939, + "step": 36045 + }, + { + "epoch": 2.03, + "grad_norm": 5.02000080834046, + "learning_rate": 2.854259363947619e-06, + "loss": 0.4284, + "step": 36050 + }, + { + "epoch": 2.03, + "grad_norm": 4.810024106384719, + "learning_rate": 2.852779067682906e-06, + "loss": 0.4115, + "step": 36055 + }, + { + "epoch": 2.03, + "grad_norm": 5.726470021066952, + "learning_rate": 2.85129900214656e-06, + "loss": 0.4044, + "step": 36060 + }, + { + "epoch": 2.03, + "grad_norm": 6.019711301925271, + "learning_rate": 2.8498191674976196e-06, + "loss": 0.4631, + "step": 36065 + }, + { + "epoch": 2.03, + "grad_norm": 4.993576964009328, + "learning_rate": 2.8483395638950965e-06, + "loss": 0.4265, + "step": 36070 + }, + { + "epoch": 2.03, + "grad_norm": 4.984751673257927, + "learning_rate": 2.846860191497986e-06, + "loss": 0.4211, + "step": 36075 + }, + { + "epoch": 2.03, + "grad_norm": 5.662789550312592, + "learning_rate": 2.845381050465248e-06, + "loss": 0.429, + "step": 36080 + }, + { + "epoch": 2.03, + "grad_norm": 4.526399649423495, + "learning_rate": 2.8439021409558277e-06, + "loss": 0.4188, + "step": 36085 + }, + { + "epoch": 2.03, + "grad_norm": 4.279901147187739, + "learning_rate": 2.8424234631286353e-06, + "loss": 0.4272, + "step": 36090 + }, + { + "epoch": 2.03, + "grad_norm": 4.2157143690530186, + "learning_rate": 2.8409450171425644e-06, + "loss": 0.4148, + "step": 36095 + }, + { + "epoch": 2.03, + "grad_norm": 5.022163582817568, + "learning_rate": 2.8394668031564813e-06, + "loss": 0.4071, + "step": 36100 + }, + { + "epoch": 2.03, + "grad_norm": 5.2367445406550726, + "learning_rate": 2.837988821329224e-06, + "loss": 0.3917, + "step": 36105 + }, + { + "epoch": 2.03, + "grad_norm": 5.42671527299746, + "learning_rate": 2.8365110718196114e-06, + "loss": 0.4413, + "step": 36110 + }, + { + "epoch": 2.03, + "grad_norm": 5.099769921367586, + "learning_rate": 2.8350335547864306e-06, + "loss": 0.4097, + "step": 36115 + }, + { + "epoch": 2.04, + "grad_norm": 7.498264773431116, + "learning_rate": 2.8335562703884504e-06, + "loss": 0.4185, + "step": 36120 + }, + { + "epoch": 2.04, + "grad_norm": 5.818740377878313, + "learning_rate": 2.832079218784411e-06, + "loss": 0.418, + "step": 36125 + }, + { + "epoch": 2.04, + "grad_norm": 4.90611092781113, + "learning_rate": 2.8306024001330247e-06, + "loss": 0.4109, + "step": 36130 + }, + { + "epoch": 2.04, + "grad_norm": 4.657551988019308, + "learning_rate": 2.8291258145929866e-06, + "loss": 0.4104, + "step": 36135 + }, + { + "epoch": 2.04, + "grad_norm": 4.682180917766779, + "learning_rate": 2.8276494623229596e-06, + "loss": 0.4345, + "step": 36140 + }, + { + "epoch": 2.04, + "grad_norm": 6.47116196035868, + "learning_rate": 2.826173343481586e-06, + "loss": 0.4266, + "step": 36145 + }, + { + "epoch": 2.04, + "grad_norm": 10.294404794988088, + "learning_rate": 2.824697458227481e-06, + "loss": 0.4475, + "step": 36150 + }, + { + "epoch": 2.04, + "grad_norm": 8.68025229717112, + "learning_rate": 2.823221806719233e-06, + "loss": 0.4277, + "step": 36155 + }, + { + "epoch": 2.04, + "grad_norm": 8.643429093035008, + "learning_rate": 2.8217463891154096e-06, + "loss": 0.4266, + "step": 36160 + }, + { + "epoch": 2.04, + "grad_norm": 4.703612087835522, + "learning_rate": 2.820271205574549e-06, + "loss": 0.4008, + "step": 36165 + }, + { + "epoch": 2.04, + "grad_norm": 7.020606870142804, + "learning_rate": 2.8187962562551687e-06, + "loss": 0.4037, + "step": 36170 + }, + { + "epoch": 2.04, + "grad_norm": 5.732231467897349, + "learning_rate": 2.817321541315755e-06, + "loss": 0.4272, + "step": 36175 + }, + { + "epoch": 2.04, + "grad_norm": 4.801694588023581, + "learning_rate": 2.815847060914777e-06, + "loss": 0.4154, + "step": 36180 + }, + { + "epoch": 2.04, + "grad_norm": 4.7854836844884305, + "learning_rate": 2.8143728152106697e-06, + "loss": 0.4325, + "step": 36185 + }, + { + "epoch": 2.04, + "grad_norm": 5.9685100375024405, + "learning_rate": 2.8128988043618488e-06, + "loss": 0.475, + "step": 36190 + }, + { + "epoch": 2.04, + "grad_norm": 5.297093421081957, + "learning_rate": 2.8114250285267063e-06, + "loss": 0.4299, + "step": 36195 + }, + { + "epoch": 2.04, + "grad_norm": 5.360894529551778, + "learning_rate": 2.8099514878636014e-06, + "loss": 0.4259, + "step": 36200 + }, + { + "epoch": 2.04, + "grad_norm": 5.01379139379236, + "learning_rate": 2.808478182530876e-06, + "loss": 0.4363, + "step": 36205 + }, + { + "epoch": 2.04, + "grad_norm": 10.703639972534212, + "learning_rate": 2.8070051126868424e-06, + "loss": 0.4129, + "step": 36210 + }, + { + "epoch": 2.04, + "grad_norm": 4.466941149260079, + "learning_rate": 2.8055322784897854e-06, + "loss": 0.4035, + "step": 36215 + }, + { + "epoch": 2.04, + "grad_norm": 4.458612789550459, + "learning_rate": 2.8040596800979723e-06, + "loss": 0.4301, + "step": 36220 + }, + { + "epoch": 2.04, + "grad_norm": 4.199020870941774, + "learning_rate": 2.802587317669636e-06, + "loss": 0.4038, + "step": 36225 + }, + { + "epoch": 2.04, + "grad_norm": 4.404110959429534, + "learning_rate": 2.801115191362992e-06, + "loss": 0.4356, + "step": 36230 + }, + { + "epoch": 2.04, + "grad_norm": 4.252463521404466, + "learning_rate": 2.7996433013362256e-06, + "loss": 0.4067, + "step": 36235 + }, + { + "epoch": 2.04, + "grad_norm": 4.8637602019605035, + "learning_rate": 2.798171647747495e-06, + "loss": 0.4095, + "step": 36240 + }, + { + "epoch": 2.04, + "grad_norm": 4.712489662144385, + "learning_rate": 2.7967002307549406e-06, + "loss": 0.4049, + "step": 36245 + }, + { + "epoch": 2.04, + "grad_norm": 7.059442174944944, + "learning_rate": 2.7952290505166677e-06, + "loss": 0.4298, + "step": 36250 + }, + { + "epoch": 2.04, + "grad_norm": 4.788027891102387, + "learning_rate": 2.793758107190766e-06, + "loss": 0.4238, + "step": 36255 + }, + { + "epoch": 2.04, + "grad_norm": 6.483292394201127, + "learning_rate": 2.7922874009352908e-06, + "loss": 0.4098, + "step": 36260 + }, + { + "epoch": 2.04, + "grad_norm": 4.497815878063458, + "learning_rate": 2.79081693190828e-06, + "loss": 0.4158, + "step": 36265 + }, + { + "epoch": 2.04, + "grad_norm": 5.125556353120704, + "learning_rate": 2.7893467002677375e-06, + "loss": 0.4452, + "step": 36270 + }, + { + "epoch": 2.04, + "grad_norm": 4.554810986836358, + "learning_rate": 2.78787670617165e-06, + "loss": 0.3933, + "step": 36275 + }, + { + "epoch": 2.04, + "grad_norm": 5.364484430678394, + "learning_rate": 2.7864069497779716e-06, + "loss": 0.4755, + "step": 36280 + }, + { + "epoch": 2.04, + "grad_norm": 5.097687269323784, + "learning_rate": 2.784937431244636e-06, + "loss": 0.4409, + "step": 36285 + }, + { + "epoch": 2.04, + "grad_norm": 5.2833162297273555, + "learning_rate": 2.7834681507295507e-06, + "loss": 0.3931, + "step": 36290 + }, + { + "epoch": 2.05, + "grad_norm": 5.2170404685208815, + "learning_rate": 2.781999108390595e-06, + "loss": 0.4616, + "step": 36295 + }, + { + "epoch": 2.05, + "grad_norm": 5.328508502860927, + "learning_rate": 2.7805303043856225e-06, + "loss": 0.4197, + "step": 36300 + }, + { + "epoch": 2.05, + "grad_norm": 5.403230131051744, + "learning_rate": 2.7790617388724657e-06, + "loss": 0.4085, + "step": 36305 + }, + { + "epoch": 2.05, + "grad_norm": 5.404895228455706, + "learning_rate": 2.7775934120089245e-06, + "loss": 0.4304, + "step": 36310 + }, + { + "epoch": 2.05, + "grad_norm": 6.690811282782428, + "learning_rate": 2.776125323952782e-06, + "loss": 0.439, + "step": 36315 + }, + { + "epoch": 2.05, + "grad_norm": 5.470672483781449, + "learning_rate": 2.774657474861787e-06, + "loss": 0.4623, + "step": 36320 + }, + { + "epoch": 2.05, + "grad_norm": 8.62030239134579, + "learning_rate": 2.773189864893667e-06, + "loss": 0.411, + "step": 36325 + }, + { + "epoch": 2.05, + "grad_norm": 5.4739398209745955, + "learning_rate": 2.7717224942061248e-06, + "loss": 0.413, + "step": 36330 + }, + { + "epoch": 2.05, + "grad_norm": 8.823506594901438, + "learning_rate": 2.770255362956833e-06, + "loss": 0.4111, + "step": 36335 + }, + { + "epoch": 2.05, + "grad_norm": 4.7321082441764535, + "learning_rate": 2.768788471303445e-06, + "loss": 0.4027, + "step": 36340 + }, + { + "epoch": 2.05, + "grad_norm": 4.479022310462112, + "learning_rate": 2.7673218194035805e-06, + "loss": 0.4245, + "step": 36345 + }, + { + "epoch": 2.05, + "grad_norm": 4.398404333860972, + "learning_rate": 2.7658554074148426e-06, + "loss": 0.4026, + "step": 36350 + }, + { + "epoch": 2.05, + "grad_norm": 10.521729262212316, + "learning_rate": 2.764389235494801e-06, + "loss": 0.4111, + "step": 36355 + }, + { + "epoch": 2.05, + "grad_norm": 4.5946069384225785, + "learning_rate": 2.7629233038010007e-06, + "loss": 0.3934, + "step": 36360 + }, + { + "epoch": 2.05, + "grad_norm": 4.948986975965735, + "learning_rate": 2.761457612490964e-06, + "loss": 0.3975, + "step": 36365 + }, + { + "epoch": 2.05, + "grad_norm": 4.78800726522519, + "learning_rate": 2.759992161722188e-06, + "loss": 0.4083, + "step": 36370 + }, + { + "epoch": 2.05, + "grad_norm": 5.285161015457855, + "learning_rate": 2.758526951652139e-06, + "loss": 0.4657, + "step": 36375 + }, + { + "epoch": 2.05, + "grad_norm": 4.789030658528038, + "learning_rate": 2.757061982438263e-06, + "loss": 0.4205, + "step": 36380 + }, + { + "epoch": 2.05, + "grad_norm": 4.666308310236844, + "learning_rate": 2.755597254237974e-06, + "loss": 0.3834, + "step": 36385 + }, + { + "epoch": 2.05, + "grad_norm": 5.315779074073722, + "learning_rate": 2.754132767208667e-06, + "loss": 0.394, + "step": 36390 + }, + { + "epoch": 2.05, + "grad_norm": 6.199855913540895, + "learning_rate": 2.7526685215077043e-06, + "loss": 0.4019, + "step": 36395 + }, + { + "epoch": 2.05, + "grad_norm": 5.404346693967891, + "learning_rate": 2.7512045172924294e-06, + "loss": 0.4264, + "step": 36400 + }, + { + "epoch": 2.05, + "grad_norm": 10.028006412437586, + "learning_rate": 2.7497407547201516e-06, + "loss": 0.4334, + "step": 36405 + }, + { + "epoch": 2.05, + "grad_norm": 7.097449562431746, + "learning_rate": 2.7482772339481622e-06, + "loss": 0.3969, + "step": 36410 + }, + { + "epoch": 2.05, + "grad_norm": 9.02155729977443, + "learning_rate": 2.7468139551337224e-06, + "loss": 0.387, + "step": 36415 + }, + { + "epoch": 2.05, + "grad_norm": 4.7772469708864485, + "learning_rate": 2.745350918434065e-06, + "loss": 0.4279, + "step": 36420 + }, + { + "epoch": 2.05, + "grad_norm": 4.209891010891415, + "learning_rate": 2.743888124006404e-06, + "loss": 0.4104, + "step": 36425 + }, + { + "epoch": 2.05, + "grad_norm": 7.662079746116483, + "learning_rate": 2.7424255720079183e-06, + "loss": 0.4243, + "step": 36430 + }, + { + "epoch": 2.05, + "grad_norm": 7.17076897894065, + "learning_rate": 2.7409632625957712e-06, + "loss": 0.3811, + "step": 36435 + }, + { + "epoch": 2.05, + "grad_norm": 7.502074816601167, + "learning_rate": 2.739501195927091e-06, + "loss": 0.4217, + "step": 36440 + }, + { + "epoch": 2.05, + "grad_norm": 5.316906021402658, + "learning_rate": 2.738039372158982e-06, + "loss": 0.3941, + "step": 36445 + }, + { + "epoch": 2.05, + "grad_norm": 5.248481035019215, + "learning_rate": 2.736577791448527e-06, + "loss": 0.3969, + "step": 36450 + }, + { + "epoch": 2.05, + "grad_norm": 4.422605647751632, + "learning_rate": 2.7351164539527754e-06, + "loss": 0.4063, + "step": 36455 + }, + { + "epoch": 2.05, + "grad_norm": 5.664164693517471, + "learning_rate": 2.7336553598287567e-06, + "loss": 0.4254, + "step": 36460 + }, + { + "epoch": 2.05, + "grad_norm": 4.954230322342542, + "learning_rate": 2.7321945092334738e-06, + "loss": 0.4406, + "step": 36465 + }, + { + "epoch": 2.05, + "grad_norm": 4.6998602593014915, + "learning_rate": 2.7307339023238977e-06, + "loss": 0.4037, + "step": 36470 + }, + { + "epoch": 2.06, + "grad_norm": 5.326544427509304, + "learning_rate": 2.72927353925698e-06, + "loss": 0.4257, + "step": 36475 + }, + { + "epoch": 2.06, + "grad_norm": 4.667641852904444, + "learning_rate": 2.7278134201896413e-06, + "loss": 0.4291, + "step": 36480 + }, + { + "epoch": 2.06, + "grad_norm": 4.631164303783946, + "learning_rate": 2.726353545278779e-06, + "loss": 0.3805, + "step": 36485 + }, + { + "epoch": 2.06, + "grad_norm": 5.366719163763919, + "learning_rate": 2.724893914681262e-06, + "loss": 0.4486, + "step": 36490 + }, + { + "epoch": 2.06, + "grad_norm": 5.199164694763938, + "learning_rate": 2.7234345285539358e-06, + "loss": 0.4313, + "step": 36495 + }, + { + "epoch": 2.06, + "grad_norm": 4.664584444248726, + "learning_rate": 2.721975387053617e-06, + "loss": 0.3929, + "step": 36500 + }, + { + "epoch": 2.06, + "grad_norm": 6.528675809863526, + "learning_rate": 2.720516490337095e-06, + "loss": 0.4261, + "step": 36505 + }, + { + "epoch": 2.06, + "grad_norm": 7.344196832492743, + "learning_rate": 2.719057838561138e-06, + "loss": 0.4205, + "step": 36510 + }, + { + "epoch": 2.06, + "grad_norm": 5.44046019834484, + "learning_rate": 2.7175994318824806e-06, + "loss": 0.4308, + "step": 36515 + }, + { + "epoch": 2.06, + "grad_norm": 5.811771109033519, + "learning_rate": 2.716141270457839e-06, + "loss": 0.4422, + "step": 36520 + }, + { + "epoch": 2.06, + "grad_norm": 4.453069511352081, + "learning_rate": 2.7146833544438978e-06, + "loss": 0.4254, + "step": 36525 + }, + { + "epoch": 2.06, + "grad_norm": 5.355192848197981, + "learning_rate": 2.7132256839973138e-06, + "loss": 0.4278, + "step": 36530 + }, + { + "epoch": 2.06, + "grad_norm": 4.749061141858274, + "learning_rate": 2.711768259274724e-06, + "loss": 0.3878, + "step": 36535 + }, + { + "epoch": 2.06, + "grad_norm": 5.146496471026979, + "learning_rate": 2.710311080432732e-06, + "loss": 0.406, + "step": 36540 + }, + { + "epoch": 2.06, + "grad_norm": 5.329809819300412, + "learning_rate": 2.708854147627922e-06, + "loss": 0.4333, + "step": 36545 + }, + { + "epoch": 2.06, + "grad_norm": 5.2889672549764075, + "learning_rate": 2.7073974610168425e-06, + "loss": 0.4211, + "step": 36550 + }, + { + "epoch": 2.06, + "grad_norm": 6.2556861521690115, + "learning_rate": 2.7059410207560243e-06, + "loss": 0.4092, + "step": 36555 + }, + { + "epoch": 2.06, + "grad_norm": 4.9084648181916455, + "learning_rate": 2.7044848270019696e-06, + "loss": 0.403, + "step": 36560 + }, + { + "epoch": 2.06, + "grad_norm": 4.94150457712519, + "learning_rate": 2.703028879911149e-06, + "loss": 0.425, + "step": 36565 + }, + { + "epoch": 2.06, + "grad_norm": 4.661307617645895, + "learning_rate": 2.701573179640015e-06, + "loss": 0.4346, + "step": 36570 + }, + { + "epoch": 2.06, + "grad_norm": 4.768069120396337, + "learning_rate": 2.700117726344985e-06, + "loss": 0.414, + "step": 36575 + }, + { + "epoch": 2.06, + "grad_norm": 4.625248390632645, + "learning_rate": 2.698662520182457e-06, + "loss": 0.4168, + "step": 36580 + }, + { + "epoch": 2.06, + "grad_norm": 13.503170606133523, + "learning_rate": 2.6972075613087973e-06, + "loss": 0.4097, + "step": 36585 + }, + { + "epoch": 2.06, + "grad_norm": 5.130839794364503, + "learning_rate": 2.6957528498803464e-06, + "loss": 0.3994, + "step": 36590 + }, + { + "epoch": 2.06, + "grad_norm": 6.453089650499444, + "learning_rate": 2.6942983860534233e-06, + "loss": 0.4343, + "step": 36595 + }, + { + "epoch": 2.06, + "grad_norm": 7.799107495942954, + "learning_rate": 2.6928441699843127e-06, + "loss": 0.4211, + "step": 36600 + }, + { + "epoch": 2.06, + "grad_norm": 4.536252858238333, + "learning_rate": 2.6913902018292805e-06, + "loss": 0.4248, + "step": 36605 + }, + { + "epoch": 2.06, + "grad_norm": 5.646444414108311, + "learning_rate": 2.689936481744559e-06, + "loss": 0.4368, + "step": 36610 + }, + { + "epoch": 2.06, + "grad_norm": 5.89258333206394, + "learning_rate": 2.688483009886356e-06, + "loss": 0.3952, + "step": 36615 + }, + { + "epoch": 2.06, + "grad_norm": 5.0508522257959765, + "learning_rate": 2.6870297864108573e-06, + "loss": 0.4267, + "step": 36620 + }, + { + "epoch": 2.06, + "grad_norm": 4.676194204241176, + "learning_rate": 2.6855768114742135e-06, + "loss": 0.3846, + "step": 36625 + }, + { + "epoch": 2.06, + "grad_norm": 4.993402856958031, + "learning_rate": 2.684124085232558e-06, + "loss": 0.4624, + "step": 36630 + }, + { + "epoch": 2.06, + "grad_norm": 4.859789286184354, + "learning_rate": 2.6826716078419878e-06, + "loss": 0.4146, + "step": 36635 + }, + { + "epoch": 2.06, + "grad_norm": 10.109306457052867, + "learning_rate": 2.6812193794585827e-06, + "loss": 0.4041, + "step": 36640 + }, + { + "epoch": 2.06, + "grad_norm": 9.29970708060734, + "learning_rate": 2.6797674002383877e-06, + "loss": 0.4212, + "step": 36645 + }, + { + "epoch": 2.07, + "grad_norm": 7.516928644700587, + "learning_rate": 2.6783156703374246e-06, + "loss": 0.3952, + "step": 36650 + }, + { + "epoch": 2.07, + "grad_norm": 15.648332275307643, + "learning_rate": 2.676864189911691e-06, + "loss": 0.417, + "step": 36655 + }, + { + "epoch": 2.07, + "grad_norm": 4.9122091542143425, + "learning_rate": 2.675412959117152e-06, + "loss": 0.427, + "step": 36660 + }, + { + "epoch": 2.07, + "grad_norm": 4.645537064434362, + "learning_rate": 2.67396197810975e-06, + "loss": 0.4124, + "step": 36665 + }, + { + "epoch": 2.07, + "grad_norm": 5.503849013747944, + "learning_rate": 2.6725112470454006e-06, + "loss": 0.4048, + "step": 36670 + }, + { + "epoch": 2.07, + "grad_norm": 5.350843677064242, + "learning_rate": 2.671060766079987e-06, + "loss": 0.3947, + "step": 36675 + }, + { + "epoch": 2.07, + "grad_norm": 9.492872940290535, + "learning_rate": 2.6696105353693746e-06, + "loss": 0.4289, + "step": 36680 + }, + { + "epoch": 2.07, + "grad_norm": 5.955805009195729, + "learning_rate": 2.6681605550693935e-06, + "loss": 0.4201, + "step": 36685 + }, + { + "epoch": 2.07, + "grad_norm": 4.411660138052263, + "learning_rate": 2.6667108253358532e-06, + "loss": 0.3996, + "step": 36690 + }, + { + "epoch": 2.07, + "grad_norm": 4.780222465525253, + "learning_rate": 2.6652613463245325e-06, + "loss": 0.4146, + "step": 36695 + }, + { + "epoch": 2.07, + "grad_norm": 7.6609998734171, + "learning_rate": 2.6638121181911825e-06, + "loss": 0.3924, + "step": 36700 + }, + { + "epoch": 2.07, + "grad_norm": 5.0917966203365355, + "learning_rate": 2.662363141091532e-06, + "loss": 0.4242, + "step": 36705 + }, + { + "epoch": 2.07, + "grad_norm": 5.000887366140905, + "learning_rate": 2.660914415181277e-06, + "loss": 0.455, + "step": 36710 + }, + { + "epoch": 2.07, + "grad_norm": 4.598953488178567, + "learning_rate": 2.659465940616094e-06, + "loss": 0.4115, + "step": 36715 + }, + { + "epoch": 2.07, + "grad_norm": 5.2242195689340205, + "learning_rate": 2.6580177175516243e-06, + "loss": 0.3939, + "step": 36720 + }, + { + "epoch": 2.07, + "grad_norm": 4.859897513620613, + "learning_rate": 2.656569746143485e-06, + "loss": 0.3936, + "step": 36725 + }, + { + "epoch": 2.07, + "grad_norm": 4.996479697641248, + "learning_rate": 2.6551220265472695e-06, + "loss": 0.4305, + "step": 36730 + }, + { + "epoch": 2.07, + "grad_norm": 4.850121515215442, + "learning_rate": 2.6536745589185424e-06, + "loss": 0.3987, + "step": 36735 + }, + { + "epoch": 2.07, + "grad_norm": 5.570310442510414, + "learning_rate": 2.6522273434128375e-06, + "loss": 0.4228, + "step": 36740 + }, + { + "epoch": 2.07, + "grad_norm": 4.537257533717401, + "learning_rate": 2.650780380185666e-06, + "loss": 0.3883, + "step": 36745 + }, + { + "epoch": 2.07, + "grad_norm": 4.713636000212951, + "learning_rate": 2.649333669392512e-06, + "loss": 0.4405, + "step": 36750 + }, + { + "epoch": 2.07, + "grad_norm": 5.168481118979662, + "learning_rate": 2.64788721118883e-06, + "loss": 0.41, + "step": 36755 + }, + { + "epoch": 2.07, + "grad_norm": 4.695367622300172, + "learning_rate": 2.6464410057300456e-06, + "loss": 0.4189, + "step": 36760 + }, + { + "epoch": 2.07, + "grad_norm": 5.115583168043013, + "learning_rate": 2.6449950531715642e-06, + "loss": 0.3903, + "step": 36765 + }, + { + "epoch": 2.07, + "grad_norm": 4.817883421548684, + "learning_rate": 2.6435493536687558e-06, + "loss": 0.4011, + "step": 36770 + }, + { + "epoch": 2.07, + "grad_norm": 9.822750552321125, + "learning_rate": 2.642103907376971e-06, + "loss": 0.3957, + "step": 36775 + }, + { + "epoch": 2.07, + "grad_norm": 7.088921083813334, + "learning_rate": 2.6406587144515278e-06, + "loss": 0.427, + "step": 36780 + }, + { + "epoch": 2.07, + "grad_norm": 5.408250344342435, + "learning_rate": 2.639213775047716e-06, + "loss": 0.3833, + "step": 36785 + }, + { + "epoch": 2.07, + "grad_norm": 4.635394882245829, + "learning_rate": 2.6377690893208052e-06, + "loss": 0.4183, + "step": 36790 + }, + { + "epoch": 2.07, + "grad_norm": 4.769981995173298, + "learning_rate": 2.6363246574260292e-06, + "loss": 0.4261, + "step": 36795 + }, + { + "epoch": 2.07, + "grad_norm": 4.9216904514130055, + "learning_rate": 2.6348804795186023e-06, + "loss": 0.4237, + "step": 36800 + }, + { + "epoch": 2.07, + "grad_norm": 6.286144774587911, + "learning_rate": 2.633436555753706e-06, + "loss": 0.4218, + "step": 36805 + }, + { + "epoch": 2.07, + "grad_norm": 4.3700808677456395, + "learning_rate": 2.6319928862864943e-06, + "loss": 0.4194, + "step": 36810 + }, + { + "epoch": 2.07, + "grad_norm": 4.604255316573414, + "learning_rate": 2.6305494712720992e-06, + "loss": 0.4314, + "step": 36815 + }, + { + "epoch": 2.07, + "grad_norm": 4.596443876762925, + "learning_rate": 2.6291063108656196e-06, + "loss": 0.4069, + "step": 36820 + }, + { + "epoch": 2.07, + "grad_norm": 4.650878870583402, + "learning_rate": 2.6276634052221293e-06, + "loss": 0.4245, + "step": 36825 + }, + { + "epoch": 2.08, + "grad_norm": 4.732009022040906, + "learning_rate": 2.626220754496679e-06, + "loss": 0.3897, + "step": 36830 + }, + { + "epoch": 2.08, + "grad_norm": 7.94193684168628, + "learning_rate": 2.6247783588442825e-06, + "loss": 0.4425, + "step": 36835 + }, + { + "epoch": 2.08, + "grad_norm": 6.774766278576813, + "learning_rate": 2.6233362184199363e-06, + "loss": 0.4183, + "step": 36840 + }, + { + "epoch": 2.08, + "grad_norm": 7.584948319885389, + "learning_rate": 2.6218943333786e-06, + "loss": 0.4111, + "step": 36845 + }, + { + "epoch": 2.08, + "grad_norm": 7.931391197413627, + "learning_rate": 2.6204527038752152e-06, + "loss": 0.3966, + "step": 36850 + }, + { + "epoch": 2.08, + "grad_norm": 5.240101305107887, + "learning_rate": 2.6190113300646867e-06, + "loss": 0.46, + "step": 36855 + }, + { + "epoch": 2.08, + "grad_norm": 5.804345690368732, + "learning_rate": 2.617570212101902e-06, + "loss": 0.4524, + "step": 36860 + }, + { + "epoch": 2.08, + "grad_norm": 4.5173813188418865, + "learning_rate": 2.6161293501417118e-06, + "loss": 0.4205, + "step": 36865 + }, + { + "epoch": 2.08, + "grad_norm": 5.117152358658561, + "learning_rate": 2.6146887443389415e-06, + "loss": 0.4095, + "step": 36870 + }, + { + "epoch": 2.08, + "grad_norm": 4.774960543724538, + "learning_rate": 2.613248394848395e-06, + "loss": 0.3893, + "step": 36875 + }, + { + "epoch": 2.08, + "grad_norm": 4.827645382677662, + "learning_rate": 2.6118083018248395e-06, + "loss": 0.4253, + "step": 36880 + }, + { + "epoch": 2.08, + "grad_norm": 5.2976688080844445, + "learning_rate": 2.610368465423025e-06, + "loss": 0.372, + "step": 36885 + }, + { + "epoch": 2.08, + "grad_norm": 5.370441791084755, + "learning_rate": 2.6089288857976626e-06, + "loss": 0.446, + "step": 36890 + }, + { + "epoch": 2.08, + "grad_norm": 7.0315887646897375, + "learning_rate": 2.607489563103446e-06, + "loss": 0.3992, + "step": 36895 + }, + { + "epoch": 2.08, + "grad_norm": 6.172398603506704, + "learning_rate": 2.606050497495034e-06, + "loss": 0.4093, + "step": 36900 + }, + { + "epoch": 2.08, + "grad_norm": 4.777776860692738, + "learning_rate": 2.604611689127061e-06, + "loss": 0.4152, + "step": 36905 + }, + { + "epoch": 2.08, + "grad_norm": 5.475064907492985, + "learning_rate": 2.6031731381541343e-06, + "loss": 0.396, + "step": 36910 + }, + { + "epoch": 2.08, + "grad_norm": 7.097735557080617, + "learning_rate": 2.60173484473083e-06, + "loss": 0.4196, + "step": 36915 + }, + { + "epoch": 2.08, + "grad_norm": 5.27801905171361, + "learning_rate": 2.600296809011702e-06, + "loss": 0.4166, + "step": 36920 + }, + { + "epoch": 2.08, + "grad_norm": 7.261139695024201, + "learning_rate": 2.5988590311512737e-06, + "loss": 0.4076, + "step": 36925 + }, + { + "epoch": 2.08, + "grad_norm": 5.1881015452394035, + "learning_rate": 2.597421511304038e-06, + "loss": 0.4464, + "step": 36930 + }, + { + "epoch": 2.08, + "grad_norm": 4.988799242420278, + "learning_rate": 2.595984249624466e-06, + "loss": 0.3836, + "step": 36935 + }, + { + "epoch": 2.08, + "grad_norm": 4.5124362999881855, + "learning_rate": 2.594547246266994e-06, + "loss": 0.3913, + "step": 36940 + }, + { + "epoch": 2.08, + "grad_norm": 5.772434281021417, + "learning_rate": 2.5931105013860387e-06, + "loss": 0.4292, + "step": 36945 + }, + { + "epoch": 2.08, + "grad_norm": 5.003772279738226, + "learning_rate": 2.5916740151359822e-06, + "loss": 0.3996, + "step": 36950 + }, + { + "epoch": 2.08, + "grad_norm": 7.480029381711743, + "learning_rate": 2.5902377876711793e-06, + "loss": 0.4008, + "step": 36955 + }, + { + "epoch": 2.08, + "grad_norm": 4.289919052430077, + "learning_rate": 2.588801819145964e-06, + "loss": 0.4155, + "step": 36960 + }, + { + "epoch": 2.08, + "grad_norm": 5.714719928751562, + "learning_rate": 2.5873661097146317e-06, + "loss": 0.4115, + "step": 36965 + }, + { + "epoch": 2.08, + "grad_norm": 8.071001850463855, + "learning_rate": 2.5859306595314615e-06, + "loss": 0.4096, + "step": 36970 + }, + { + "epoch": 2.08, + "grad_norm": 5.155942230349109, + "learning_rate": 2.584495468750694e-06, + "loss": 0.4296, + "step": 36975 + }, + { + "epoch": 2.08, + "grad_norm": 4.429951845304517, + "learning_rate": 2.5830605375265506e-06, + "loss": 0.4276, + "step": 36980 + }, + { + "epoch": 2.08, + "grad_norm": 4.646532014474953, + "learning_rate": 2.581625866013219e-06, + "loss": 0.4019, + "step": 36985 + }, + { + "epoch": 2.08, + "grad_norm": 5.650484496248696, + "learning_rate": 2.58019145436486e-06, + "loss": 0.3936, + "step": 36990 + }, + { + "epoch": 2.08, + "grad_norm": 5.799052121226185, + "learning_rate": 2.5787573027356093e-06, + "loss": 0.4007, + "step": 36995 + }, + { + "epoch": 2.08, + "grad_norm": 4.79576543933088, + "learning_rate": 2.577323411279571e-06, + "loss": 0.4142, + "step": 37000 + }, + { + "epoch": 2.09, + "grad_norm": 6.246577989215455, + "learning_rate": 2.5758897801508265e-06, + "loss": 0.429, + "step": 37005 + }, + { + "epoch": 2.09, + "grad_norm": 4.696300069976304, + "learning_rate": 2.5744564095034207e-06, + "loss": 0.4128, + "step": 37010 + }, + { + "epoch": 2.09, + "grad_norm": 4.704977450684182, + "learning_rate": 2.5730232994913796e-06, + "loss": 0.4211, + "step": 37015 + }, + { + "epoch": 2.09, + "grad_norm": 15.250893250387021, + "learning_rate": 2.5715904502686974e-06, + "loss": 0.4333, + "step": 37020 + }, + { + "epoch": 2.09, + "grad_norm": 7.173325980955074, + "learning_rate": 2.5701578619893365e-06, + "loss": 0.4425, + "step": 37025 + }, + { + "epoch": 2.09, + "grad_norm": 5.08917208537564, + "learning_rate": 2.5687255348072395e-06, + "loss": 0.402, + "step": 37030 + }, + { + "epoch": 2.09, + "grad_norm": 4.392746900117666, + "learning_rate": 2.5672934688763127e-06, + "loss": 0.4284, + "step": 37035 + }, + { + "epoch": 2.09, + "grad_norm": 4.975233371641439, + "learning_rate": 2.5658616643504384e-06, + "loss": 0.3635, + "step": 37040 + }, + { + "epoch": 2.09, + "grad_norm": 6.500467969019714, + "learning_rate": 2.564430121383472e-06, + "loss": 0.4297, + "step": 37045 + }, + { + "epoch": 2.09, + "grad_norm": 4.709001829842048, + "learning_rate": 2.562998840129237e-06, + "loss": 0.3936, + "step": 37050 + }, + { + "epoch": 2.09, + "grad_norm": 4.8911817075677675, + "learning_rate": 2.5615678207415328e-06, + "loss": 0.4346, + "step": 37055 + }, + { + "epoch": 2.09, + "grad_norm": 4.844222912997217, + "learning_rate": 2.560137063374127e-06, + "loss": 0.3901, + "step": 37060 + }, + { + "epoch": 2.09, + "grad_norm": 4.50211780390919, + "learning_rate": 2.558706568180763e-06, + "loss": 0.3716, + "step": 37065 + }, + { + "epoch": 2.09, + "grad_norm": 4.859549667242615, + "learning_rate": 2.557276335315153e-06, + "loss": 0.4038, + "step": 37070 + }, + { + "epoch": 2.09, + "grad_norm": 10.992798825060795, + "learning_rate": 2.5558463649309784e-06, + "loss": 0.3991, + "step": 37075 + }, + { + "epoch": 2.09, + "grad_norm": 13.036823403276468, + "learning_rate": 2.554416657181902e-06, + "loss": 0.4323, + "step": 37080 + }, + { + "epoch": 2.09, + "grad_norm": 6.687095730396531, + "learning_rate": 2.552987212221546e-06, + "loss": 0.423, + "step": 37085 + }, + { + "epoch": 2.09, + "grad_norm": 6.738133789390756, + "learning_rate": 2.551558030203516e-06, + "loss": 0.4328, + "step": 37090 + }, + { + "epoch": 2.09, + "grad_norm": 5.246046285127597, + "learning_rate": 2.550129111281379e-06, + "loss": 0.4204, + "step": 37095 + }, + { + "epoch": 2.09, + "grad_norm": 4.6214212706319175, + "learning_rate": 2.5487004556086837e-06, + "loss": 0.4274, + "step": 37100 + }, + { + "epoch": 2.09, + "grad_norm": 4.393163629155765, + "learning_rate": 2.5472720633389404e-06, + "loss": 0.3987, + "step": 37105 + }, + { + "epoch": 2.09, + "grad_norm": 5.334542286236737, + "learning_rate": 2.545843934625639e-06, + "loss": 0.4213, + "step": 37110 + }, + { + "epoch": 2.09, + "grad_norm": 4.812260206538378, + "learning_rate": 2.5444160696222396e-06, + "loss": 0.4059, + "step": 37115 + }, + { + "epoch": 2.09, + "grad_norm": 4.436021657861293, + "learning_rate": 2.5429884684821703e-06, + "loss": 0.398, + "step": 37120 + }, + { + "epoch": 2.09, + "grad_norm": 8.027533356901605, + "learning_rate": 2.541561131358833e-06, + "loss": 0.4311, + "step": 37125 + }, + { + "epoch": 2.09, + "grad_norm": 4.786042334603683, + "learning_rate": 2.5401340584056027e-06, + "loss": 0.4231, + "step": 37130 + }, + { + "epoch": 2.09, + "grad_norm": 5.065005851699361, + "learning_rate": 2.5387072497758223e-06, + "loss": 0.428, + "step": 37135 + }, + { + "epoch": 2.09, + "grad_norm": 5.9724143256521005, + "learning_rate": 2.537280705622813e-06, + "loss": 0.3846, + "step": 37140 + }, + { + "epoch": 2.09, + "grad_norm": 10.114992931360732, + "learning_rate": 2.5358544260998586e-06, + "loss": 0.4211, + "step": 37145 + }, + { + "epoch": 2.09, + "grad_norm": 11.322275942712595, + "learning_rate": 2.534428411360223e-06, + "loss": 0.4235, + "step": 37150 + }, + { + "epoch": 2.09, + "grad_norm": 13.50726036133838, + "learning_rate": 2.533002661557136e-06, + "loss": 0.3908, + "step": 37155 + }, + { + "epoch": 2.09, + "grad_norm": 10.274073228679576, + "learning_rate": 2.5315771768437992e-06, + "loss": 0.3499, + "step": 37160 + }, + { + "epoch": 2.09, + "grad_norm": 9.31233599695848, + "learning_rate": 2.53015195737339e-06, + "loss": 0.4161, + "step": 37165 + }, + { + "epoch": 2.09, + "grad_norm": 12.790035088998128, + "learning_rate": 2.528727003299051e-06, + "loss": 0.4346, + "step": 37170 + }, + { + "epoch": 2.09, + "grad_norm": 9.223004381797734, + "learning_rate": 2.5273023147739053e-06, + "loss": 0.4195, + "step": 37175 + }, + { + "epoch": 2.09, + "grad_norm": 8.547919830178392, + "learning_rate": 2.5258778919510374e-06, + "loss": 0.4159, + "step": 37180 + }, + { + "epoch": 2.1, + "grad_norm": 11.777535512748289, + "learning_rate": 2.524453734983508e-06, + "loss": 0.3997, + "step": 37185 + }, + { + "epoch": 2.1, + "grad_norm": 4.944035923509637, + "learning_rate": 2.52302984402435e-06, + "loss": 0.4036, + "step": 37190 + }, + { + "epoch": 2.1, + "grad_norm": 7.355378288620898, + "learning_rate": 2.521606219226569e-06, + "loss": 0.4121, + "step": 37195 + }, + { + "epoch": 2.1, + "grad_norm": 5.7722295476357255, + "learning_rate": 2.5201828607431356e-06, + "loss": 0.4185, + "step": 37200 + }, + { + "epoch": 2.1, + "grad_norm": 6.97315776632713, + "learning_rate": 2.5187597687270004e-06, + "loss": 0.4339, + "step": 37205 + }, + { + "epoch": 2.1, + "grad_norm": 5.217279868258032, + "learning_rate": 2.517336943331076e-06, + "loss": 0.3967, + "step": 37210 + }, + { + "epoch": 2.1, + "grad_norm": 5.699923750449226, + "learning_rate": 2.5159143847082563e-06, + "loss": 0.4322, + "step": 37215 + }, + { + "epoch": 2.1, + "grad_norm": 8.261317614190745, + "learning_rate": 2.514492093011397e-06, + "loss": 0.4378, + "step": 37220 + }, + { + "epoch": 2.1, + "grad_norm": 8.93734667847287, + "learning_rate": 2.5130700683933333e-06, + "loss": 0.4149, + "step": 37225 + }, + { + "epoch": 2.1, + "grad_norm": 7.651863631151353, + "learning_rate": 2.511648311006865e-06, + "loss": 0.4272, + "step": 37230 + }, + { + "epoch": 2.1, + "grad_norm": 4.689396472362099, + "learning_rate": 2.510226821004769e-06, + "loss": 0.4158, + "step": 37235 + }, + { + "epoch": 2.1, + "grad_norm": 4.885267570508475, + "learning_rate": 2.5088055985397897e-06, + "loss": 0.4226, + "step": 37240 + }, + { + "epoch": 2.1, + "grad_norm": 5.22027601351842, + "learning_rate": 2.5073846437646414e-06, + "loss": 0.4306, + "step": 37245 + }, + { + "epoch": 2.1, + "grad_norm": 4.816526156854411, + "learning_rate": 2.505963956832016e-06, + "loss": 0.3883, + "step": 37250 + }, + { + "epoch": 2.1, + "grad_norm": 4.627145852240641, + "learning_rate": 2.504543537894568e-06, + "loss": 0.4228, + "step": 37255 + }, + { + "epoch": 2.1, + "grad_norm": 5.039465025094866, + "learning_rate": 2.503123387104932e-06, + "loss": 0.4077, + "step": 37260 + }, + { + "epoch": 2.1, + "grad_norm": 7.713766621382293, + "learning_rate": 2.5017035046157084e-06, + "loss": 0.422, + "step": 37265 + }, + { + "epoch": 2.1, + "grad_norm": 5.186512509520176, + "learning_rate": 2.500283890579467e-06, + "loss": 0.4163, + "step": 37270 + }, + { + "epoch": 2.1, + "grad_norm": 7.724030994072304, + "learning_rate": 2.498864545148753e-06, + "loss": 0.4196, + "step": 37275 + }, + { + "epoch": 2.1, + "grad_norm": 5.8461477265876605, + "learning_rate": 2.497445468476084e-06, + "loss": 0.4408, + "step": 37280 + }, + { + "epoch": 2.1, + "grad_norm": 4.586900027130816, + "learning_rate": 2.4960266607139425e-06, + "loss": 0.4095, + "step": 37285 + }, + { + "epoch": 2.1, + "grad_norm": 4.413684011014718, + "learning_rate": 2.4946081220147873e-06, + "loss": 0.4099, + "step": 37290 + }, + { + "epoch": 2.1, + "grad_norm": 4.252677108533727, + "learning_rate": 2.493189852531048e-06, + "loss": 0.4042, + "step": 37295 + }, + { + "epoch": 2.1, + "grad_norm": 4.781175451176674, + "learning_rate": 2.4917718524151223e-06, + "loss": 0.38, + "step": 37300 + }, + { + "epoch": 2.1, + "grad_norm": 5.735017657140206, + "learning_rate": 2.4903541218193793e-06, + "loss": 0.416, + "step": 37305 + }, + { + "epoch": 2.1, + "grad_norm": 5.225304668419313, + "learning_rate": 2.4889366608961634e-06, + "loss": 0.3999, + "step": 37310 + }, + { + "epoch": 2.1, + "grad_norm": 4.508559695575597, + "learning_rate": 2.487519469797784e-06, + "loss": 0.4062, + "step": 37315 + }, + { + "epoch": 2.1, + "grad_norm": 4.9961693946487875, + "learning_rate": 2.486102548676527e-06, + "loss": 0.4096, + "step": 37320 + }, + { + "epoch": 2.1, + "grad_norm": 5.084391144246881, + "learning_rate": 2.4846858976846467e-06, + "loss": 0.4193, + "step": 37325 + }, + { + "epoch": 2.1, + "grad_norm": 5.225517806011373, + "learning_rate": 2.4832695169743653e-06, + "loss": 0.3871, + "step": 37330 + }, + { + "epoch": 2.1, + "grad_norm": 4.948625096236599, + "learning_rate": 2.4818534066978832e-06, + "loss": 0.4008, + "step": 37335 + }, + { + "epoch": 2.1, + "grad_norm": 7.298715459546129, + "learning_rate": 2.480437567007365e-06, + "loss": 0.4288, + "step": 37340 + }, + { + "epoch": 2.1, + "grad_norm": 4.883682202729534, + "learning_rate": 2.4790219980549507e-06, + "loss": 0.4412, + "step": 37345 + }, + { + "epoch": 2.1, + "grad_norm": 5.0333951597344, + "learning_rate": 2.4776066999927494e-06, + "loss": 0.4204, + "step": 37350 + }, + { + "epoch": 2.1, + "grad_norm": 4.564161171379566, + "learning_rate": 2.476191672972838e-06, + "loss": 0.4022, + "step": 37355 + }, + { + "epoch": 2.11, + "grad_norm": 4.766537371421515, + "learning_rate": 2.474776917147272e-06, + "loss": 0.3914, + "step": 37360 + }, + { + "epoch": 2.11, + "grad_norm": 4.695060175376409, + "learning_rate": 2.473362432668069e-06, + "loss": 0.4184, + "step": 37365 + }, + { + "epoch": 2.11, + "grad_norm": 4.598763837265946, + "learning_rate": 2.471948219687223e-06, + "loss": 0.3979, + "step": 37370 + }, + { + "epoch": 2.11, + "grad_norm": 5.112981818948408, + "learning_rate": 2.4705342783567003e-06, + "loss": 0.4181, + "step": 37375 + }, + { + "epoch": 2.11, + "grad_norm": 5.515737409026771, + "learning_rate": 2.469120608828431e-06, + "loss": 0.404, + "step": 37380 + }, + { + "epoch": 2.11, + "grad_norm": 5.525117709831719, + "learning_rate": 2.467707211254323e-06, + "loss": 0.4391, + "step": 37385 + }, + { + "epoch": 2.11, + "grad_norm": 4.854523664756024, + "learning_rate": 2.46629408578625e-06, + "loss": 0.3884, + "step": 37390 + }, + { + "epoch": 2.11, + "grad_norm": 4.590883321139548, + "learning_rate": 2.4648812325760608e-06, + "loss": 0.4253, + "step": 37395 + }, + { + "epoch": 2.11, + "grad_norm": 6.241124085958273, + "learning_rate": 2.46346865177557e-06, + "loss": 0.4026, + "step": 37400 + }, + { + "epoch": 2.11, + "grad_norm": 4.855272957871328, + "learning_rate": 2.462056343536569e-06, + "loss": 0.4068, + "step": 37405 + }, + { + "epoch": 2.11, + "grad_norm": 5.730198366331148, + "learning_rate": 2.460644308010815e-06, + "loss": 0.3955, + "step": 37410 + }, + { + "epoch": 2.11, + "grad_norm": 4.361828360653933, + "learning_rate": 2.459232545350035e-06, + "loss": 0.3978, + "step": 37415 + }, + { + "epoch": 2.11, + "grad_norm": 5.1829538351207916, + "learning_rate": 2.4578210557059328e-06, + "loss": 0.4004, + "step": 37420 + }, + { + "epoch": 2.11, + "grad_norm": 5.580731984937707, + "learning_rate": 2.4564098392301762e-06, + "loss": 0.3872, + "step": 37425 + }, + { + "epoch": 2.11, + "grad_norm": 4.985120921495895, + "learning_rate": 2.4549988960744093e-06, + "loss": 0.4208, + "step": 37430 + }, + { + "epoch": 2.11, + "grad_norm": 4.6656444645193265, + "learning_rate": 2.4535882263902437e-06, + "loss": 0.3997, + "step": 37435 + }, + { + "epoch": 2.11, + "grad_norm": 5.185878999584126, + "learning_rate": 2.4521778303292593e-06, + "loss": 0.4112, + "step": 37440 + }, + { + "epoch": 2.11, + "grad_norm": 5.330915851906834, + "learning_rate": 2.4507677080430133e-06, + "loss": 0.4037, + "step": 37445 + }, + { + "epoch": 2.11, + "grad_norm": 4.812815108654893, + "learning_rate": 2.4493578596830253e-06, + "loss": 0.4251, + "step": 37450 + }, + { + "epoch": 2.11, + "grad_norm": 5.400125989976663, + "learning_rate": 2.447948285400795e-06, + "loss": 0.4144, + "step": 37455 + }, + { + "epoch": 2.11, + "grad_norm": 4.540735588095793, + "learning_rate": 2.4465389853477816e-06, + "loss": 0.4116, + "step": 37460 + }, + { + "epoch": 2.11, + "grad_norm": 4.474065047876905, + "learning_rate": 2.445129959675424e-06, + "loss": 0.3876, + "step": 37465 + }, + { + "epoch": 2.11, + "grad_norm": 5.283110997189188, + "learning_rate": 2.44372120853513e-06, + "loss": 0.4236, + "step": 37470 + }, + { + "epoch": 2.11, + "grad_norm": 4.935817852491185, + "learning_rate": 2.4423127320782713e-06, + "loss": 0.4063, + "step": 37475 + }, + { + "epoch": 2.11, + "grad_norm": 6.497913000376273, + "learning_rate": 2.4409045304561996e-06, + "loss": 0.4135, + "step": 37480 + }, + { + "epoch": 2.11, + "grad_norm": 5.066740557736832, + "learning_rate": 2.4394966038202283e-06, + "loss": 0.4357, + "step": 37485 + }, + { + "epoch": 2.11, + "grad_norm": 5.972230265299325, + "learning_rate": 2.4380889523216495e-06, + "loss": 0.4158, + "step": 37490 + }, + { + "epoch": 2.11, + "grad_norm": 7.318528186475073, + "learning_rate": 2.436681576111719e-06, + "loss": 0.4304, + "step": 37495 + }, + { + "epoch": 2.11, + "grad_norm": 7.279422060732313, + "learning_rate": 2.435274475341664e-06, + "loss": 0.4158, + "step": 37500 + }, + { + "epoch": 2.11, + "grad_norm": 7.031563289288517, + "learning_rate": 2.4338676501626878e-06, + "loss": 0.4319, + "step": 37505 + }, + { + "epoch": 2.11, + "grad_norm": 5.361142104309413, + "learning_rate": 2.432461100725956e-06, + "loss": 0.4223, + "step": 37510 + }, + { + "epoch": 2.11, + "grad_norm": 7.859435866442603, + "learning_rate": 2.431054827182612e-06, + "loss": 0.3935, + "step": 37515 + }, + { + "epoch": 2.11, + "grad_norm": 4.799037485664568, + "learning_rate": 2.429648829683764e-06, + "loss": 0.4007, + "step": 37520 + }, + { + "epoch": 2.11, + "grad_norm": 4.379476203172504, + "learning_rate": 2.428243108380492e-06, + "loss": 0.4124, + "step": 37525 + }, + { + "epoch": 2.11, + "grad_norm": 6.865293594312809, + "learning_rate": 2.4268376634238495e-06, + "loss": 0.4071, + "step": 37530 + }, + { + "epoch": 2.11, + "grad_norm": 5.5979140229018824, + "learning_rate": 2.425432494964854e-06, + "loss": 0.3967, + "step": 37535 + }, + { + "epoch": 2.12, + "grad_norm": 4.913608540213429, + "learning_rate": 2.4240276031545016e-06, + "loss": 0.4019, + "step": 37540 + }, + { + "epoch": 2.12, + "grad_norm": 4.825015519043568, + "learning_rate": 2.4226229881437502e-06, + "loss": 0.4032, + "step": 37545 + }, + { + "epoch": 2.12, + "grad_norm": 5.380107640660564, + "learning_rate": 2.4212186500835356e-06, + "loss": 0.4239, + "step": 37550 + }, + { + "epoch": 2.12, + "grad_norm": 4.574154260327579, + "learning_rate": 2.419814589124756e-06, + "loss": 0.4381, + "step": 37555 + }, + { + "epoch": 2.12, + "grad_norm": 5.329868686667875, + "learning_rate": 2.4184108054182858e-06, + "loss": 0.4444, + "step": 37560 + }, + { + "epoch": 2.12, + "grad_norm": 4.982523434416819, + "learning_rate": 2.41700729911497e-06, + "loss": 0.4402, + "step": 37565 + }, + { + "epoch": 2.12, + "grad_norm": 5.712082409711277, + "learning_rate": 2.4156040703656175e-06, + "loss": 0.3955, + "step": 37570 + }, + { + "epoch": 2.12, + "grad_norm": 4.05896953060982, + "learning_rate": 2.4142011193210158e-06, + "loss": 0.4109, + "step": 37575 + }, + { + "epoch": 2.12, + "grad_norm": 6.295058768864407, + "learning_rate": 2.4127984461319153e-06, + "loss": 0.4357, + "step": 37580 + }, + { + "epoch": 2.12, + "grad_norm": 4.528598664034226, + "learning_rate": 2.411396050949038e-06, + "loss": 0.4261, + "step": 37585 + }, + { + "epoch": 2.12, + "grad_norm": 4.51267752289972, + "learning_rate": 2.4099939339230814e-06, + "loss": 0.4417, + "step": 37590 + }, + { + "epoch": 2.12, + "grad_norm": 5.231566072249203, + "learning_rate": 2.4085920952047058e-06, + "loss": 0.4009, + "step": 37595 + }, + { + "epoch": 2.12, + "grad_norm": 4.612495242298953, + "learning_rate": 2.407190534944548e-06, + "loss": 0.4204, + "step": 37600 + }, + { + "epoch": 2.12, + "grad_norm": 3.9382712370806785, + "learning_rate": 2.40578925329321e-06, + "loss": 0.3692, + "step": 37605 + }, + { + "epoch": 2.12, + "grad_norm": 5.305763602048339, + "learning_rate": 2.404388250401264e-06, + "loss": 0.4083, + "step": 37610 + }, + { + "epoch": 2.12, + "grad_norm": 4.619393851912315, + "learning_rate": 2.4029875264192576e-06, + "loss": 0.4414, + "step": 37615 + }, + { + "epoch": 2.12, + "grad_norm": 4.159974390985439, + "learning_rate": 2.401587081497701e-06, + "loss": 0.3798, + "step": 37620 + }, + { + "epoch": 2.12, + "grad_norm": 6.299737112191288, + "learning_rate": 2.400186915787082e-06, + "loss": 0.3885, + "step": 37625 + }, + { + "epoch": 2.12, + "grad_norm": 10.129151557162364, + "learning_rate": 2.3987870294378513e-06, + "loss": 0.4163, + "step": 37630 + }, + { + "epoch": 2.12, + "grad_norm": 12.081109825709918, + "learning_rate": 2.3973874226004363e-06, + "loss": 0.3947, + "step": 37635 + }, + { + "epoch": 2.12, + "grad_norm": 13.669906888924857, + "learning_rate": 2.3959880954252263e-06, + "loss": 0.4058, + "step": 37640 + }, + { + "epoch": 2.12, + "grad_norm": 4.8184225449772, + "learning_rate": 2.3945890480625906e-06, + "loss": 0.3989, + "step": 37645 + }, + { + "epoch": 2.12, + "grad_norm": 10.162092643494919, + "learning_rate": 2.3931902806628576e-06, + "loss": 0.4113, + "step": 37650 + }, + { + "epoch": 2.12, + "grad_norm": 4.452591396243657, + "learning_rate": 2.391791793376334e-06, + "loss": 0.4079, + "step": 37655 + }, + { + "epoch": 2.12, + "grad_norm": 7.010348624722068, + "learning_rate": 2.390393586353295e-06, + "loss": 0.411, + "step": 37660 + }, + { + "epoch": 2.12, + "grad_norm": 5.073279424517539, + "learning_rate": 2.3889956597439824e-06, + "loss": 0.4246, + "step": 37665 + }, + { + "epoch": 2.12, + "grad_norm": 5.299062686226677, + "learning_rate": 2.387598013698607e-06, + "loss": 0.4225, + "step": 37670 + }, + { + "epoch": 2.12, + "grad_norm": 4.254463288894242, + "learning_rate": 2.386200648367357e-06, + "loss": 0.4212, + "step": 37675 + }, + { + "epoch": 2.12, + "grad_norm": 5.627641042334762, + "learning_rate": 2.3848035639003803e-06, + "loss": 0.4103, + "step": 37680 + }, + { + "epoch": 2.12, + "grad_norm": 4.701986690670523, + "learning_rate": 2.3834067604478046e-06, + "loss": 0.4254, + "step": 37685 + }, + { + "epoch": 2.12, + "grad_norm": 6.419882751567113, + "learning_rate": 2.3820102381597202e-06, + "loss": 0.4507, + "step": 37690 + }, + { + "epoch": 2.12, + "grad_norm": 4.904287101507349, + "learning_rate": 2.3806139971861884e-06, + "loss": 0.4177, + "step": 37695 + }, + { + "epoch": 2.12, + "grad_norm": 6.97503206025527, + "learning_rate": 2.3792180376772445e-06, + "loss": 0.4325, + "step": 37700 + }, + { + "epoch": 2.12, + "grad_norm": 5.386308205817909, + "learning_rate": 2.377822359782887e-06, + "loss": 0.4217, + "step": 37705 + }, + { + "epoch": 2.12, + "grad_norm": 4.583689489847988, + "learning_rate": 2.376426963653091e-06, + "loss": 0.4134, + "step": 37710 + }, + { + "epoch": 2.13, + "grad_norm": 5.735002306111745, + "learning_rate": 2.375031849437795e-06, + "loss": 0.3793, + "step": 37715 + }, + { + "epoch": 2.13, + "grad_norm": 5.140189357114928, + "learning_rate": 2.3736370172869133e-06, + "loss": 0.4252, + "step": 37720 + }, + { + "epoch": 2.13, + "grad_norm": 4.689969677875854, + "learning_rate": 2.372242467350325e-06, + "loss": 0.4206, + "step": 37725 + }, + { + "epoch": 2.13, + "grad_norm": 5.086416613113523, + "learning_rate": 2.3708481997778798e-06, + "loss": 0.4457, + "step": 37730 + }, + { + "epoch": 2.13, + "grad_norm": 5.990315329234146, + "learning_rate": 2.369454214719398e-06, + "loss": 0.4203, + "step": 37735 + }, + { + "epoch": 2.13, + "grad_norm": 4.4633217080904215, + "learning_rate": 2.3680605123246725e-06, + "loss": 0.389, + "step": 37740 + }, + { + "epoch": 2.13, + "grad_norm": 4.746868226179556, + "learning_rate": 2.3666670927434584e-06, + "loss": 0.3885, + "step": 37745 + }, + { + "epoch": 2.13, + "grad_norm": 4.7063579459136555, + "learning_rate": 2.3652739561254888e-06, + "loss": 0.3877, + "step": 37750 + }, + { + "epoch": 2.13, + "grad_norm": 4.896519934834272, + "learning_rate": 2.363881102620459e-06, + "loss": 0.3972, + "step": 37755 + }, + { + "epoch": 2.13, + "grad_norm": 4.913498064137382, + "learning_rate": 2.3624885323780407e-06, + "loss": 0.4095, + "step": 37760 + }, + { + "epoch": 2.13, + "grad_norm": 5.108919903583556, + "learning_rate": 2.3610962455478677e-06, + "loss": 0.4111, + "step": 37765 + }, + { + "epoch": 2.13, + "grad_norm": 4.921840185704436, + "learning_rate": 2.3597042422795513e-06, + "loss": 0.4306, + "step": 37770 + }, + { + "epoch": 2.13, + "grad_norm": 5.852995035964516, + "learning_rate": 2.358312522722665e-06, + "loss": 0.4043, + "step": 37775 + }, + { + "epoch": 2.13, + "grad_norm": 5.529711926116011, + "learning_rate": 2.3569210870267584e-06, + "loss": 0.4305, + "step": 37780 + }, + { + "epoch": 2.13, + "grad_norm": 4.657574902923141, + "learning_rate": 2.3555299353413462e-06, + "loss": 0.3874, + "step": 37785 + }, + { + "epoch": 2.13, + "grad_norm": 4.905875732974656, + "learning_rate": 2.354139067815911e-06, + "loss": 0.3995, + "step": 37790 + }, + { + "epoch": 2.13, + "grad_norm": 6.364117491954523, + "learning_rate": 2.352748484599912e-06, + "loss": 0.4365, + "step": 37795 + }, + { + "epoch": 2.13, + "grad_norm": 6.89228524091132, + "learning_rate": 2.35135818584277e-06, + "loss": 0.3904, + "step": 37800 + }, + { + "epoch": 2.13, + "grad_norm": 4.604604486773298, + "learning_rate": 2.349968171693882e-06, + "loss": 0.3895, + "step": 37805 + }, + { + "epoch": 2.13, + "grad_norm": 8.547158382550993, + "learning_rate": 2.34857844230261e-06, + "loss": 0.4219, + "step": 37810 + }, + { + "epoch": 2.13, + "grad_norm": 5.232739738429124, + "learning_rate": 2.3471889978182833e-06, + "loss": 0.4453, + "step": 37815 + }, + { + "epoch": 2.13, + "grad_norm": 9.120494392345778, + "learning_rate": 2.3457998383902087e-06, + "loss": 0.4093, + "step": 37820 + }, + { + "epoch": 2.13, + "grad_norm": 10.932578816784359, + "learning_rate": 2.344410964167653e-06, + "loss": 0.4396, + "step": 37825 + }, + { + "epoch": 2.13, + "grad_norm": 5.588241513071872, + "learning_rate": 2.34302237529986e-06, + "loss": 0.4199, + "step": 37830 + }, + { + "epoch": 2.13, + "grad_norm": 10.584702044689125, + "learning_rate": 2.34163407193604e-06, + "loss": 0.434, + "step": 37835 + }, + { + "epoch": 2.13, + "grad_norm": 8.187374255593626, + "learning_rate": 2.3402460542253697e-06, + "loss": 0.4314, + "step": 37840 + }, + { + "epoch": 2.13, + "grad_norm": 4.8905638875035145, + "learning_rate": 2.338858322317001e-06, + "loss": 0.3942, + "step": 37845 + }, + { + "epoch": 2.13, + "grad_norm": 4.839459242498384, + "learning_rate": 2.3374708763600486e-06, + "loss": 0.3866, + "step": 37850 + }, + { + "epoch": 2.13, + "grad_norm": 4.604624014402693, + "learning_rate": 2.336083716503603e-06, + "loss": 0.4483, + "step": 37855 + }, + { + "epoch": 2.13, + "grad_norm": 4.494063249026857, + "learning_rate": 2.3346968428967166e-06, + "loss": 0.4276, + "step": 37860 + }, + { + "epoch": 2.13, + "grad_norm": 4.29113972614164, + "learning_rate": 2.3333102556884195e-06, + "loss": 0.4001, + "step": 37865 + }, + { + "epoch": 2.13, + "grad_norm": 4.6989887823760075, + "learning_rate": 2.3319239550277046e-06, + "loss": 0.4072, + "step": 37870 + }, + { + "epoch": 2.13, + "grad_norm": 7.588016207966241, + "learning_rate": 2.3305379410635332e-06, + "loss": 0.3998, + "step": 37875 + }, + { + "epoch": 2.13, + "grad_norm": 13.386087370158286, + "learning_rate": 2.3291522139448435e-06, + "loss": 0.4152, + "step": 37880 + }, + { + "epoch": 2.13, + "grad_norm": 8.108323281361963, + "learning_rate": 2.3277667738205344e-06, + "loss": 0.3894, + "step": 37885 + }, + { + "epoch": 2.13, + "grad_norm": 7.355453885605402, + "learning_rate": 2.32638162083948e-06, + "loss": 0.4276, + "step": 37890 + }, + { + "epoch": 2.14, + "grad_norm": 11.577862098778375, + "learning_rate": 2.324996755150521e-06, + "loss": 0.3728, + "step": 37895 + }, + { + "epoch": 2.14, + "grad_norm": 5.876721549951402, + "learning_rate": 2.323612176902464e-06, + "loss": 0.4082, + "step": 37900 + }, + { + "epoch": 2.14, + "grad_norm": 4.893429653420833, + "learning_rate": 2.3222278862440917e-06, + "loss": 0.3795, + "step": 37905 + }, + { + "epoch": 2.14, + "grad_norm": 6.113507778468923, + "learning_rate": 2.3208438833241497e-06, + "loss": 0.4241, + "step": 37910 + }, + { + "epoch": 2.14, + "grad_norm": 79.98111007846417, + "learning_rate": 2.319460168291358e-06, + "loss": 0.4638, + "step": 37915 + }, + { + "epoch": 2.14, + "grad_norm": 6.9487472433427975, + "learning_rate": 2.3180767412943995e-06, + "loss": 0.4241, + "step": 37920 + }, + { + "epoch": 2.14, + "grad_norm": 5.226125973952726, + "learning_rate": 2.3166936024819313e-06, + "loss": 0.422, + "step": 37925 + }, + { + "epoch": 2.14, + "grad_norm": 11.940809355740361, + "learning_rate": 2.31531075200258e-06, + "loss": 0.4458, + "step": 37930 + }, + { + "epoch": 2.14, + "grad_norm": 5.0268930430563445, + "learning_rate": 2.313928190004935e-06, + "loss": 0.3701, + "step": 37935 + }, + { + "epoch": 2.14, + "grad_norm": 5.58558284411666, + "learning_rate": 2.3125459166375625e-06, + "loss": 0.3853, + "step": 37940 + }, + { + "epoch": 2.14, + "grad_norm": 4.522188671731066, + "learning_rate": 2.31116393204899e-06, + "loss": 0.3954, + "step": 37945 + }, + { + "epoch": 2.14, + "grad_norm": 5.406528873180831, + "learning_rate": 2.309782236387721e-06, + "loss": 0.4125, + "step": 37950 + }, + { + "epoch": 2.14, + "grad_norm": 5.178343992922706, + "learning_rate": 2.308400829802224e-06, + "loss": 0.396, + "step": 37955 + }, + { + "epoch": 2.14, + "grad_norm": 4.665358440445545, + "learning_rate": 2.307019712440935e-06, + "loss": 0.4077, + "step": 37960 + }, + { + "epoch": 2.14, + "grad_norm": 7.9265159269255525, + "learning_rate": 2.305638884452265e-06, + "loss": 0.4206, + "step": 37965 + }, + { + "epoch": 2.14, + "grad_norm": 5.94031396378375, + "learning_rate": 2.3042583459845853e-06, + "loss": 0.3825, + "step": 37970 + }, + { + "epoch": 2.14, + "grad_norm": 5.476643028450257, + "learning_rate": 2.3028780971862457e-06, + "loss": 0.3737, + "step": 37975 + }, + { + "epoch": 2.14, + "grad_norm": 12.275128561514984, + "learning_rate": 2.3014981382055585e-06, + "loss": 0.4332, + "step": 37980 + }, + { + "epoch": 2.14, + "grad_norm": 4.856065342077004, + "learning_rate": 2.300118469190803e-06, + "loss": 0.4023, + "step": 37985 + }, + { + "epoch": 2.14, + "grad_norm": 4.753232569364006, + "learning_rate": 2.2987390902902357e-06, + "loss": 0.4239, + "step": 37990 + }, + { + "epoch": 2.14, + "grad_norm": 5.524916146054429, + "learning_rate": 2.297360001652073e-06, + "loss": 0.4018, + "step": 37995 + }, + { + "epoch": 2.14, + "grad_norm": 4.344325463129725, + "learning_rate": 2.2959812034245077e-06, + "loss": 0.4064, + "step": 38000 + }, + { + "epoch": 2.14, + "grad_norm": 5.907803185467804, + "learning_rate": 2.2946026957556937e-06, + "loss": 0.4312, + "step": 38005 + }, + { + "epoch": 2.14, + "grad_norm": 11.594221586335419, + "learning_rate": 2.2932244787937618e-06, + "loss": 0.4144, + "step": 38010 + }, + { + "epoch": 2.14, + "grad_norm": 4.99973523736737, + "learning_rate": 2.291846552686804e-06, + "loss": 0.3782, + "step": 38015 + }, + { + "epoch": 2.14, + "grad_norm": 5.396141243222192, + "learning_rate": 2.290468917582886e-06, + "loss": 0.4327, + "step": 38020 + }, + { + "epoch": 2.14, + "grad_norm": 7.567389639433534, + "learning_rate": 2.2890915736300422e-06, + "loss": 0.4231, + "step": 38025 + }, + { + "epoch": 2.14, + "grad_norm": 6.541973240712317, + "learning_rate": 2.2877145209762714e-06, + "loss": 0.4373, + "step": 38030 + }, + { + "epoch": 2.14, + "grad_norm": 5.229987450128883, + "learning_rate": 2.286337759769548e-06, + "loss": 0.3926, + "step": 38035 + }, + { + "epoch": 2.14, + "grad_norm": 4.870012224100013, + "learning_rate": 2.284961290157808e-06, + "loss": 0.3947, + "step": 38040 + }, + { + "epoch": 2.14, + "grad_norm": 5.342811809778449, + "learning_rate": 2.283585112288958e-06, + "loss": 0.3772, + "step": 38045 + }, + { + "epoch": 2.14, + "grad_norm": 5.333846513686606, + "learning_rate": 2.282209226310878e-06, + "loss": 0.4065, + "step": 38050 + }, + { + "epoch": 2.14, + "grad_norm": 5.0244839834440835, + "learning_rate": 2.2808336323714094e-06, + "loss": 0.4091, + "step": 38055 + }, + { + "epoch": 2.14, + "grad_norm": 4.890421913453521, + "learning_rate": 2.2794583306183694e-06, + "loss": 0.4033, + "step": 38060 + }, + { + "epoch": 2.14, + "grad_norm": 4.805186999635017, + "learning_rate": 2.2780833211995383e-06, + "loss": 0.4142, + "step": 38065 + }, + { + "epoch": 2.15, + "grad_norm": 4.541539644382564, + "learning_rate": 2.276708604262665e-06, + "loss": 0.3957, + "step": 38070 + }, + { + "epoch": 2.15, + "grad_norm": 4.7224896999990165, + "learning_rate": 2.2753341799554735e-06, + "loss": 0.4235, + "step": 38075 + }, + { + "epoch": 2.15, + "grad_norm": 4.557678874844924, + "learning_rate": 2.273960048425647e-06, + "loss": 0.3909, + "step": 38080 + }, + { + "epoch": 2.15, + "grad_norm": 4.86070320469673, + "learning_rate": 2.2725862098208464e-06, + "loss": 0.3897, + "step": 38085 + }, + { + "epoch": 2.15, + "grad_norm": 5.325590840359363, + "learning_rate": 2.271212664288694e-06, + "loss": 0.4168, + "step": 38090 + }, + { + "epoch": 2.15, + "grad_norm": 4.7631955298398, + "learning_rate": 2.2698394119767825e-06, + "loss": 0.3726, + "step": 38095 + }, + { + "epoch": 2.15, + "grad_norm": 4.743947556770906, + "learning_rate": 2.268466453032675e-06, + "loss": 0.3956, + "step": 38100 + }, + { + "epoch": 2.15, + "grad_norm": 4.578967483277369, + "learning_rate": 2.267093787603904e-06, + "loss": 0.4061, + "step": 38105 + }, + { + "epoch": 2.15, + "grad_norm": 5.226628945326729, + "learning_rate": 2.2657214158379655e-06, + "loss": 0.4034, + "step": 38110 + }, + { + "epoch": 2.15, + "grad_norm": 4.70623435647588, + "learning_rate": 2.2643493378823278e-06, + "loss": 0.4067, + "step": 38115 + }, + { + "epoch": 2.15, + "grad_norm": 6.903708262756084, + "learning_rate": 2.262977553884429e-06, + "loss": 0.3828, + "step": 38120 + }, + { + "epoch": 2.15, + "grad_norm": 8.185004150810348, + "learning_rate": 2.2616060639916708e-06, + "loss": 0.4381, + "step": 38125 + }, + { + "epoch": 2.15, + "grad_norm": 7.827506693720943, + "learning_rate": 2.2602348683514258e-06, + "loss": 0.4166, + "step": 38130 + }, + { + "epoch": 2.15, + "grad_norm": 5.383455925629179, + "learning_rate": 2.2588639671110366e-06, + "loss": 0.3959, + "step": 38135 + }, + { + "epoch": 2.15, + "grad_norm": 5.104712434454658, + "learning_rate": 2.2574933604178105e-06, + "loss": 0.402, + "step": 38140 + }, + { + "epoch": 2.15, + "grad_norm": 4.536926853638174, + "learning_rate": 2.2561230484190277e-06, + "loss": 0.3932, + "step": 38145 + }, + { + "epoch": 2.15, + "grad_norm": 5.1498435429993314, + "learning_rate": 2.2547530312619335e-06, + "loss": 0.4081, + "step": 38150 + }, + { + "epoch": 2.15, + "grad_norm": 4.455705215270543, + "learning_rate": 2.2533833090937397e-06, + "loss": 0.3903, + "step": 38155 + }, + { + "epoch": 2.15, + "grad_norm": 6.053294158079101, + "learning_rate": 2.2520138820616326e-06, + "loss": 0.4306, + "step": 38160 + }, + { + "epoch": 2.15, + "grad_norm": 4.471603332557932, + "learning_rate": 2.2506447503127606e-06, + "loss": 0.4069, + "step": 38165 + }, + { + "epoch": 2.15, + "grad_norm": 4.981212242396687, + "learning_rate": 2.2492759139942456e-06, + "loss": 0.3882, + "step": 38170 + }, + { + "epoch": 2.15, + "grad_norm": 4.909309713283266, + "learning_rate": 2.247907373253173e-06, + "loss": 0.4151, + "step": 38175 + }, + { + "epoch": 2.15, + "grad_norm": 5.317573791265963, + "learning_rate": 2.246539128236598e-06, + "loss": 0.4009, + "step": 38180 + }, + { + "epoch": 2.15, + "grad_norm": 4.548642843410977, + "learning_rate": 2.2451711790915463e-06, + "loss": 0.3946, + "step": 38185 + }, + { + "epoch": 2.15, + "grad_norm": 4.573241544890438, + "learning_rate": 2.2438035259650085e-06, + "loss": 0.3996, + "step": 38190 + }, + { + "epoch": 2.15, + "grad_norm": 4.84357767238359, + "learning_rate": 2.242436169003946e-06, + "loss": 0.4086, + "step": 38195 + }, + { + "epoch": 2.15, + "grad_norm": 4.789318406802224, + "learning_rate": 2.241069108355288e-06, + "loss": 0.4105, + "step": 38200 + }, + { + "epoch": 2.15, + "grad_norm": 4.725955913285168, + "learning_rate": 2.23970234416593e-06, + "loss": 0.3935, + "step": 38205 + }, + { + "epoch": 2.15, + "grad_norm": 6.014557303941419, + "learning_rate": 2.238335876582738e-06, + "loss": 0.4217, + "step": 38210 + }, + { + "epoch": 2.15, + "grad_norm": 5.219457643630035, + "learning_rate": 2.2369697057525427e-06, + "loss": 0.3927, + "step": 38215 + }, + { + "epoch": 2.15, + "grad_norm": 4.8677932223357745, + "learning_rate": 2.2356038318221487e-06, + "loss": 0.4337, + "step": 38220 + }, + { + "epoch": 2.15, + "grad_norm": 4.530386893227267, + "learning_rate": 2.2342382549383213e-06, + "loss": 0.4395, + "step": 38225 + }, + { + "epoch": 2.15, + "grad_norm": 7.585153410202333, + "learning_rate": 2.2328729752478013e-06, + "loss": 0.421, + "step": 38230 + }, + { + "epoch": 2.15, + "grad_norm": 13.276570454775175, + "learning_rate": 2.231507992897292e-06, + "loss": 0.4166, + "step": 38235 + }, + { + "epoch": 2.15, + "grad_norm": 8.083582448661602, + "learning_rate": 2.2301433080334663e-06, + "loss": 0.397, + "step": 38240 + }, + { + "epoch": 2.15, + "grad_norm": 10.868368137671233, + "learning_rate": 2.2287789208029677e-06, + "loss": 0.4214, + "step": 38245 + }, + { + "epoch": 2.16, + "grad_norm": 7.5182045707673195, + "learning_rate": 2.2274148313524023e-06, + "loss": 0.3842, + "step": 38250 + }, + { + "epoch": 2.16, + "grad_norm": 6.555929393056832, + "learning_rate": 2.2260510398283518e-06, + "loss": 0.4197, + "step": 38255 + }, + { + "epoch": 2.16, + "grad_norm": 6.5329968735218875, + "learning_rate": 2.224687546377358e-06, + "loss": 0.4124, + "step": 38260 + }, + { + "epoch": 2.16, + "grad_norm": 5.009927636139155, + "learning_rate": 2.223324351145937e-06, + "loss": 0.3979, + "step": 38265 + }, + { + "epoch": 2.16, + "grad_norm": 6.8892108371488465, + "learning_rate": 2.2219614542805684e-06, + "loss": 0.4366, + "step": 38270 + }, + { + "epoch": 2.16, + "grad_norm": 5.718257033879936, + "learning_rate": 2.2205988559277007e-06, + "loss": 0.4097, + "step": 38275 + }, + { + "epoch": 2.16, + "grad_norm": 4.958336357175654, + "learning_rate": 2.2192365562337543e-06, + "loss": 0.3917, + "step": 38280 + }, + { + "epoch": 2.16, + "grad_norm": 4.3575227125026, + "learning_rate": 2.21787455534511e-06, + "loss": 0.3902, + "step": 38285 + }, + { + "epoch": 2.16, + "grad_norm": 5.308724346255989, + "learning_rate": 2.2165128534081244e-06, + "loss": 0.3604, + "step": 38290 + }, + { + "epoch": 2.16, + "grad_norm": 4.464676475254124, + "learning_rate": 2.215151450569118e-06, + "loss": 0.3967, + "step": 38295 + }, + { + "epoch": 2.16, + "grad_norm": 5.891423501714995, + "learning_rate": 2.213790346974377e-06, + "loss": 0.4005, + "step": 38300 + }, + { + "epoch": 2.16, + "grad_norm": 5.067428752177997, + "learning_rate": 2.212429542770162e-06, + "loss": 0.4208, + "step": 38305 + }, + { + "epoch": 2.16, + "grad_norm": 4.156959061512414, + "learning_rate": 2.2110690381026926e-06, + "loss": 0.4012, + "step": 38310 + }, + { + "epoch": 2.16, + "grad_norm": 4.822972155335179, + "learning_rate": 2.209708833118166e-06, + "loss": 0.3864, + "step": 38315 + }, + { + "epoch": 2.16, + "grad_norm": 5.006381694705248, + "learning_rate": 2.208348927962739e-06, + "loss": 0.4321, + "step": 38320 + }, + { + "epoch": 2.16, + "grad_norm": 4.754068529708678, + "learning_rate": 2.206989322782539e-06, + "loss": 0.4047, + "step": 38325 + }, + { + "epoch": 2.16, + "grad_norm": 7.63250621373145, + "learning_rate": 2.2056300177236643e-06, + "loss": 0.3918, + "step": 38330 + }, + { + "epoch": 2.16, + "grad_norm": 7.839191277671578, + "learning_rate": 2.2042710129321755e-06, + "loss": 0.3878, + "step": 38335 + }, + { + "epoch": 2.16, + "grad_norm": 4.964535428910256, + "learning_rate": 2.202912308554106e-06, + "loss": 0.3931, + "step": 38340 + }, + { + "epoch": 2.16, + "grad_norm": 6.291096054045648, + "learning_rate": 2.201553904735452e-06, + "loss": 0.4286, + "step": 38345 + }, + { + "epoch": 2.16, + "grad_norm": 4.955057309545511, + "learning_rate": 2.2001958016221826e-06, + "loss": 0.407, + "step": 38350 + }, + { + "epoch": 2.16, + "grad_norm": 4.822559744762919, + "learning_rate": 2.1988379993602316e-06, + "loss": 0.4114, + "step": 38355 + }, + { + "epoch": 2.16, + "grad_norm": 4.308815657077496, + "learning_rate": 2.197480498095498e-06, + "loss": 0.3546, + "step": 38360 + }, + { + "epoch": 2.16, + "grad_norm": 4.701002282985683, + "learning_rate": 2.1961232979738547e-06, + "loss": 0.3973, + "step": 38365 + }, + { + "epoch": 2.16, + "grad_norm": 4.888752659921957, + "learning_rate": 2.194766399141136e-06, + "loss": 0.3993, + "step": 38370 + }, + { + "epoch": 2.16, + "grad_norm": 5.080663918185043, + "learning_rate": 2.1934098017431498e-06, + "loss": 0.3855, + "step": 38375 + }, + { + "epoch": 2.16, + "grad_norm": 4.684344995501928, + "learning_rate": 2.1920535059256655e-06, + "loss": 0.4106, + "step": 38380 + }, + { + "epoch": 2.16, + "grad_norm": 4.901557550814901, + "learning_rate": 2.1906975118344236e-06, + "loss": 0.4305, + "step": 38385 + }, + { + "epoch": 2.16, + "grad_norm": 4.495169066401919, + "learning_rate": 2.189341819615135e-06, + "loss": 0.3851, + "step": 38390 + }, + { + "epoch": 2.16, + "grad_norm": 4.78500350171946, + "learning_rate": 2.18798642941347e-06, + "loss": 0.4048, + "step": 38395 + }, + { + "epoch": 2.16, + "grad_norm": 4.248591999953721, + "learning_rate": 2.1866313413750755e-06, + "loss": 0.4129, + "step": 38400 + }, + { + "epoch": 2.16, + "grad_norm": 4.7044268015767985, + "learning_rate": 2.1852765556455596e-06, + "loss": 0.3839, + "step": 38405 + }, + { + "epoch": 2.16, + "grad_norm": 7.188892401512902, + "learning_rate": 2.1839220723704985e-06, + "loss": 0.4118, + "step": 38410 + }, + { + "epoch": 2.16, + "grad_norm": 5.137718121287876, + "learning_rate": 2.182567891695441e-06, + "loss": 0.4012, + "step": 38415 + }, + { + "epoch": 2.16, + "grad_norm": 4.67217502753716, + "learning_rate": 2.1812140137658955e-06, + "loss": 0.4112, + "step": 38420 + }, + { + "epoch": 2.17, + "grad_norm": 4.455173055441711, + "learning_rate": 2.1798604387273463e-06, + "loss": 0.3584, + "step": 38425 + }, + { + "epoch": 2.17, + "grad_norm": 6.803369904154309, + "learning_rate": 2.1785071667252376e-06, + "loss": 0.3814, + "step": 38430 + }, + { + "epoch": 2.17, + "grad_norm": 4.908988811537963, + "learning_rate": 2.177154197904988e-06, + "loss": 0.4184, + "step": 38435 + }, + { + "epoch": 2.17, + "grad_norm": 4.9731338705417105, + "learning_rate": 2.175801532411978e-06, + "loss": 0.3926, + "step": 38440 + }, + { + "epoch": 2.17, + "grad_norm": 4.799991686734702, + "learning_rate": 2.1744491703915555e-06, + "loss": 0.357, + "step": 38445 + }, + { + "epoch": 2.17, + "grad_norm": 4.4978227725044455, + "learning_rate": 2.1730971119890416e-06, + "loss": 0.4002, + "step": 38450 + }, + { + "epoch": 2.17, + "grad_norm": 4.250610716393017, + "learning_rate": 2.171745357349717e-06, + "loss": 0.3939, + "step": 38455 + }, + { + "epoch": 2.17, + "grad_norm": 4.452189601655305, + "learning_rate": 2.1703939066188386e-06, + "loss": 0.3933, + "step": 38460 + }, + { + "epoch": 2.17, + "grad_norm": 4.626129117239306, + "learning_rate": 2.1690427599416195e-06, + "loss": 0.3735, + "step": 38465 + }, + { + "epoch": 2.17, + "grad_norm": 6.1507599224321305, + "learning_rate": 2.1676919174632527e-06, + "loss": 0.4189, + "step": 38470 + }, + { + "epoch": 2.17, + "grad_norm": 4.675997257923313, + "learning_rate": 2.1663413793288874e-06, + "loss": 0.4034, + "step": 38475 + }, + { + "epoch": 2.17, + "grad_norm": 4.615745239943095, + "learning_rate": 2.164991145683647e-06, + "loss": 0.3866, + "step": 38480 + }, + { + "epoch": 2.17, + "grad_norm": 4.973203534109425, + "learning_rate": 2.163641216672621e-06, + "loss": 0.4113, + "step": 38485 + }, + { + "epoch": 2.17, + "grad_norm": 5.184493107621935, + "learning_rate": 2.1622915924408646e-06, + "loss": 0.4085, + "step": 38490 + }, + { + "epoch": 2.17, + "grad_norm": 11.086631146869495, + "learning_rate": 2.1609422731333986e-06, + "loss": 0.4083, + "step": 38495 + }, + { + "epoch": 2.17, + "grad_norm": 5.913882429785973, + "learning_rate": 2.1595932588952164e-06, + "loss": 0.4175, + "step": 38500 + }, + { + "epoch": 2.17, + "grad_norm": 4.813323665594315, + "learning_rate": 2.158244549871273e-06, + "loss": 0.3822, + "step": 38505 + }, + { + "epoch": 2.17, + "grad_norm": 4.628538006571762, + "learning_rate": 2.156896146206495e-06, + "loss": 0.404, + "step": 38510 + }, + { + "epoch": 2.17, + "grad_norm": 5.013120678653632, + "learning_rate": 2.155548048045773e-06, + "loss": 0.4125, + "step": 38515 + }, + { + "epoch": 2.17, + "grad_norm": 5.113861884290618, + "learning_rate": 2.154200255533968e-06, + "loss": 0.3699, + "step": 38520 + }, + { + "epoch": 2.17, + "grad_norm": 4.887863434683157, + "learning_rate": 2.1528527688159057e-06, + "loss": 0.4111, + "step": 38525 + }, + { + "epoch": 2.17, + "grad_norm": 5.481691056906396, + "learning_rate": 2.1515055880363763e-06, + "loss": 0.4025, + "step": 38530 + }, + { + "epoch": 2.17, + "grad_norm": 4.395663483698197, + "learning_rate": 2.1501587133401445e-06, + "loss": 0.394, + "step": 38535 + }, + { + "epoch": 2.17, + "grad_norm": 5.6222518839369995, + "learning_rate": 2.148812144871935e-06, + "loss": 0.4309, + "step": 38540 + }, + { + "epoch": 2.17, + "grad_norm": 5.745201229958961, + "learning_rate": 2.1474658827764453e-06, + "loss": 0.3957, + "step": 38545 + }, + { + "epoch": 2.17, + "grad_norm": 4.477977586552221, + "learning_rate": 2.146119927198336e-06, + "loss": 0.3897, + "step": 38550 + }, + { + "epoch": 2.17, + "grad_norm": 4.428390864853101, + "learning_rate": 2.1447742782822338e-06, + "loss": 0.3829, + "step": 38555 + }, + { + "epoch": 2.17, + "grad_norm": 5.545247530486125, + "learning_rate": 2.1434289361727363e-06, + "loss": 0.41, + "step": 38560 + }, + { + "epoch": 2.17, + "grad_norm": 5.19207433195976, + "learning_rate": 2.1420839010144086e-06, + "loss": 0.413, + "step": 38565 + }, + { + "epoch": 2.17, + "grad_norm": 4.647612086719953, + "learning_rate": 2.140739172951777e-06, + "loss": 0.4293, + "step": 38570 + }, + { + "epoch": 2.17, + "grad_norm": 6.410536160373945, + "learning_rate": 2.1393947521293413e-06, + "loss": 0.3921, + "step": 38575 + }, + { + "epoch": 2.17, + "grad_norm": 8.049584885388937, + "learning_rate": 2.138050638691563e-06, + "loss": 0.4028, + "step": 38580 + }, + { + "epoch": 2.17, + "grad_norm": 4.769364112906789, + "learning_rate": 2.136706832782876e-06, + "loss": 0.398, + "step": 38585 + }, + { + "epoch": 2.17, + "grad_norm": 4.76686710250167, + "learning_rate": 2.1353633345476742e-06, + "loss": 0.3892, + "step": 38590 + }, + { + "epoch": 2.17, + "grad_norm": 5.290813454338231, + "learning_rate": 2.134020144130327e-06, + "loss": 0.387, + "step": 38595 + }, + { + "epoch": 2.17, + "grad_norm": 4.534713155982688, + "learning_rate": 2.1326772616751623e-06, + "loss": 0.4065, + "step": 38600 + }, + { + "epoch": 2.18, + "grad_norm": 4.9098049548268055, + "learning_rate": 2.1313346873264816e-06, + "loss": 0.3835, + "step": 38605 + }, + { + "epoch": 2.18, + "grad_norm": 4.695743105993168, + "learning_rate": 2.1299924212285493e-06, + "loss": 0.399, + "step": 38610 + }, + { + "epoch": 2.18, + "grad_norm": 4.555631939857085, + "learning_rate": 2.1286504635255963e-06, + "loss": 0.3786, + "step": 38615 + }, + { + "epoch": 2.18, + "grad_norm": 4.358899041852783, + "learning_rate": 2.127308814361825e-06, + "loss": 0.3922, + "step": 38620 + }, + { + "epoch": 2.18, + "grad_norm": 4.6645020119128695, + "learning_rate": 2.125967473881397e-06, + "loss": 0.3971, + "step": 38625 + }, + { + "epoch": 2.18, + "grad_norm": 4.924011309498007, + "learning_rate": 2.1246264422284507e-06, + "loss": 0.3702, + "step": 38630 + }, + { + "epoch": 2.18, + "grad_norm": 4.433465364614165, + "learning_rate": 2.1232857195470825e-06, + "loss": 0.3822, + "step": 38635 + }, + { + "epoch": 2.18, + "grad_norm": 4.765182060416091, + "learning_rate": 2.1219453059813583e-06, + "loss": 0.4016, + "step": 38640 + }, + { + "epoch": 2.18, + "grad_norm": 5.181989455395341, + "learning_rate": 2.120605201675315e-06, + "loss": 0.3979, + "step": 38645 + }, + { + "epoch": 2.18, + "grad_norm": 5.00143295704809, + "learning_rate": 2.119265406772948e-06, + "loss": 0.4022, + "step": 38650 + }, + { + "epoch": 2.18, + "grad_norm": 5.117214685595613, + "learning_rate": 2.1179259214182267e-06, + "loss": 0.4112, + "step": 38655 + }, + { + "epoch": 2.18, + "grad_norm": 6.8399465080894135, + "learning_rate": 2.1165867457550865e-06, + "loss": 0.3927, + "step": 38660 + }, + { + "epoch": 2.18, + "grad_norm": 4.807864459149209, + "learning_rate": 2.1152478799274247e-06, + "loss": 0.3951, + "step": 38665 + }, + { + "epoch": 2.18, + "grad_norm": 4.6053442197291155, + "learning_rate": 2.113909324079111e-06, + "loss": 0.4068, + "step": 38670 + }, + { + "epoch": 2.18, + "grad_norm": 4.50249582365174, + "learning_rate": 2.112571078353976e-06, + "loss": 0.4235, + "step": 38675 + }, + { + "epoch": 2.18, + "grad_norm": 5.3688072795808734, + "learning_rate": 2.1112331428958233e-06, + "loss": 0.3739, + "step": 38680 + }, + { + "epoch": 2.18, + "grad_norm": 4.292860673413531, + "learning_rate": 2.109895517848417e-06, + "loss": 0.4191, + "step": 38685 + }, + { + "epoch": 2.18, + "grad_norm": 4.89379053199037, + "learning_rate": 2.108558203355494e-06, + "loss": 0.3884, + "step": 38690 + }, + { + "epoch": 2.18, + "grad_norm": 4.765328355765687, + "learning_rate": 2.1072211995607534e-06, + "loss": 0.3926, + "step": 38695 + }, + { + "epoch": 2.18, + "grad_norm": 4.6605360820933175, + "learning_rate": 2.105884506607859e-06, + "loss": 0.4142, + "step": 38700 + }, + { + "epoch": 2.18, + "grad_norm": 5.239545475837199, + "learning_rate": 2.1045481246404497e-06, + "loss": 0.4105, + "step": 38705 + }, + { + "epoch": 2.18, + "grad_norm": 5.203008940340655, + "learning_rate": 2.103212053802121e-06, + "loss": 0.3764, + "step": 38710 + }, + { + "epoch": 2.18, + "grad_norm": 5.426397186096734, + "learning_rate": 2.1018762942364437e-06, + "loss": 0.3862, + "step": 38715 + }, + { + "epoch": 2.18, + "grad_norm": 5.306825500480146, + "learning_rate": 2.1005408460869486e-06, + "loss": 0.4287, + "step": 38720 + }, + { + "epoch": 2.18, + "grad_norm": 4.715503113905795, + "learning_rate": 2.0992057094971342e-06, + "loss": 0.3981, + "step": 38725 + }, + { + "epoch": 2.18, + "grad_norm": 8.98365887608179, + "learning_rate": 2.09787088461047e-06, + "loss": 0.3854, + "step": 38730 + }, + { + "epoch": 2.18, + "grad_norm": 5.227889910904786, + "learning_rate": 2.0965363715703857e-06, + "loss": 0.4156, + "step": 38735 + }, + { + "epoch": 2.18, + "grad_norm": 5.7393308788034645, + "learning_rate": 2.095202170520284e-06, + "loss": 0.4218, + "step": 38740 + }, + { + "epoch": 2.18, + "grad_norm": 4.292223087645373, + "learning_rate": 2.093868281603527e-06, + "loss": 0.3964, + "step": 38745 + }, + { + "epoch": 2.18, + "grad_norm": 6.139624667569269, + "learning_rate": 2.092534704963449e-06, + "loss": 0.3898, + "step": 38750 + }, + { + "epoch": 2.18, + "grad_norm": 4.737970076906442, + "learning_rate": 2.0912014407433507e-06, + "loss": 0.3972, + "step": 38755 + }, + { + "epoch": 2.18, + "grad_norm": 4.5147933770327064, + "learning_rate": 2.0898684890864935e-06, + "loss": 0.3883, + "step": 38760 + }, + { + "epoch": 2.18, + "grad_norm": 7.4293994473746325, + "learning_rate": 2.088535850136112e-06, + "loss": 0.3997, + "step": 38765 + }, + { + "epoch": 2.18, + "grad_norm": 6.845198988293049, + "learning_rate": 2.0872035240354007e-06, + "loss": 0.3963, + "step": 38770 + }, + { + "epoch": 2.18, + "grad_norm": 4.704743055052215, + "learning_rate": 2.0858715109275273e-06, + "loss": 0.3881, + "step": 38775 + }, + { + "epoch": 2.19, + "grad_norm": 4.986911366585578, + "learning_rate": 2.084539810955622e-06, + "loss": 0.3865, + "step": 38780 + }, + { + "epoch": 2.19, + "grad_norm": 5.115826486255224, + "learning_rate": 2.0832084242627783e-06, + "loss": 0.421, + "step": 38785 + }, + { + "epoch": 2.19, + "grad_norm": 4.602096951745674, + "learning_rate": 2.081877350992064e-06, + "loss": 0.3861, + "step": 38790 + }, + { + "epoch": 2.19, + "grad_norm": 4.256387121064838, + "learning_rate": 2.080546591286506e-06, + "loss": 0.4074, + "step": 38795 + }, + { + "epoch": 2.19, + "grad_norm": 4.451157458162828, + "learning_rate": 2.0792161452891025e-06, + "loss": 0.3878, + "step": 38800 + }, + { + "epoch": 2.19, + "grad_norm": 6.966273887512294, + "learning_rate": 2.077886013142815e-06, + "loss": 0.4146, + "step": 38805 + }, + { + "epoch": 2.19, + "grad_norm": 4.5331927343856355, + "learning_rate": 2.0765561949905693e-06, + "loss": 0.3943, + "step": 38810 + }, + { + "epoch": 2.19, + "grad_norm": 6.609237308128622, + "learning_rate": 2.075226690975265e-06, + "loss": 0.4112, + "step": 38815 + }, + { + "epoch": 2.19, + "grad_norm": 5.112732306750538, + "learning_rate": 2.0738975012397585e-06, + "loss": 0.3876, + "step": 38820 + }, + { + "epoch": 2.19, + "grad_norm": 4.8975734575630705, + "learning_rate": 2.0725686259268813e-06, + "loss": 0.4287, + "step": 38825 + }, + { + "epoch": 2.19, + "grad_norm": 4.401027482380962, + "learning_rate": 2.0712400651794233e-06, + "loss": 0.4038, + "step": 38830 + }, + { + "epoch": 2.19, + "grad_norm": 4.435267720334556, + "learning_rate": 2.069911819140147e-06, + "loss": 0.3815, + "step": 38835 + }, + { + "epoch": 2.19, + "grad_norm": 4.553894346786587, + "learning_rate": 2.0685838879517767e-06, + "loss": 0.3821, + "step": 38840 + }, + { + "epoch": 2.19, + "grad_norm": 5.713386969070219, + "learning_rate": 2.0672562717570043e-06, + "loss": 0.3907, + "step": 38845 + }, + { + "epoch": 2.19, + "grad_norm": 5.167098796179337, + "learning_rate": 2.0659289706984903e-06, + "loss": 0.3949, + "step": 38850 + }, + { + "epoch": 2.19, + "grad_norm": 4.344477417274741, + "learning_rate": 2.064601984918856e-06, + "loss": 0.4008, + "step": 38855 + }, + { + "epoch": 2.19, + "grad_norm": 4.603400445501257, + "learning_rate": 2.063275314560695e-06, + "loss": 0.4309, + "step": 38860 + }, + { + "epoch": 2.19, + "grad_norm": 5.237801260847373, + "learning_rate": 2.0619489597665622e-06, + "loss": 0.3989, + "step": 38865 + }, + { + "epoch": 2.19, + "grad_norm": 4.878509897529411, + "learning_rate": 2.060622920678978e-06, + "loss": 0.4162, + "step": 38870 + }, + { + "epoch": 2.19, + "grad_norm": 4.413789470759705, + "learning_rate": 2.0592971974404358e-06, + "loss": 0.3897, + "step": 38875 + }, + { + "epoch": 2.19, + "grad_norm": 4.142975068918899, + "learning_rate": 2.057971790193386e-06, + "loss": 0.3487, + "step": 38880 + }, + { + "epoch": 2.19, + "grad_norm": 4.3539662990290084, + "learning_rate": 2.0566466990802534e-06, + "loss": 0.404, + "step": 38885 + }, + { + "epoch": 2.19, + "grad_norm": 4.64585346887782, + "learning_rate": 2.055321924243423e-06, + "loss": 0.3867, + "step": 38890 + }, + { + "epoch": 2.19, + "grad_norm": 5.234958381467139, + "learning_rate": 2.053997465825246e-06, + "loss": 0.3826, + "step": 38895 + }, + { + "epoch": 2.19, + "grad_norm": 5.568749889569857, + "learning_rate": 2.0526733239680445e-06, + "loss": 0.3982, + "step": 38900 + }, + { + "epoch": 2.19, + "grad_norm": 4.955018933633224, + "learning_rate": 2.0513494988141002e-06, + "loss": 0.439, + "step": 38905 + }, + { + "epoch": 2.19, + "grad_norm": 6.116998926552031, + "learning_rate": 2.0500259905056676e-06, + "loss": 0.4243, + "step": 38910 + }, + { + "epoch": 2.19, + "grad_norm": 6.268386539561251, + "learning_rate": 2.0487027991849594e-06, + "loss": 0.4411, + "step": 38915 + }, + { + "epoch": 2.19, + "grad_norm": 4.201328055758747, + "learning_rate": 2.0473799249941627e-06, + "loss": 0.3836, + "step": 38920 + }, + { + "epoch": 2.19, + "grad_norm": 4.970729449189368, + "learning_rate": 2.046057368075422e-06, + "loss": 0.4083, + "step": 38925 + }, + { + "epoch": 2.19, + "grad_norm": 4.8883094970171515, + "learning_rate": 2.0447351285708554e-06, + "loss": 0.4092, + "step": 38930 + }, + { + "epoch": 2.19, + "grad_norm": 4.49127190563385, + "learning_rate": 2.0434132066225405e-06, + "loss": 0.4042, + "step": 38935 + }, + { + "epoch": 2.19, + "grad_norm": 4.313038945917286, + "learning_rate": 2.0420916023725244e-06, + "loss": 0.3974, + "step": 38940 + }, + { + "epoch": 2.19, + "grad_norm": 4.701695780740329, + "learning_rate": 2.0407703159628224e-06, + "loss": 0.3653, + "step": 38945 + }, + { + "epoch": 2.19, + "grad_norm": 4.363619762417488, + "learning_rate": 2.0394493475354094e-06, + "loss": 0.4098, + "step": 38950 + }, + { + "epoch": 2.19, + "grad_norm": 4.816465055287263, + "learning_rate": 2.0381286972322283e-06, + "loss": 0.3834, + "step": 38955 + }, + { + "epoch": 2.2, + "grad_norm": 5.314960838394469, + "learning_rate": 2.0368083651951927e-06, + "loss": 0.4023, + "step": 38960 + }, + { + "epoch": 2.2, + "grad_norm": 5.389517851042226, + "learning_rate": 2.0354883515661734e-06, + "loss": 0.4249, + "step": 38965 + }, + { + "epoch": 2.2, + "grad_norm": 5.597833283499703, + "learning_rate": 2.034168656487016e-06, + "loss": 0.4006, + "step": 38970 + }, + { + "epoch": 2.2, + "grad_norm": 5.730014782523671, + "learning_rate": 2.032849280099526e-06, + "loss": 0.3893, + "step": 38975 + }, + { + "epoch": 2.2, + "grad_norm": 4.6965649993995875, + "learning_rate": 2.0315302225454736e-06, + "loss": 0.3662, + "step": 38980 + }, + { + "epoch": 2.2, + "grad_norm": 6.46922335185519, + "learning_rate": 2.0302114839666017e-06, + "loss": 0.4162, + "step": 38985 + }, + { + "epoch": 2.2, + "grad_norm": 7.213379806054316, + "learning_rate": 2.0288930645046106e-06, + "loss": 0.3651, + "step": 38990 + }, + { + "epoch": 2.2, + "grad_norm": 5.147714807298288, + "learning_rate": 2.0275749643011745e-06, + "loss": 0.3979, + "step": 38995 + }, + { + "epoch": 2.2, + "grad_norm": 4.725775462814913, + "learning_rate": 2.0262571834979243e-06, + "loss": 0.3938, + "step": 39000 + }, + { + "epoch": 2.2, + "grad_norm": 4.445150688922724, + "learning_rate": 2.0249397222364664e-06, + "loss": 0.3984, + "step": 39005 + }, + { + "epoch": 2.2, + "grad_norm": 4.795208073400065, + "learning_rate": 2.023622580658365e-06, + "loss": 0.3909, + "step": 39010 + }, + { + "epoch": 2.2, + "grad_norm": 4.817298566269459, + "learning_rate": 2.022305758905151e-06, + "loss": 0.372, + "step": 39015 + }, + { + "epoch": 2.2, + "grad_norm": 4.78488919786896, + "learning_rate": 2.0209892571183247e-06, + "loss": 0.3942, + "step": 39020 + }, + { + "epoch": 2.2, + "grad_norm": 4.852040691074928, + "learning_rate": 2.0196730754393524e-06, + "loss": 0.408, + "step": 39025 + }, + { + "epoch": 2.2, + "grad_norm": 4.757677745792287, + "learning_rate": 2.018357214009659e-06, + "loss": 0.418, + "step": 39030 + }, + { + "epoch": 2.2, + "grad_norm": 4.603541620603779, + "learning_rate": 2.0170416729706437e-06, + "loss": 0.3866, + "step": 39035 + }, + { + "epoch": 2.2, + "grad_norm": 6.59621435926493, + "learning_rate": 2.0157264524636644e-06, + "loss": 0.4072, + "step": 39040 + }, + { + "epoch": 2.2, + "grad_norm": 5.7671967765864895, + "learning_rate": 2.0144115526300495e-06, + "loss": 0.3887, + "step": 39045 + }, + { + "epoch": 2.2, + "grad_norm": 4.680602077720724, + "learning_rate": 2.013096973611088e-06, + "loss": 0.4106, + "step": 39050 + }, + { + "epoch": 2.2, + "grad_norm": 7.867675568728056, + "learning_rate": 2.0117827155480405e-06, + "loss": 0.3577, + "step": 39055 + }, + { + "epoch": 2.2, + "grad_norm": 4.239896507137009, + "learning_rate": 2.0104687785821286e-06, + "loss": 0.375, + "step": 39060 + }, + { + "epoch": 2.2, + "grad_norm": 6.2848515735994335, + "learning_rate": 2.0091551628545385e-06, + "loss": 0.4027, + "step": 39065 + }, + { + "epoch": 2.2, + "grad_norm": 4.980979996629347, + "learning_rate": 2.0078418685064278e-06, + "loss": 0.3909, + "step": 39070 + }, + { + "epoch": 2.2, + "grad_norm": 4.828427743870696, + "learning_rate": 2.0065288956789114e-06, + "loss": 0.4115, + "step": 39075 + }, + { + "epoch": 2.2, + "grad_norm": 4.874373314519098, + "learning_rate": 2.005216244513078e-06, + "loss": 0.4032, + "step": 39080 + }, + { + "epoch": 2.2, + "grad_norm": 4.500648376458064, + "learning_rate": 2.0039039151499747e-06, + "loss": 0.3872, + "step": 39085 + }, + { + "epoch": 2.2, + "grad_norm": 5.178901616250842, + "learning_rate": 2.0025919077306206e-06, + "loss": 0.4075, + "step": 39090 + }, + { + "epoch": 2.2, + "grad_norm": 4.387145842189551, + "learning_rate": 2.0012802223959937e-06, + "loss": 0.4047, + "step": 39095 + }, + { + "epoch": 2.2, + "grad_norm": 4.60506482068075, + "learning_rate": 1.99996885928704e-06, + "loss": 0.3839, + "step": 39100 + }, + { + "epoch": 2.2, + "grad_norm": 5.5306047194833114, + "learning_rate": 1.998657818544674e-06, + "loss": 0.425, + "step": 39105 + }, + { + "epoch": 2.2, + "grad_norm": 4.825596400925807, + "learning_rate": 1.9973471003097698e-06, + "loss": 0.3949, + "step": 39110 + }, + { + "epoch": 2.2, + "grad_norm": 4.922033121496603, + "learning_rate": 1.9960367047231715e-06, + "loss": 0.4018, + "step": 39115 + }, + { + "epoch": 2.2, + "grad_norm": 4.597560061172928, + "learning_rate": 1.9947266319256887e-06, + "loss": 0.4021, + "step": 39120 + }, + { + "epoch": 2.2, + "grad_norm": 4.657990797184312, + "learning_rate": 1.9934168820580907e-06, + "loss": 0.3968, + "step": 39125 + }, + { + "epoch": 2.2, + "grad_norm": 5.209202164190039, + "learning_rate": 1.992107455261119e-06, + "loss": 0.407, + "step": 39130 + }, + { + "epoch": 2.21, + "grad_norm": 4.386116825533306, + "learning_rate": 1.9907983516754754e-06, + "loss": 0.3963, + "step": 39135 + }, + { + "epoch": 2.21, + "grad_norm": 8.269427638207159, + "learning_rate": 1.9894895714418303e-06, + "loss": 0.3869, + "step": 39140 + }, + { + "epoch": 2.21, + "grad_norm": 6.174116633717544, + "learning_rate": 1.9881811147008158e-06, + "loss": 0.3919, + "step": 39145 + }, + { + "epoch": 2.21, + "grad_norm": 4.734472965468408, + "learning_rate": 1.986872981593034e-06, + "loss": 0.4254, + "step": 39150 + }, + { + "epoch": 2.21, + "grad_norm": 7.022932641200943, + "learning_rate": 1.985565172259049e-06, + "loss": 0.3946, + "step": 39155 + }, + { + "epoch": 2.21, + "grad_norm": 4.498077637297552, + "learning_rate": 1.9842576868393883e-06, + "loss": 0.3717, + "step": 39160 + }, + { + "epoch": 2.21, + "grad_norm": 6.5332225905143355, + "learning_rate": 1.9829505254745497e-06, + "loss": 0.3831, + "step": 39165 + }, + { + "epoch": 2.21, + "grad_norm": 4.93601416254683, + "learning_rate": 1.9816436883049906e-06, + "loss": 0.4047, + "step": 39170 + }, + { + "epoch": 2.21, + "grad_norm": 4.733251523796037, + "learning_rate": 1.9803371754711402e-06, + "loss": 0.397, + "step": 39175 + }, + { + "epoch": 2.21, + "grad_norm": 4.712771353063706, + "learning_rate": 1.979030987113387e-06, + "loss": 0.4272, + "step": 39180 + }, + { + "epoch": 2.21, + "grad_norm": 8.283336273144208, + "learning_rate": 1.977725123372085e-06, + "loss": 0.3841, + "step": 39185 + }, + { + "epoch": 2.21, + "grad_norm": 4.748921865335893, + "learning_rate": 1.976419584387559e-06, + "loss": 0.4207, + "step": 39190 + }, + { + "epoch": 2.21, + "grad_norm": 5.062050962654206, + "learning_rate": 1.975114370300091e-06, + "loss": 0.392, + "step": 39195 + }, + { + "epoch": 2.21, + "grad_norm": 5.014145429863991, + "learning_rate": 1.973809481249935e-06, + "loss": 0.3957, + "step": 39200 + }, + { + "epoch": 2.21, + "grad_norm": 5.318200049638917, + "learning_rate": 1.972504917377304e-06, + "loss": 0.3921, + "step": 39205 + }, + { + "epoch": 2.21, + "grad_norm": 5.624013880590756, + "learning_rate": 1.9712006788223813e-06, + "loss": 0.393, + "step": 39210 + }, + { + "epoch": 2.21, + "grad_norm": 5.339503008943358, + "learning_rate": 1.9698967657253147e-06, + "loss": 0.3549, + "step": 39215 + }, + { + "epoch": 2.21, + "grad_norm": 7.19103963618126, + "learning_rate": 1.9685931782262113e-06, + "loss": 0.4061, + "step": 39220 + }, + { + "epoch": 2.21, + "grad_norm": 4.515879538966008, + "learning_rate": 1.9672899164651516e-06, + "loss": 0.3626, + "step": 39225 + }, + { + "epoch": 2.21, + "grad_norm": 5.761942211722023, + "learning_rate": 1.965986980582173e-06, + "loss": 0.4116, + "step": 39230 + }, + { + "epoch": 2.21, + "grad_norm": 4.459228425857306, + "learning_rate": 1.964684370717285e-06, + "loss": 0.4066, + "step": 39235 + }, + { + "epoch": 2.21, + "grad_norm": 4.445562264940093, + "learning_rate": 1.9633820870104575e-06, + "loss": 0.4106, + "step": 39240 + }, + { + "epoch": 2.21, + "grad_norm": 4.682108648769833, + "learning_rate": 1.9620801296016247e-06, + "loss": 0.3848, + "step": 39245 + }, + { + "epoch": 2.21, + "grad_norm": 6.744454442004688, + "learning_rate": 1.9607784986306906e-06, + "loss": 0.3727, + "step": 39250 + }, + { + "epoch": 2.21, + "grad_norm": 4.539855465939607, + "learning_rate": 1.9594771942375183e-06, + "loss": 0.3694, + "step": 39255 + }, + { + "epoch": 2.21, + "grad_norm": 4.9894757678328965, + "learning_rate": 1.9581762165619423e-06, + "loss": 0.3753, + "step": 39260 + }, + { + "epoch": 2.21, + "grad_norm": 4.5534286960075, + "learning_rate": 1.9568755657437564e-06, + "loss": 0.3911, + "step": 39265 + }, + { + "epoch": 2.21, + "grad_norm": 4.659752904131898, + "learning_rate": 1.95557524192272e-06, + "loss": 0.3972, + "step": 39270 + }, + { + "epoch": 2.21, + "grad_norm": 4.808294937638788, + "learning_rate": 1.9542752452385604e-06, + "loss": 0.3711, + "step": 39275 + }, + { + "epoch": 2.21, + "grad_norm": 4.816358919335911, + "learning_rate": 1.9529755758309666e-06, + "loss": 0.3919, + "step": 39280 + }, + { + "epoch": 2.21, + "grad_norm": 4.6226762383187365, + "learning_rate": 1.951676233839597e-06, + "loss": 0.4161, + "step": 39285 + }, + { + "epoch": 2.21, + "grad_norm": 4.479114808354138, + "learning_rate": 1.9503772194040665e-06, + "loss": 0.3592, + "step": 39290 + }, + { + "epoch": 2.21, + "grad_norm": 4.579325120042945, + "learning_rate": 1.949078532663965e-06, + "loss": 0.3956, + "step": 39295 + }, + { + "epoch": 2.21, + "grad_norm": 4.889692048547527, + "learning_rate": 1.947780173758838e-06, + "loss": 0.413, + "step": 39300 + }, + { + "epoch": 2.21, + "grad_norm": 5.612972680628571, + "learning_rate": 1.9464821428282023e-06, + "loss": 0.3659, + "step": 39305 + }, + { + "epoch": 2.21, + "grad_norm": 4.312394924222412, + "learning_rate": 1.9451844400115378e-06, + "loss": 0.3766, + "step": 39310 + }, + { + "epoch": 2.22, + "grad_norm": 4.0878461852833485, + "learning_rate": 1.9438870654482855e-06, + "loss": 0.3724, + "step": 39315 + }, + { + "epoch": 2.22, + "grad_norm": 4.310007001105772, + "learning_rate": 1.9425900192778575e-06, + "loss": 0.3477, + "step": 39320 + }, + { + "epoch": 2.22, + "grad_norm": 4.810854392040784, + "learning_rate": 1.9412933016396256e-06, + "loss": 0.4027, + "step": 39325 + }, + { + "epoch": 2.22, + "grad_norm": 4.403281496748654, + "learning_rate": 1.939996912672925e-06, + "loss": 0.3941, + "step": 39330 + }, + { + "epoch": 2.22, + "grad_norm": 4.529455231183697, + "learning_rate": 1.9387008525170635e-06, + "loss": 0.4147, + "step": 39335 + }, + { + "epoch": 2.22, + "grad_norm": 4.626997511243248, + "learning_rate": 1.937405121311304e-06, + "loss": 0.3761, + "step": 39340 + }, + { + "epoch": 2.22, + "grad_norm": 5.929562626534522, + "learning_rate": 1.9361097191948817e-06, + "loss": 0.3798, + "step": 39345 + }, + { + "epoch": 2.22, + "grad_norm": 5.040053761044108, + "learning_rate": 1.9348146463069928e-06, + "loss": 0.3693, + "step": 39350 + }, + { + "epoch": 2.22, + "grad_norm": 5.377633274374966, + "learning_rate": 1.9335199027867955e-06, + "loss": 0.4228, + "step": 39355 + }, + { + "epoch": 2.22, + "grad_norm": 6.109222863158795, + "learning_rate": 1.9322254887734203e-06, + "loss": 0.392, + "step": 39360 + }, + { + "epoch": 2.22, + "grad_norm": 6.132264102947876, + "learning_rate": 1.9309314044059537e-06, + "loss": 0.3789, + "step": 39365 + }, + { + "epoch": 2.22, + "grad_norm": 4.976285053201243, + "learning_rate": 1.9296376498234543e-06, + "loss": 0.4071, + "step": 39370 + }, + { + "epoch": 2.22, + "grad_norm": 4.992502688260796, + "learning_rate": 1.9283442251649394e-06, + "loss": 0.3753, + "step": 39375 + }, + { + "epoch": 2.22, + "grad_norm": 5.075911079635343, + "learning_rate": 1.927051130569392e-06, + "loss": 0.4151, + "step": 39380 + }, + { + "epoch": 2.22, + "grad_norm": 4.429071718373526, + "learning_rate": 1.9257583661757634e-06, + "loss": 0.3825, + "step": 39385 + }, + { + "epoch": 2.22, + "grad_norm": 4.5806253644574175, + "learning_rate": 1.9244659321229668e-06, + "loss": 0.3985, + "step": 39390 + }, + { + "epoch": 2.22, + "grad_norm": 4.494931489782704, + "learning_rate": 1.9231738285498785e-06, + "loss": 0.3722, + "step": 39395 + }, + { + "epoch": 2.22, + "grad_norm": 5.260413983614203, + "learning_rate": 1.9218820555953405e-06, + "loss": 0.4215, + "step": 39400 + }, + { + "epoch": 2.22, + "grad_norm": 5.897041511698496, + "learning_rate": 1.9205906133981627e-06, + "loss": 0.3941, + "step": 39405 + }, + { + "epoch": 2.22, + "grad_norm": 6.925661136097314, + "learning_rate": 1.9192995020971133e-06, + "loss": 0.4121, + "step": 39410 + }, + { + "epoch": 2.22, + "grad_norm": 4.636157699110875, + "learning_rate": 1.9180087218309273e-06, + "loss": 0.3852, + "step": 39415 + }, + { + "epoch": 2.22, + "grad_norm": 4.63613934585706, + "learning_rate": 1.916718272738308e-06, + "loss": 0.3939, + "step": 39420 + }, + { + "epoch": 2.22, + "grad_norm": 8.925446808807623, + "learning_rate": 1.9154281549579157e-06, + "loss": 0.3614, + "step": 39425 + }, + { + "epoch": 2.22, + "grad_norm": 4.436743271862936, + "learning_rate": 1.914138368628383e-06, + "loss": 0.374, + "step": 39430 + }, + { + "epoch": 2.22, + "grad_norm": 4.576896601885174, + "learning_rate": 1.9128489138883017e-06, + "loss": 0.4257, + "step": 39435 + }, + { + "epoch": 2.22, + "grad_norm": 7.723575563308978, + "learning_rate": 1.9115597908762277e-06, + "loss": 0.3908, + "step": 39440 + }, + { + "epoch": 2.22, + "grad_norm": 5.41529628446949, + "learning_rate": 1.9102709997306863e-06, + "loss": 0.3761, + "step": 39445 + }, + { + "epoch": 2.22, + "grad_norm": 5.758141671306461, + "learning_rate": 1.9089825405901597e-06, + "loss": 0.3785, + "step": 39450 + }, + { + "epoch": 2.22, + "grad_norm": 4.413224050824788, + "learning_rate": 1.907694413593103e-06, + "loss": 0.4214, + "step": 39455 + }, + { + "epoch": 2.22, + "grad_norm": 4.798702141896009, + "learning_rate": 1.906406618877929e-06, + "loss": 0.3903, + "step": 39460 + }, + { + "epoch": 2.22, + "grad_norm": 4.287535595220807, + "learning_rate": 1.9051191565830147e-06, + "loss": 0.3838, + "step": 39465 + }, + { + "epoch": 2.22, + "grad_norm": 4.940844969553668, + "learning_rate": 1.9038320268467081e-06, + "loss": 0.3555, + "step": 39470 + }, + { + "epoch": 2.22, + "grad_norm": 4.763730501188215, + "learning_rate": 1.9025452298073127e-06, + "loss": 0.3911, + "step": 39475 + }, + { + "epoch": 2.22, + "grad_norm": 4.867809024857266, + "learning_rate": 1.9012587656031022e-06, + "loss": 0.4029, + "step": 39480 + }, + { + "epoch": 2.22, + "grad_norm": 6.606378094814525, + "learning_rate": 1.899972634372315e-06, + "loss": 0.4101, + "step": 39485 + }, + { + "epoch": 2.23, + "grad_norm": 4.950571708424779, + "learning_rate": 1.8986868362531479e-06, + "loss": 0.3846, + "step": 39490 + }, + { + "epoch": 2.23, + "grad_norm": 5.639142141166102, + "learning_rate": 1.897401371383769e-06, + "loss": 0.4156, + "step": 39495 + }, + { + "epoch": 2.23, + "grad_norm": 7.521940871616371, + "learning_rate": 1.8961162399023043e-06, + "loss": 0.4028, + "step": 39500 + }, + { + "epoch": 2.23, + "grad_norm": 4.5318394373804045, + "learning_rate": 1.8948314419468495e-06, + "loss": 0.4242, + "step": 39505 + }, + { + "epoch": 2.23, + "grad_norm": 7.476824102881447, + "learning_rate": 1.8935469776554589e-06, + "loss": 0.4121, + "step": 39510 + }, + { + "epoch": 2.23, + "grad_norm": 4.751126129917067, + "learning_rate": 1.892262847166157e-06, + "loss": 0.3759, + "step": 39515 + }, + { + "epoch": 2.23, + "grad_norm": 4.399312116418375, + "learning_rate": 1.8909790506169273e-06, + "loss": 0.3744, + "step": 39520 + }, + { + "epoch": 2.23, + "grad_norm": 5.369512931193964, + "learning_rate": 1.8896955881457184e-06, + "loss": 0.4174, + "step": 39525 + }, + { + "epoch": 2.23, + "grad_norm": 4.939418429983109, + "learning_rate": 1.8884124598904474e-06, + "loss": 0.3882, + "step": 39530 + }, + { + "epoch": 2.23, + "grad_norm": 4.672525721857139, + "learning_rate": 1.8871296659889877e-06, + "loss": 0.3809, + "step": 39535 + }, + { + "epoch": 2.23, + "grad_norm": 4.671526954543785, + "learning_rate": 1.8858472065791856e-06, + "loss": 0.3968, + "step": 39540 + }, + { + "epoch": 2.23, + "grad_norm": 4.412224228577191, + "learning_rate": 1.8845650817988454e-06, + "loss": 0.3654, + "step": 39545 + }, + { + "epoch": 2.23, + "grad_norm": 6.236915588437215, + "learning_rate": 1.8832832917857341e-06, + "loss": 0.4079, + "step": 39550 + }, + { + "epoch": 2.23, + "grad_norm": 5.086324031331009, + "learning_rate": 1.8820018366775906e-06, + "loss": 0.3886, + "step": 39555 + }, + { + "epoch": 2.23, + "grad_norm": 5.207902869035254, + "learning_rate": 1.8807207166121084e-06, + "loss": 0.4152, + "step": 39560 + }, + { + "epoch": 2.23, + "grad_norm": 6.762951690033797, + "learning_rate": 1.879439931726953e-06, + "loss": 0.3658, + "step": 39565 + }, + { + "epoch": 2.23, + "grad_norm": 5.518356343370002, + "learning_rate": 1.8781594821597477e-06, + "loss": 0.3787, + "step": 39570 + }, + { + "epoch": 2.23, + "grad_norm": 5.805396373492158, + "learning_rate": 1.876879368048084e-06, + "loss": 0.366, + "step": 39575 + }, + { + "epoch": 2.23, + "grad_norm": 6.932263168030357, + "learning_rate": 1.8755995895295171e-06, + "loss": 0.4302, + "step": 39580 + }, + { + "epoch": 2.23, + "grad_norm": 4.785857077956581, + "learning_rate": 1.8743201467415617e-06, + "loss": 0.4024, + "step": 39585 + }, + { + "epoch": 2.23, + "grad_norm": 5.184166770390915, + "learning_rate": 1.8730410398217024e-06, + "loss": 0.3935, + "step": 39590 + }, + { + "epoch": 2.23, + "grad_norm": 5.5109971914732245, + "learning_rate": 1.8717622689073827e-06, + "loss": 0.3899, + "step": 39595 + }, + { + "epoch": 2.23, + "grad_norm": 4.398428167430032, + "learning_rate": 1.8704838341360148e-06, + "loss": 0.3577, + "step": 39600 + }, + { + "epoch": 2.23, + "grad_norm": 4.951762431716239, + "learning_rate": 1.86920573564497e-06, + "loss": 0.3954, + "step": 39605 + }, + { + "epoch": 2.23, + "grad_norm": 7.120061651291709, + "learning_rate": 1.867927973571585e-06, + "loss": 0.402, + "step": 39610 + }, + { + "epoch": 2.23, + "grad_norm": 5.376852122443278, + "learning_rate": 1.8666505480531638e-06, + "loss": 0.4175, + "step": 39615 + }, + { + "epoch": 2.23, + "grad_norm": 5.659641246982101, + "learning_rate": 1.8653734592269673e-06, + "loss": 0.3508, + "step": 39620 + }, + { + "epoch": 2.23, + "grad_norm": 5.020145632739526, + "learning_rate": 1.8640967072302292e-06, + "loss": 0.4005, + "step": 39625 + }, + { + "epoch": 2.23, + "grad_norm": 6.11957725384284, + "learning_rate": 1.8628202922001381e-06, + "loss": 0.3824, + "step": 39630 + }, + { + "epoch": 2.23, + "grad_norm": 4.660902042465461, + "learning_rate": 1.8615442142738531e-06, + "loss": 0.3362, + "step": 39635 + }, + { + "epoch": 2.23, + "grad_norm": 4.924188483670731, + "learning_rate": 1.860268473588494e-06, + "loss": 0.3803, + "step": 39640 + }, + { + "epoch": 2.23, + "grad_norm": 4.923638465380285, + "learning_rate": 1.858993070281142e-06, + "loss": 0.3908, + "step": 39645 + }, + { + "epoch": 2.23, + "grad_norm": 4.373037236508251, + "learning_rate": 1.8577180044888487e-06, + "loss": 0.3496, + "step": 39650 + }, + { + "epoch": 2.23, + "grad_norm": 4.310046001357836, + "learning_rate": 1.8564432763486222e-06, + "loss": 0.3661, + "step": 39655 + }, + { + "epoch": 2.23, + "grad_norm": 5.196622318158273, + "learning_rate": 1.855168885997441e-06, + "loss": 0.3816, + "step": 39660 + }, + { + "epoch": 2.23, + "grad_norm": 5.044666895592206, + "learning_rate": 1.8538948335722402e-06, + "loss": 0.369, + "step": 39665 + }, + { + "epoch": 2.24, + "grad_norm": 4.962374557992139, + "learning_rate": 1.852621119209924e-06, + "loss": 0.3959, + "step": 39670 + }, + { + "epoch": 2.24, + "grad_norm": 4.746987427245066, + "learning_rate": 1.8513477430473615e-06, + "loss": 0.4152, + "step": 39675 + }, + { + "epoch": 2.24, + "grad_norm": 5.162547842833919, + "learning_rate": 1.850074705221378e-06, + "loss": 0.3811, + "step": 39680 + }, + { + "epoch": 2.24, + "grad_norm": 6.004663789773719, + "learning_rate": 1.8488020058687705e-06, + "loss": 0.3638, + "step": 39685 + }, + { + "epoch": 2.24, + "grad_norm": 7.321604927292, + "learning_rate": 1.8475296451262953e-06, + "loss": 0.3911, + "step": 39690 + }, + { + "epoch": 2.24, + "grad_norm": 4.805710004138573, + "learning_rate": 1.8462576231306706e-06, + "loss": 0.41, + "step": 39695 + }, + { + "epoch": 2.24, + "grad_norm": 5.547511515704962, + "learning_rate": 1.844985940018585e-06, + "loss": 0.3666, + "step": 39700 + }, + { + "epoch": 2.24, + "grad_norm": 4.961191279493139, + "learning_rate": 1.8437145959266821e-06, + "loss": 0.3927, + "step": 39705 + }, + { + "epoch": 2.24, + "grad_norm": 4.787963243117133, + "learning_rate": 1.8424435909915773e-06, + "loss": 0.4067, + "step": 39710 + }, + { + "epoch": 2.24, + "grad_norm": 5.272288553057736, + "learning_rate": 1.8411729253498428e-06, + "loss": 0.3916, + "step": 39715 + }, + { + "epoch": 2.24, + "grad_norm": 6.020562475059103, + "learning_rate": 1.8399025991380197e-06, + "loss": 0.4038, + "step": 39720 + }, + { + "epoch": 2.24, + "grad_norm": 4.656818523628417, + "learning_rate": 1.838632612492609e-06, + "loss": 0.3904, + "step": 39725 + }, + { + "epoch": 2.24, + "grad_norm": 5.092253368498932, + "learning_rate": 1.8373629655500746e-06, + "loss": 0.3799, + "step": 39730 + }, + { + "epoch": 2.24, + "grad_norm": 4.8502567763066775, + "learning_rate": 1.8360936584468486e-06, + "loss": 0.3908, + "step": 39735 + }, + { + "epoch": 2.24, + "grad_norm": 4.287050953157093, + "learning_rate": 1.8348246913193213e-06, + "loss": 0.3703, + "step": 39740 + }, + { + "epoch": 2.24, + "grad_norm": 5.095888948582343, + "learning_rate": 1.8335560643038514e-06, + "loss": 0.3924, + "step": 39745 + }, + { + "epoch": 2.24, + "grad_norm": 5.388213094768144, + "learning_rate": 1.8322877775367553e-06, + "loss": 0.391, + "step": 39750 + }, + { + "epoch": 2.24, + "grad_norm": 4.940983560238933, + "learning_rate": 1.8310198311543192e-06, + "loss": 0.3825, + "step": 39755 + }, + { + "epoch": 2.24, + "grad_norm": 6.368845982457075, + "learning_rate": 1.8297522252927863e-06, + "loss": 0.403, + "step": 39760 + }, + { + "epoch": 2.24, + "grad_norm": 5.800167607118869, + "learning_rate": 1.8284849600883686e-06, + "loss": 0.3962, + "step": 39765 + }, + { + "epoch": 2.24, + "grad_norm": 4.612145761309282, + "learning_rate": 1.82721803567724e-06, + "loss": 0.3682, + "step": 39770 + }, + { + "epoch": 2.24, + "grad_norm": 4.812702118365965, + "learning_rate": 1.8259514521955368e-06, + "loss": 0.4102, + "step": 39775 + }, + { + "epoch": 2.24, + "grad_norm": 4.891077334074156, + "learning_rate": 1.824685209779356e-06, + "loss": 0.4098, + "step": 39780 + }, + { + "epoch": 2.24, + "grad_norm": 4.45152695790426, + "learning_rate": 1.823419308564765e-06, + "loss": 0.4017, + "step": 39785 + }, + { + "epoch": 2.24, + "grad_norm": 4.7630537946305225, + "learning_rate": 1.8221537486877866e-06, + "loss": 0.4222, + "step": 39790 + }, + { + "epoch": 2.24, + "grad_norm": 5.317864713299714, + "learning_rate": 1.8208885302844147e-06, + "loss": 0.4157, + "step": 39795 + }, + { + "epoch": 2.24, + "grad_norm": 5.761185300543549, + "learning_rate": 1.8196236534905992e-06, + "loss": 0.3916, + "step": 39800 + }, + { + "epoch": 2.24, + "grad_norm": 5.30928314552267, + "learning_rate": 1.8183591184422594e-06, + "loss": 0.364, + "step": 39805 + }, + { + "epoch": 2.24, + "grad_norm": 4.9886560228569525, + "learning_rate": 1.8170949252752745e-06, + "loss": 0.3872, + "step": 39810 + }, + { + "epoch": 2.24, + "grad_norm": 4.847642331118586, + "learning_rate": 1.8158310741254853e-06, + "loss": 0.3695, + "step": 39815 + }, + { + "epoch": 2.24, + "grad_norm": 4.640814823407414, + "learning_rate": 1.8145675651287015e-06, + "loss": 0.3861, + "step": 39820 + }, + { + "epoch": 2.24, + "grad_norm": 5.283218923150907, + "learning_rate": 1.8133043984206894e-06, + "loss": 0.3582, + "step": 39825 + }, + { + "epoch": 2.24, + "grad_norm": 4.65729786494483, + "learning_rate": 1.8120415741371855e-06, + "loss": 0.3759, + "step": 39830 + }, + { + "epoch": 2.24, + "grad_norm": 4.579537865520433, + "learning_rate": 1.8107790924138847e-06, + "loss": 0.3697, + "step": 39835 + }, + { + "epoch": 2.24, + "grad_norm": 4.80671901708008, + "learning_rate": 1.8095169533864427e-06, + "loss": 0.3932, + "step": 39840 + }, + { + "epoch": 2.25, + "grad_norm": 6.830477689387796, + "learning_rate": 1.8082551571904854e-06, + "loss": 0.3897, + "step": 39845 + }, + { + "epoch": 2.25, + "grad_norm": 5.8268395670951865, + "learning_rate": 1.8069937039615992e-06, + "loss": 0.4089, + "step": 39850 + }, + { + "epoch": 2.25, + "grad_norm": 5.91057321248474, + "learning_rate": 1.8057325938353298e-06, + "loss": 0.3883, + "step": 39855 + }, + { + "epoch": 2.25, + "grad_norm": 4.285469458486904, + "learning_rate": 1.8044718269471918e-06, + "loss": 0.3751, + "step": 39860 + }, + { + "epoch": 2.25, + "grad_norm": 7.551259040139519, + "learning_rate": 1.8032114034326576e-06, + "loss": 0.396, + "step": 39865 + }, + { + "epoch": 2.25, + "grad_norm": 4.692882712259712, + "learning_rate": 1.8019513234271684e-06, + "loss": 0.3856, + "step": 39870 + }, + { + "epoch": 2.25, + "grad_norm": 4.600972094497757, + "learning_rate": 1.8006915870661218e-06, + "loss": 0.3806, + "step": 39875 + }, + { + "epoch": 2.25, + "grad_norm": 4.49988349118192, + "learning_rate": 1.7994321944848853e-06, + "loss": 0.3649, + "step": 39880 + }, + { + "epoch": 2.25, + "grad_norm": 5.118171304183476, + "learning_rate": 1.7981731458187828e-06, + "loss": 0.3672, + "step": 39885 + }, + { + "epoch": 2.25, + "grad_norm": 6.591487130933895, + "learning_rate": 1.7969144412031086e-06, + "loss": 0.3517, + "step": 39890 + }, + { + "epoch": 2.25, + "grad_norm": 10.145874648692256, + "learning_rate": 1.7956560807731144e-06, + "loss": 0.3779, + "step": 39895 + }, + { + "epoch": 2.25, + "grad_norm": 9.145632912209214, + "learning_rate": 1.7943980646640136e-06, + "loss": 0.3989, + "step": 39900 + }, + { + "epoch": 2.25, + "grad_norm": 5.613315290941895, + "learning_rate": 1.7931403930109904e-06, + "loss": 0.385, + "step": 39905 + }, + { + "epoch": 2.25, + "grad_norm": 8.404404999385008, + "learning_rate": 1.7918830659491832e-06, + "loss": 0.4078, + "step": 39910 + }, + { + "epoch": 2.25, + "grad_norm": 5.832867129164264, + "learning_rate": 1.7906260836137e-06, + "loss": 0.4082, + "step": 39915 + }, + { + "epoch": 2.25, + "grad_norm": 4.84273079348836, + "learning_rate": 1.7893694461396093e-06, + "loss": 0.4031, + "step": 39920 + }, + { + "epoch": 2.25, + "grad_norm": 6.085281413321037, + "learning_rate": 1.788113153661939e-06, + "loss": 0.3423, + "step": 39925 + }, + { + "epoch": 2.25, + "grad_norm": 6.2583943463115554, + "learning_rate": 1.7868572063156868e-06, + "loss": 0.3832, + "step": 39930 + }, + { + "epoch": 2.25, + "grad_norm": 8.184451483121519, + "learning_rate": 1.7856016042358076e-06, + "loss": 0.3908, + "step": 39935 + }, + { + "epoch": 2.25, + "grad_norm": 7.637568268935216, + "learning_rate": 1.7843463475572216e-06, + "loss": 0.3583, + "step": 39940 + }, + { + "epoch": 2.25, + "grad_norm": 10.988223285001286, + "learning_rate": 1.7830914364148145e-06, + "loss": 0.3629, + "step": 39945 + }, + { + "epoch": 2.25, + "grad_norm": 6.474123226405208, + "learning_rate": 1.7818368709434276e-06, + "loss": 0.3732, + "step": 39950 + }, + { + "epoch": 2.25, + "grad_norm": 4.610931757106105, + "learning_rate": 1.7805826512778734e-06, + "loss": 0.3866, + "step": 39955 + }, + { + "epoch": 2.25, + "grad_norm": 4.387604416512119, + "learning_rate": 1.7793287775529206e-06, + "loss": 0.3664, + "step": 39960 + }, + { + "epoch": 2.25, + "grad_norm": 5.3282718625370515, + "learning_rate": 1.778075249903306e-06, + "loss": 0.3921, + "step": 39965 + }, + { + "epoch": 2.25, + "grad_norm": 4.732838549904333, + "learning_rate": 1.7768220684637233e-06, + "loss": 0.3841, + "step": 39970 + }, + { + "epoch": 2.25, + "grad_norm": 6.480958165378972, + "learning_rate": 1.7755692333688363e-06, + "loss": 0.4094, + "step": 39975 + }, + { + "epoch": 2.25, + "grad_norm": 5.8594509812252955, + "learning_rate": 1.7743167447532656e-06, + "loss": 0.3912, + "step": 39980 + }, + { + "epoch": 2.25, + "grad_norm": 5.201267026690023, + "learning_rate": 1.7730646027515947e-06, + "loss": 0.3791, + "step": 39985 + }, + { + "epoch": 2.25, + "grad_norm": 4.904683434394065, + "learning_rate": 1.7718128074983759e-06, + "loss": 0.357, + "step": 39990 + }, + { + "epoch": 2.25, + "grad_norm": 4.614484975725764, + "learning_rate": 1.7705613591281162e-06, + "loss": 0.3913, + "step": 39995 + }, + { + "epoch": 2.25, + "grad_norm": 4.903514375355792, + "learning_rate": 1.769310257775293e-06, + "loss": 0.3841, + "step": 40000 + }, + { + "epoch": 2.25, + "grad_norm": 5.101166798534954, + "learning_rate": 1.7680595035743402e-06, + "loss": 0.3933, + "step": 40005 + }, + { + "epoch": 2.25, + "grad_norm": 4.626978306924325, + "learning_rate": 1.7668090966596556e-06, + "loss": 0.3687, + "step": 40010 + }, + { + "epoch": 2.25, + "grad_norm": 5.118185491288194, + "learning_rate": 1.7655590371656051e-06, + "loss": 0.396, + "step": 40015 + }, + { + "epoch": 2.25, + "grad_norm": 4.51995297695505, + "learning_rate": 1.7643093252265087e-06, + "loss": 0.3408, + "step": 40020 + }, + { + "epoch": 2.26, + "grad_norm": 4.585735743445754, + "learning_rate": 1.7630599609766575e-06, + "loss": 0.3786, + "step": 40025 + }, + { + "epoch": 2.26, + "grad_norm": 5.597941801697267, + "learning_rate": 1.7618109445502969e-06, + "loss": 0.3905, + "step": 40030 + }, + { + "epoch": 2.26, + "grad_norm": 4.291746850834901, + "learning_rate": 1.7605622760816416e-06, + "loss": 0.3717, + "step": 40035 + }, + { + "epoch": 2.26, + "grad_norm": 4.183254091595897, + "learning_rate": 1.7593139557048688e-06, + "loss": 0.3818, + "step": 40040 + }, + { + "epoch": 2.26, + "grad_norm": 5.698888169204857, + "learning_rate": 1.7580659835541115e-06, + "loss": 0.3953, + "step": 40045 + }, + { + "epoch": 2.26, + "grad_norm": 4.08503488443472, + "learning_rate": 1.7568183597634736e-06, + "loss": 0.3478, + "step": 40050 + }, + { + "epoch": 2.26, + "grad_norm": 6.575952842973665, + "learning_rate": 1.755571084467015e-06, + "loss": 0.4021, + "step": 40055 + }, + { + "epoch": 2.26, + "grad_norm": 4.717343849842922, + "learning_rate": 1.7543241577987634e-06, + "loss": 0.3807, + "step": 40060 + }, + { + "epoch": 2.26, + "grad_norm": 4.896224140805982, + "learning_rate": 1.7530775798927053e-06, + "loss": 0.3848, + "step": 40065 + }, + { + "epoch": 2.26, + "grad_norm": 6.206786661431114, + "learning_rate": 1.7518313508827895e-06, + "loss": 0.3833, + "step": 40070 + }, + { + "epoch": 2.26, + "grad_norm": 5.457832631442598, + "learning_rate": 1.7505854709029319e-06, + "loss": 0.3935, + "step": 40075 + }, + { + "epoch": 2.26, + "grad_norm": 4.77053742441305, + "learning_rate": 1.7493399400870042e-06, + "loss": 0.3876, + "step": 40080 + }, + { + "epoch": 2.26, + "grad_norm": 5.315225688429048, + "learning_rate": 1.7480947585688486e-06, + "loss": 0.3804, + "step": 40085 + }, + { + "epoch": 2.26, + "grad_norm": 5.007213698493856, + "learning_rate": 1.7468499264822624e-06, + "loss": 0.3972, + "step": 40090 + }, + { + "epoch": 2.26, + "grad_norm": 4.255648013712933, + "learning_rate": 1.7456054439610076e-06, + "loss": 0.3585, + "step": 40095 + }, + { + "epoch": 2.26, + "grad_norm": 4.616146107390057, + "learning_rate": 1.7443613111388119e-06, + "loss": 0.3908, + "step": 40100 + }, + { + "epoch": 2.26, + "grad_norm": 4.7905542605923, + "learning_rate": 1.74311752814936e-06, + "loss": 0.4015, + "step": 40105 + }, + { + "epoch": 2.26, + "grad_norm": 5.456087471308075, + "learning_rate": 1.7418740951263053e-06, + "loss": 0.3468, + "step": 40110 + }, + { + "epoch": 2.26, + "grad_norm": 4.476592988351036, + "learning_rate": 1.740631012203256e-06, + "loss": 0.4064, + "step": 40115 + }, + { + "epoch": 2.26, + "grad_norm": 5.131544441389975, + "learning_rate": 1.739388279513791e-06, + "loss": 0.3988, + "step": 40120 + }, + { + "epoch": 2.26, + "grad_norm": 5.603534983768999, + "learning_rate": 1.7381458971914444e-06, + "loss": 0.3639, + "step": 40125 + }, + { + "epoch": 2.26, + "grad_norm": 8.43381883975848, + "learning_rate": 1.7369038653697163e-06, + "loss": 0.3781, + "step": 40130 + }, + { + "epoch": 2.26, + "grad_norm": 5.003059387400197, + "learning_rate": 1.7356621841820708e-06, + "loss": 0.3895, + "step": 40135 + }, + { + "epoch": 2.26, + "grad_norm": 4.452544345035951, + "learning_rate": 1.734420853761929e-06, + "loss": 0.4036, + "step": 40140 + }, + { + "epoch": 2.26, + "grad_norm": 5.69387442137243, + "learning_rate": 1.7331798742426798e-06, + "loss": 0.4047, + "step": 40145 + }, + { + "epoch": 2.26, + "grad_norm": 5.121479283529866, + "learning_rate": 1.7319392457576706e-06, + "loss": 0.3916, + "step": 40150 + }, + { + "epoch": 2.26, + "grad_norm": 4.633748456271346, + "learning_rate": 1.730698968440211e-06, + "loss": 0.3559, + "step": 40155 + }, + { + "epoch": 2.26, + "grad_norm": 5.2209201655120925, + "learning_rate": 1.7294590424235775e-06, + "loss": 0.3842, + "step": 40160 + }, + { + "epoch": 2.26, + "grad_norm": 4.787937888618413, + "learning_rate": 1.7282194678410019e-06, + "loss": 0.3768, + "step": 40165 + }, + { + "epoch": 2.26, + "grad_norm": 4.726384762671759, + "learning_rate": 1.7269802448256857e-06, + "loss": 0.3633, + "step": 40170 + }, + { + "epoch": 2.26, + "grad_norm": 5.48174191295697, + "learning_rate": 1.7257413735107874e-06, + "loss": 0.3756, + "step": 40175 + }, + { + "epoch": 2.26, + "grad_norm": 5.32187970316261, + "learning_rate": 1.7245028540294273e-06, + "loss": 0.4005, + "step": 40180 + }, + { + "epoch": 2.26, + "grad_norm": 5.091410517401763, + "learning_rate": 1.7232646865146925e-06, + "loss": 0.3681, + "step": 40185 + }, + { + "epoch": 2.26, + "grad_norm": 9.237693585398954, + "learning_rate": 1.7220268710996273e-06, + "loss": 0.402, + "step": 40190 + }, + { + "epoch": 2.26, + "grad_norm": 4.939114523363182, + "learning_rate": 1.7207894079172432e-06, + "loss": 0.4014, + "step": 40195 + }, + { + "epoch": 2.27, + "grad_norm": 7.504750750160518, + "learning_rate": 1.719552297100508e-06, + "loss": 0.4257, + "step": 40200 + }, + { + "epoch": 2.27, + "grad_norm": 5.6011876625089485, + "learning_rate": 1.7183155387823575e-06, + "loss": 0.4136, + "step": 40205 + }, + { + "epoch": 2.27, + "grad_norm": 5.318831549647662, + "learning_rate": 1.7170791330956836e-06, + "loss": 0.3744, + "step": 40210 + }, + { + "epoch": 2.27, + "grad_norm": 4.512806244539759, + "learning_rate": 1.7158430801733478e-06, + "loss": 0.3936, + "step": 40215 + }, + { + "epoch": 2.27, + "grad_norm": 4.398991443550248, + "learning_rate": 1.714607380148165e-06, + "loss": 0.3674, + "step": 40220 + }, + { + "epoch": 2.27, + "grad_norm": 4.834808978448424, + "learning_rate": 1.7133720331529192e-06, + "loss": 0.4031, + "step": 40225 + }, + { + "epoch": 2.27, + "grad_norm": 6.234458990469325, + "learning_rate": 1.7121370393203545e-06, + "loss": 0.3526, + "step": 40230 + }, + { + "epoch": 2.27, + "grad_norm": 5.230986911149694, + "learning_rate": 1.7109023987831758e-06, + "loss": 0.4306, + "step": 40235 + }, + { + "epoch": 2.27, + "grad_norm": 4.260164425173618, + "learning_rate": 1.7096681116740489e-06, + "loss": 0.3653, + "step": 40240 + }, + { + "epoch": 2.27, + "grad_norm": 5.09656629929404, + "learning_rate": 1.7084341781256058e-06, + "loss": 0.3697, + "step": 40245 + }, + { + "epoch": 2.27, + "grad_norm": 4.9899251099614945, + "learning_rate": 1.7072005982704354e-06, + "loss": 0.3697, + "step": 40250 + }, + { + "epoch": 2.27, + "grad_norm": 4.608311515844059, + "learning_rate": 1.7059673722410953e-06, + "loss": 0.3504, + "step": 40255 + }, + { + "epoch": 2.27, + "grad_norm": 5.661292840743603, + "learning_rate": 1.7047345001700981e-06, + "loss": 0.4, + "step": 40260 + }, + { + "epoch": 2.27, + "grad_norm": 5.65632882069433, + "learning_rate": 1.70350198218992e-06, + "loss": 0.3672, + "step": 40265 + }, + { + "epoch": 2.27, + "grad_norm": 5.380331718138243, + "learning_rate": 1.702269818433005e-06, + "loss": 0.3983, + "step": 40270 + }, + { + "epoch": 2.27, + "grad_norm": 5.4660712596552745, + "learning_rate": 1.7010380090317496e-06, + "loss": 0.3802, + "step": 40275 + }, + { + "epoch": 2.27, + "grad_norm": 4.887172784336666, + "learning_rate": 1.6998065541185216e-06, + "loss": 0.3907, + "step": 40280 + }, + { + "epoch": 2.27, + "grad_norm": 6.2312738967975285, + "learning_rate": 1.6985754538256427e-06, + "loss": 0.4181, + "step": 40285 + }, + { + "epoch": 2.27, + "grad_norm": 5.17169900427142, + "learning_rate": 1.6973447082854027e-06, + "loss": 0.363, + "step": 40290 + }, + { + "epoch": 2.27, + "grad_norm": 5.041953468166664, + "learning_rate": 1.6961143176300503e-06, + "loss": 0.3733, + "step": 40295 + }, + { + "epoch": 2.27, + "grad_norm": 4.670575023396616, + "learning_rate": 1.6948842819917933e-06, + "loss": 0.3909, + "step": 40300 + }, + { + "epoch": 2.27, + "grad_norm": 5.05039389259463, + "learning_rate": 1.6936546015028072e-06, + "loss": 0.4076, + "step": 40305 + }, + { + "epoch": 2.27, + "grad_norm": 4.382956696895827, + "learning_rate": 1.6924252762952275e-06, + "loss": 0.3801, + "step": 40310 + }, + { + "epoch": 2.27, + "grad_norm": 4.530599516911307, + "learning_rate": 1.6911963065011478e-06, + "loss": 0.4104, + "step": 40315 + }, + { + "epoch": 2.27, + "grad_norm": 4.374977071160649, + "learning_rate": 1.6899676922526297e-06, + "loss": 0.3685, + "step": 40320 + }, + { + "epoch": 2.27, + "grad_norm": 4.856868607579105, + "learning_rate": 1.6887394336816892e-06, + "loss": 0.3841, + "step": 40325 + }, + { + "epoch": 2.27, + "grad_norm": 4.822420778666742, + "learning_rate": 1.6875115309203128e-06, + "loss": 0.3677, + "step": 40330 + }, + { + "epoch": 2.27, + "grad_norm": 4.980018474505903, + "learning_rate": 1.6862839841004392e-06, + "loss": 0.3813, + "step": 40335 + }, + { + "epoch": 2.27, + "grad_norm": 6.318868293061312, + "learning_rate": 1.6850567933539774e-06, + "loss": 0.3865, + "step": 40340 + }, + { + "epoch": 2.27, + "grad_norm": 6.28277277721484, + "learning_rate": 1.6838299588127937e-06, + "loss": 0.403, + "step": 40345 + }, + { + "epoch": 2.27, + "grad_norm": 4.37803981519213, + "learning_rate": 1.6826034806087144e-06, + "loss": 0.3591, + "step": 40350 + }, + { + "epoch": 2.27, + "grad_norm": 4.519348203250091, + "learning_rate": 1.6813773588735338e-06, + "loss": 0.4053, + "step": 40355 + }, + { + "epoch": 2.27, + "grad_norm": 4.767719750048162, + "learning_rate": 1.6801515937390001e-06, + "loss": 0.3545, + "step": 40360 + }, + { + "epoch": 2.27, + "grad_norm": 5.166497253582355, + "learning_rate": 1.678926185336831e-06, + "loss": 0.3967, + "step": 40365 + }, + { + "epoch": 2.27, + "grad_norm": 4.3982206883441135, + "learning_rate": 1.6777011337986992e-06, + "loss": 0.3432, + "step": 40370 + }, + { + "epoch": 2.27, + "grad_norm": 4.561414973283749, + "learning_rate": 1.676476439256244e-06, + "loss": 0.3918, + "step": 40375 + }, + { + "epoch": 2.28, + "grad_norm": 4.428004784875688, + "learning_rate": 1.6752521018410633e-06, + "loss": 0.3739, + "step": 40380 + }, + { + "epoch": 2.28, + "grad_norm": 5.195053332195572, + "learning_rate": 1.6740281216847165e-06, + "loss": 0.365, + "step": 40385 + }, + { + "epoch": 2.28, + "grad_norm": 4.921958038547423, + "learning_rate": 1.6728044989187276e-06, + "loss": 0.3986, + "step": 40390 + }, + { + "epoch": 2.28, + "grad_norm": 4.7015340087411674, + "learning_rate": 1.671581233674578e-06, + "loss": 0.4132, + "step": 40395 + }, + { + "epoch": 2.28, + "grad_norm": 4.311419329575143, + "learning_rate": 1.6703583260837141e-06, + "loss": 0.3884, + "step": 40400 + }, + { + "epoch": 2.28, + "grad_norm": 5.4625246607491045, + "learning_rate": 1.6691357762775445e-06, + "loss": 0.3902, + "step": 40405 + }, + { + "epoch": 2.28, + "grad_norm": 4.81466550448462, + "learning_rate": 1.6679135843874344e-06, + "loss": 0.3725, + "step": 40410 + }, + { + "epoch": 2.28, + "grad_norm": 6.390794897561129, + "learning_rate": 1.6666917505447171e-06, + "loss": 0.3771, + "step": 40415 + }, + { + "epoch": 2.28, + "grad_norm": 5.0075798180616005, + "learning_rate": 1.6654702748806806e-06, + "loss": 0.4033, + "step": 40420 + }, + { + "epoch": 2.28, + "grad_norm": 5.067239046390207, + "learning_rate": 1.6642491575265807e-06, + "loss": 0.3808, + "step": 40425 + }, + { + "epoch": 2.28, + "grad_norm": 4.249901217365233, + "learning_rate": 1.6630283986136308e-06, + "loss": 0.3533, + "step": 40430 + }, + { + "epoch": 2.28, + "grad_norm": 6.202734373070795, + "learning_rate": 1.6618079982730052e-06, + "loss": 0.3619, + "step": 40435 + }, + { + "epoch": 2.28, + "grad_norm": 5.502412149561305, + "learning_rate": 1.660587956635844e-06, + "loss": 0.3837, + "step": 40440 + }, + { + "epoch": 2.28, + "grad_norm": 5.431563390945679, + "learning_rate": 1.6593682738332434e-06, + "loss": 0.4127, + "step": 40445 + }, + { + "epoch": 2.28, + "grad_norm": 4.680340853441706, + "learning_rate": 1.658148949996267e-06, + "loss": 0.3825, + "step": 40450 + }, + { + "epoch": 2.28, + "grad_norm": 5.629858676737288, + "learning_rate": 1.656929985255933e-06, + "loss": 0.3564, + "step": 40455 + }, + { + "epoch": 2.28, + "grad_norm": 5.03217003073883, + "learning_rate": 1.6557113797432273e-06, + "loss": 0.3768, + "step": 40460 + }, + { + "epoch": 2.28, + "grad_norm": 5.5766269217099955, + "learning_rate": 1.6544931335890934e-06, + "loss": 0.3908, + "step": 40465 + }, + { + "epoch": 2.28, + "grad_norm": 4.338633797693593, + "learning_rate": 1.6532752469244362e-06, + "loss": 0.361, + "step": 40470 + }, + { + "epoch": 2.28, + "grad_norm": 6.592202192053285, + "learning_rate": 1.6520577198801247e-06, + "loss": 0.3881, + "step": 40475 + }, + { + "epoch": 2.28, + "grad_norm": 4.821523306890911, + "learning_rate": 1.6508405525869859e-06, + "loss": 0.3807, + "step": 40480 + }, + { + "epoch": 2.28, + "grad_norm": 5.025014604605802, + "learning_rate": 1.6496237451758118e-06, + "loss": 0.3658, + "step": 40485 + }, + { + "epoch": 2.28, + "grad_norm": 4.756637352965077, + "learning_rate": 1.6484072977773512e-06, + "loss": 0.3922, + "step": 40490 + }, + { + "epoch": 2.28, + "grad_norm": 5.713603424803598, + "learning_rate": 1.6471912105223182e-06, + "loss": 0.3752, + "step": 40495 + }, + { + "epoch": 2.28, + "grad_norm": 6.496059990610466, + "learning_rate": 1.6459754835413882e-06, + "loss": 0.3894, + "step": 40500 + }, + { + "epoch": 2.28, + "grad_norm": 4.663798283636861, + "learning_rate": 1.6447601169651933e-06, + "loss": 0.3922, + "step": 40505 + }, + { + "epoch": 2.28, + "grad_norm": 5.76114830557741, + "learning_rate": 1.6435451109243334e-06, + "loss": 0.3568, + "step": 40510 + }, + { + "epoch": 2.28, + "grad_norm": 4.470802302673014, + "learning_rate": 1.6423304655493627e-06, + "loss": 0.3777, + "step": 40515 + }, + { + "epoch": 2.28, + "grad_norm": 5.110184425128485, + "learning_rate": 1.6411161809708038e-06, + "loss": 0.3803, + "step": 40520 + }, + { + "epoch": 2.28, + "grad_norm": 4.355958891754689, + "learning_rate": 1.6399022573191347e-06, + "loss": 0.3721, + "step": 40525 + }, + { + "epoch": 2.28, + "grad_norm": 4.7045848819087475, + "learning_rate": 1.6386886947247959e-06, + "loss": 0.3805, + "step": 40530 + }, + { + "epoch": 2.28, + "grad_norm": 4.500835312947065, + "learning_rate": 1.6374754933181936e-06, + "loss": 0.3665, + "step": 40535 + }, + { + "epoch": 2.28, + "grad_norm": 5.791410238145022, + "learning_rate": 1.6362626532296872e-06, + "loss": 0.3907, + "step": 40540 + }, + { + "epoch": 2.28, + "grad_norm": 4.969446936783075, + "learning_rate": 1.6350501745896064e-06, + "loss": 0.406, + "step": 40545 + }, + { + "epoch": 2.28, + "grad_norm": 4.809957590921652, + "learning_rate": 1.6338380575282341e-06, + "loss": 0.3676, + "step": 40550 + }, + { + "epoch": 2.29, + "grad_norm": 8.84334907773835, + "learning_rate": 1.6326263021758177e-06, + "loss": 0.4029, + "step": 40555 + }, + { + "epoch": 2.29, + "grad_norm": 4.860346619578493, + "learning_rate": 1.6314149086625674e-06, + "loss": 0.3833, + "step": 40560 + }, + { + "epoch": 2.29, + "grad_norm": 4.672873076175617, + "learning_rate": 1.6302038771186507e-06, + "loss": 0.3573, + "step": 40565 + }, + { + "epoch": 2.29, + "grad_norm": 4.565323847713768, + "learning_rate": 1.628993207674201e-06, + "loss": 0.3729, + "step": 40570 + }, + { + "epoch": 2.29, + "grad_norm": 4.871387321383843, + "learning_rate": 1.6277829004593065e-06, + "loss": 0.3661, + "step": 40575 + }, + { + "epoch": 2.29, + "grad_norm": 4.921764024629117, + "learning_rate": 1.6265729556040243e-06, + "loss": 0.3792, + "step": 40580 + }, + { + "epoch": 2.29, + "grad_norm": 6.628195277449639, + "learning_rate": 1.625363373238364e-06, + "loss": 0.4596, + "step": 40585 + }, + { + "epoch": 2.29, + "grad_norm": 4.241764352139462, + "learning_rate": 1.6241541534923028e-06, + "loss": 0.3558, + "step": 40590 + }, + { + "epoch": 2.29, + "grad_norm": 5.467257929690298, + "learning_rate": 1.622945296495778e-06, + "loss": 0.386, + "step": 40595 + }, + { + "epoch": 2.29, + "grad_norm": 5.809227856666526, + "learning_rate": 1.6217368023786838e-06, + "loss": 0.4057, + "step": 40600 + }, + { + "epoch": 2.29, + "grad_norm": 4.708594995182603, + "learning_rate": 1.6205286712708813e-06, + "loss": 0.3903, + "step": 40605 + }, + { + "epoch": 2.29, + "grad_norm": 4.8796229505159925, + "learning_rate": 1.619320903302188e-06, + "loss": 0.3808, + "step": 40610 + }, + { + "epoch": 2.29, + "grad_norm": 4.678629484760158, + "learning_rate": 1.6181134986023817e-06, + "loss": 0.3798, + "step": 40615 + }, + { + "epoch": 2.29, + "grad_norm": 4.720973732849028, + "learning_rate": 1.6169064573012072e-06, + "loss": 0.3715, + "step": 40620 + }, + { + "epoch": 2.29, + "grad_norm": 4.814377236137956, + "learning_rate": 1.6156997795283624e-06, + "loss": 0.3783, + "step": 40625 + }, + { + "epoch": 2.29, + "grad_norm": 5.417103814417879, + "learning_rate": 1.6144934654135148e-06, + "loss": 0.3753, + "step": 40630 + }, + { + "epoch": 2.29, + "grad_norm": 5.0065188606912745, + "learning_rate": 1.6132875150862853e-06, + "loss": 0.4103, + "step": 40635 + }, + { + "epoch": 2.29, + "grad_norm": 4.9714511946192355, + "learning_rate": 1.6120819286762574e-06, + "loss": 0.4156, + "step": 40640 + }, + { + "epoch": 2.29, + "grad_norm": 4.317206691917036, + "learning_rate": 1.6108767063129798e-06, + "loss": 0.3897, + "step": 40645 + }, + { + "epoch": 2.29, + "grad_norm": 4.861857406802999, + "learning_rate": 1.609671848125956e-06, + "loss": 0.3758, + "step": 40650 + }, + { + "epoch": 2.29, + "grad_norm": 4.81094751787445, + "learning_rate": 1.6084673542446567e-06, + "loss": 0.3742, + "step": 40655 + }, + { + "epoch": 2.29, + "grad_norm": 4.436022098390143, + "learning_rate": 1.6072632247985075e-06, + "loss": 0.3881, + "step": 40660 + }, + { + "epoch": 2.29, + "grad_norm": 5.927519085257424, + "learning_rate": 1.6060594599168972e-06, + "loss": 0.4152, + "step": 40665 + }, + { + "epoch": 2.29, + "grad_norm": 4.802895990350653, + "learning_rate": 1.6048560597291758e-06, + "loss": 0.3883, + "step": 40670 + }, + { + "epoch": 2.29, + "grad_norm": 4.997356432757642, + "learning_rate": 1.6036530243646564e-06, + "loss": 0.3541, + "step": 40675 + }, + { + "epoch": 2.29, + "grad_norm": 5.178252585774607, + "learning_rate": 1.6024503539526077e-06, + "loss": 0.3643, + "step": 40680 + }, + { + "epoch": 2.29, + "grad_norm": 5.81546052738333, + "learning_rate": 1.6012480486222626e-06, + "loss": 0.4504, + "step": 40685 + }, + { + "epoch": 2.29, + "grad_norm": 6.6443873418273425, + "learning_rate": 1.6000461085028162e-06, + "loss": 0.3774, + "step": 40690 + }, + { + "epoch": 2.29, + "grad_norm": 5.20862116748695, + "learning_rate": 1.5988445337234205e-06, + "loss": 0.3922, + "step": 40695 + }, + { + "epoch": 2.29, + "grad_norm": 5.090724390447631, + "learning_rate": 1.597643324413189e-06, + "loss": 0.3798, + "step": 40700 + }, + { + "epoch": 2.29, + "grad_norm": 6.972254039083642, + "learning_rate": 1.596442480701199e-06, + "loss": 0.3624, + "step": 40705 + }, + { + "epoch": 2.29, + "grad_norm": 5.308436788062177, + "learning_rate": 1.5952420027164839e-06, + "loss": 0.3603, + "step": 40710 + }, + { + "epoch": 2.29, + "grad_norm": 5.839966926446589, + "learning_rate": 1.594041890588044e-06, + "loss": 0.4028, + "step": 40715 + }, + { + "epoch": 2.29, + "grad_norm": 5.426533284941413, + "learning_rate": 1.5928421444448344e-06, + "loss": 0.3984, + "step": 40720 + }, + { + "epoch": 2.29, + "grad_norm": 4.334377695764412, + "learning_rate": 1.5916427644157717e-06, + "loss": 0.3652, + "step": 40725 + }, + { + "epoch": 2.29, + "grad_norm": 5.13341820801301, + "learning_rate": 1.5904437506297376e-06, + "loss": 0.3659, + "step": 40730 + }, + { + "epoch": 2.3, + "grad_norm": 4.811878173929772, + "learning_rate": 1.5892451032155686e-06, + "loss": 0.3971, + "step": 40735 + }, + { + "epoch": 2.3, + "grad_norm": 4.839613876293462, + "learning_rate": 1.588046822302068e-06, + "loss": 0.3931, + "step": 40740 + }, + { + "epoch": 2.3, + "grad_norm": 4.641267564857314, + "learning_rate": 1.586848908017994e-06, + "loss": 0.3594, + "step": 40745 + }, + { + "epoch": 2.3, + "grad_norm": 5.265078255817545, + "learning_rate": 1.5856513604920665e-06, + "loss": 0.3887, + "step": 40750 + }, + { + "epoch": 2.3, + "grad_norm": 4.924287364070042, + "learning_rate": 1.5844541798529706e-06, + "loss": 0.3897, + "step": 40755 + }, + { + "epoch": 2.3, + "grad_norm": 4.418766995878838, + "learning_rate": 1.5832573662293455e-06, + "loss": 0.3786, + "step": 40760 + }, + { + "epoch": 2.3, + "grad_norm": 4.247511210258513, + "learning_rate": 1.582060919749796e-06, + "loss": 0.3551, + "step": 40765 + }, + { + "epoch": 2.3, + "grad_norm": 4.783076149553892, + "learning_rate": 1.5808648405428866e-06, + "loss": 0.3794, + "step": 40770 + }, + { + "epoch": 2.3, + "grad_norm": 6.568669354521603, + "learning_rate": 1.579669128737138e-06, + "loss": 0.3932, + "step": 40775 + }, + { + "epoch": 2.3, + "grad_norm": 5.404243627253454, + "learning_rate": 1.5784737844610388e-06, + "loss": 0.3793, + "step": 40780 + }, + { + "epoch": 2.3, + "grad_norm": 5.471633109198061, + "learning_rate": 1.5772788078430295e-06, + "loss": 0.4197, + "step": 40785 + }, + { + "epoch": 2.3, + "grad_norm": 5.17267264392894, + "learning_rate": 1.57608419901152e-06, + "loss": 0.379, + "step": 40790 + }, + { + "epoch": 2.3, + "grad_norm": 4.8909752234693284, + "learning_rate": 1.5748899580948723e-06, + "loss": 0.3922, + "step": 40795 + }, + { + "epoch": 2.3, + "grad_norm": 4.530214688417191, + "learning_rate": 1.5736960852214167e-06, + "loss": 0.3865, + "step": 40800 + }, + { + "epoch": 2.3, + "grad_norm": 4.536300365596513, + "learning_rate": 1.5725025805194377e-06, + "loss": 0.3733, + "step": 40805 + }, + { + "epoch": 2.3, + "grad_norm": 4.738979810237718, + "learning_rate": 1.5713094441171818e-06, + "loss": 0.4215, + "step": 40810 + }, + { + "epoch": 2.3, + "grad_norm": 5.6112891500415, + "learning_rate": 1.5701166761428599e-06, + "loss": 0.3837, + "step": 40815 + }, + { + "epoch": 2.3, + "grad_norm": 5.267322599042935, + "learning_rate": 1.5689242767246366e-06, + "loss": 0.3814, + "step": 40820 + }, + { + "epoch": 2.3, + "grad_norm": 4.715174760088558, + "learning_rate": 1.5677322459906436e-06, + "loss": 0.3578, + "step": 40825 + }, + { + "epoch": 2.3, + "grad_norm": 5.234044556174818, + "learning_rate": 1.566540584068969e-06, + "loss": 0.3827, + "step": 40830 + }, + { + "epoch": 2.3, + "grad_norm": 5.479917105729541, + "learning_rate": 1.5653492910876595e-06, + "loss": 0.3746, + "step": 40835 + }, + { + "epoch": 2.3, + "grad_norm": 4.679943290035979, + "learning_rate": 1.564158367174729e-06, + "loss": 0.3906, + "step": 40840 + }, + { + "epoch": 2.3, + "grad_norm": 7.136481668956996, + "learning_rate": 1.5629678124581438e-06, + "loss": 0.3901, + "step": 40845 + }, + { + "epoch": 2.3, + "grad_norm": 5.376873173911454, + "learning_rate": 1.5617776270658374e-06, + "loss": 0.3857, + "step": 40850 + }, + { + "epoch": 2.3, + "grad_norm": 4.394554983598227, + "learning_rate": 1.5605878111256972e-06, + "loss": 0.3566, + "step": 40855 + }, + { + "epoch": 2.3, + "grad_norm": 4.481350604069281, + "learning_rate": 1.5593983647655764e-06, + "loss": 0.35, + "step": 40860 + }, + { + "epoch": 2.3, + "grad_norm": 4.353970597670246, + "learning_rate": 1.558209288113287e-06, + "loss": 0.3553, + "step": 40865 + }, + { + "epoch": 2.3, + "grad_norm": 5.064622350006163, + "learning_rate": 1.5570205812965982e-06, + "loss": 0.3855, + "step": 40870 + }, + { + "epoch": 2.3, + "grad_norm": 4.966188123128139, + "learning_rate": 1.5558322444432444e-06, + "loss": 0.3888, + "step": 40875 + }, + { + "epoch": 2.3, + "grad_norm": 4.345661301466502, + "learning_rate": 1.554644277680915e-06, + "loss": 0.3775, + "step": 40880 + }, + { + "epoch": 2.3, + "grad_norm": 5.868666959598315, + "learning_rate": 1.5534566811372648e-06, + "loss": 0.3879, + "step": 40885 + }, + { + "epoch": 2.3, + "grad_norm": 6.154470452715487, + "learning_rate": 1.5522694549399047e-06, + "loss": 0.3642, + "step": 40890 + }, + { + "epoch": 2.3, + "grad_norm": 5.184623228605581, + "learning_rate": 1.5510825992164063e-06, + "loss": 0.3871, + "step": 40895 + }, + { + "epoch": 2.3, + "grad_norm": 5.153310343958727, + "learning_rate": 1.5498961140943053e-06, + "loss": 0.3916, + "step": 40900 + }, + { + "epoch": 2.3, + "grad_norm": 7.562091939341356, + "learning_rate": 1.5487099997010918e-06, + "loss": 0.3909, + "step": 40905 + }, + { + "epoch": 2.31, + "grad_norm": 6.3833323057626945, + "learning_rate": 1.5475242561642218e-06, + "loss": 0.3969, + "step": 40910 + }, + { + "epoch": 2.31, + "grad_norm": 4.311173049896008, + "learning_rate": 1.5463388836111083e-06, + "loss": 0.3805, + "step": 40915 + }, + { + "epoch": 2.31, + "grad_norm": 4.621440572540954, + "learning_rate": 1.5451538821691214e-06, + "loss": 0.3626, + "step": 40920 + }, + { + "epoch": 2.31, + "grad_norm": 5.674777468191818, + "learning_rate": 1.543969251965599e-06, + "loss": 0.4025, + "step": 40925 + }, + { + "epoch": 2.31, + "grad_norm": 5.557421883043057, + "learning_rate": 1.542784993127832e-06, + "loss": 0.3675, + "step": 40930 + }, + { + "epoch": 2.31, + "grad_norm": 5.129566341297116, + "learning_rate": 1.5416011057830765e-06, + "loss": 0.3873, + "step": 40935 + }, + { + "epoch": 2.31, + "grad_norm": 4.563078388464786, + "learning_rate": 1.540417590058544e-06, + "loss": 0.3591, + "step": 40940 + }, + { + "epoch": 2.31, + "grad_norm": 4.244213722893947, + "learning_rate": 1.5392344460814113e-06, + "loss": 0.4273, + "step": 40945 + }, + { + "epoch": 2.31, + "grad_norm": 4.094560947769992, + "learning_rate": 1.5380516739788087e-06, + "loss": 0.3682, + "step": 40950 + }, + { + "epoch": 2.31, + "grad_norm": 5.505086647240551, + "learning_rate": 1.5368692738778335e-06, + "loss": 0.387, + "step": 40955 + }, + { + "epoch": 2.31, + "grad_norm": 6.498789918590851, + "learning_rate": 1.5356872459055394e-06, + "loss": 0.3465, + "step": 40960 + }, + { + "epoch": 2.31, + "grad_norm": 4.820769572094417, + "learning_rate": 1.5345055901889388e-06, + "loss": 0.394, + "step": 40965 + }, + { + "epoch": 2.31, + "grad_norm": 4.791439790016614, + "learning_rate": 1.5333243068550085e-06, + "loss": 0.3751, + "step": 40970 + }, + { + "epoch": 2.31, + "grad_norm": 4.55250492826794, + "learning_rate": 1.5321433960306808e-06, + "loss": 0.3693, + "step": 40975 + }, + { + "epoch": 2.31, + "grad_norm": 6.270833146308453, + "learning_rate": 1.5309628578428487e-06, + "loss": 0.3558, + "step": 40980 + }, + { + "epoch": 2.31, + "grad_norm": 4.659859310376631, + "learning_rate": 1.529782692418369e-06, + "loss": 0.3842, + "step": 40985 + }, + { + "epoch": 2.31, + "grad_norm": 6.083723396185362, + "learning_rate": 1.528602899884053e-06, + "loss": 0.3744, + "step": 40990 + }, + { + "epoch": 2.31, + "grad_norm": 8.007805627671026, + "learning_rate": 1.527423480366677e-06, + "loss": 0.3797, + "step": 40995 + }, + { + "epoch": 2.31, + "grad_norm": 5.629745728060397, + "learning_rate": 1.526244433992972e-06, + "loss": 0.3707, + "step": 41000 + }, + { + "epoch": 2.31, + "grad_norm": 5.29122520084207, + "learning_rate": 1.5250657608896359e-06, + "loss": 0.369, + "step": 41005 + }, + { + "epoch": 2.31, + "grad_norm": 5.0311020264316575, + "learning_rate": 1.5238874611833188e-06, + "loss": 0.3878, + "step": 41010 + }, + { + "epoch": 2.31, + "grad_norm": 5.058390944753647, + "learning_rate": 1.5227095350006344e-06, + "loss": 0.3781, + "step": 41015 + }, + { + "epoch": 2.31, + "grad_norm": 4.338325784886404, + "learning_rate": 1.521531982468158e-06, + "loss": 0.3644, + "step": 41020 + }, + { + "epoch": 2.31, + "grad_norm": 5.424131997517405, + "learning_rate": 1.5203548037124199e-06, + "loss": 0.3848, + "step": 41025 + }, + { + "epoch": 2.31, + "grad_norm": 4.815794483256725, + "learning_rate": 1.5191779988599176e-06, + "loss": 0.3626, + "step": 41030 + }, + { + "epoch": 2.31, + "grad_norm": 4.924553387563179, + "learning_rate": 1.518001568037099e-06, + "loss": 0.3728, + "step": 41035 + }, + { + "epoch": 2.31, + "grad_norm": 5.723624819856848, + "learning_rate": 1.5168255113703805e-06, + "loss": 0.3686, + "step": 41040 + }, + { + "epoch": 2.31, + "grad_norm": 4.432214576467207, + "learning_rate": 1.5156498289861321e-06, + "loss": 0.3797, + "step": 41045 + }, + { + "epoch": 2.31, + "grad_norm": 5.246535250718635, + "learning_rate": 1.514474521010687e-06, + "loss": 0.3598, + "step": 41050 + }, + { + "epoch": 2.31, + "grad_norm": 6.387679548552412, + "learning_rate": 1.5132995875703393e-06, + "loss": 0.3706, + "step": 41055 + }, + { + "epoch": 2.31, + "grad_norm": 4.845500059135731, + "learning_rate": 1.5121250287913387e-06, + "loss": 0.3693, + "step": 41060 + }, + { + "epoch": 2.31, + "grad_norm": 9.057007563879125, + "learning_rate": 1.5109508447998956e-06, + "loss": 0.3936, + "step": 41065 + }, + { + "epoch": 2.31, + "grad_norm": 4.706049512572617, + "learning_rate": 1.5097770357221836e-06, + "loss": 0.3863, + "step": 41070 + }, + { + "epoch": 2.31, + "grad_norm": 6.334496043402186, + "learning_rate": 1.5086036016843313e-06, + "loss": 0.4276, + "step": 41075 + }, + { + "epoch": 2.31, + "grad_norm": 5.243426822776101, + "learning_rate": 1.5074305428124325e-06, + "loss": 0.3561, + "step": 41080 + }, + { + "epoch": 2.31, + "grad_norm": 11.798026476982267, + "learning_rate": 1.5062578592325338e-06, + "loss": 0.3621, + "step": 41085 + }, + { + "epoch": 2.32, + "grad_norm": 5.113419671835875, + "learning_rate": 1.5050855510706491e-06, + "loss": 0.3567, + "step": 41090 + }, + { + "epoch": 2.32, + "grad_norm": 4.536698788021902, + "learning_rate": 1.5039136184527465e-06, + "loss": 0.3819, + "step": 41095 + }, + { + "epoch": 2.32, + "grad_norm": 4.159698410576232, + "learning_rate": 1.5027420615047527e-06, + "loss": 0.3852, + "step": 41100 + }, + { + "epoch": 2.32, + "grad_norm": 4.483275684812258, + "learning_rate": 1.5015708803525607e-06, + "loss": 0.352, + "step": 41105 + }, + { + "epoch": 2.32, + "grad_norm": 4.409720247496013, + "learning_rate": 1.500400075122016e-06, + "loss": 0.3293, + "step": 41110 + }, + { + "epoch": 2.32, + "grad_norm": 4.864562323156177, + "learning_rate": 1.49922964593893e-06, + "loss": 0.3775, + "step": 41115 + }, + { + "epoch": 2.32, + "grad_norm": 4.582645043870312, + "learning_rate": 1.4980595929290682e-06, + "loss": 0.3388, + "step": 41120 + }, + { + "epoch": 2.32, + "grad_norm": 4.971510177781197, + "learning_rate": 1.4968899162181566e-06, + "loss": 0.3746, + "step": 41125 + }, + { + "epoch": 2.32, + "grad_norm": 4.342454429663956, + "learning_rate": 1.4957206159318843e-06, + "loss": 0.4003, + "step": 41130 + }, + { + "epoch": 2.32, + "grad_norm": 4.386339371066969, + "learning_rate": 1.4945516921958985e-06, + "loss": 0.381, + "step": 41135 + }, + { + "epoch": 2.32, + "grad_norm": 8.40558156905113, + "learning_rate": 1.4933831451358032e-06, + "loss": 0.3786, + "step": 41140 + }, + { + "epoch": 2.32, + "grad_norm": 4.6527546428144415, + "learning_rate": 1.492214974877166e-06, + "loss": 0.3573, + "step": 41145 + }, + { + "epoch": 2.32, + "grad_norm": 6.008328683240061, + "learning_rate": 1.491047181545509e-06, + "loss": 0.3496, + "step": 41150 + }, + { + "epoch": 2.32, + "grad_norm": 5.137348978051636, + "learning_rate": 1.4898797652663206e-06, + "loss": 0.3832, + "step": 41155 + }, + { + "epoch": 2.32, + "grad_norm": 5.402391843206946, + "learning_rate": 1.4887127261650408e-06, + "loss": 0.3586, + "step": 41160 + }, + { + "epoch": 2.32, + "grad_norm": 4.877030220146141, + "learning_rate": 1.487546064367077e-06, + "loss": 0.3727, + "step": 41165 + }, + { + "epoch": 2.32, + "grad_norm": 4.451313273696703, + "learning_rate": 1.4863797799977886e-06, + "loss": 0.4203, + "step": 41170 + }, + { + "epoch": 2.32, + "grad_norm": 5.544209830360189, + "learning_rate": 1.485213873182501e-06, + "loss": 0.3709, + "step": 41175 + }, + { + "epoch": 2.32, + "grad_norm": 8.802021736841098, + "learning_rate": 1.4840483440464948e-06, + "loss": 0.4226, + "step": 41180 + }, + { + "epoch": 2.32, + "grad_norm": 5.014701088904308, + "learning_rate": 1.4828831927150096e-06, + "loss": 0.4191, + "step": 41185 + }, + { + "epoch": 2.32, + "grad_norm": 4.589328705932698, + "learning_rate": 1.4817184193132488e-06, + "loss": 0.3706, + "step": 41190 + }, + { + "epoch": 2.32, + "grad_norm": 4.8671270281107475, + "learning_rate": 1.4805540239663696e-06, + "loss": 0.3833, + "step": 41195 + }, + { + "epoch": 2.32, + "grad_norm": 4.750281417746645, + "learning_rate": 1.4793900067994948e-06, + "loss": 0.3768, + "step": 41200 + }, + { + "epoch": 2.32, + "grad_norm": 4.852647510417684, + "learning_rate": 1.4782263679377018e-06, + "loss": 0.3667, + "step": 41205 + }, + { + "epoch": 2.32, + "grad_norm": 4.639245774739903, + "learning_rate": 1.477063107506026e-06, + "loss": 0.3696, + "step": 41210 + }, + { + "epoch": 2.32, + "grad_norm": 6.678349940400766, + "learning_rate": 1.4759002256294691e-06, + "loss": 0.3928, + "step": 41215 + }, + { + "epoch": 2.32, + "grad_norm": 4.682189264635429, + "learning_rate": 1.4747377224329845e-06, + "loss": 0.3745, + "step": 41220 + }, + { + "epoch": 2.32, + "grad_norm": 5.026459060219476, + "learning_rate": 1.4735755980414902e-06, + "loss": 0.4051, + "step": 41225 + }, + { + "epoch": 2.32, + "grad_norm": 4.962382841294382, + "learning_rate": 1.472413852579862e-06, + "loss": 0.361, + "step": 41230 + }, + { + "epoch": 2.32, + "grad_norm": 5.432993333532744, + "learning_rate": 1.4712524861729328e-06, + "loss": 0.38, + "step": 41235 + }, + { + "epoch": 2.32, + "grad_norm": 5.84267554570037, + "learning_rate": 1.470091498945499e-06, + "loss": 0.3699, + "step": 41240 + }, + { + "epoch": 2.32, + "grad_norm": 5.795298784613299, + "learning_rate": 1.4689308910223104e-06, + "loss": 0.3812, + "step": 41245 + }, + { + "epoch": 2.32, + "grad_norm": 7.465642466091052, + "learning_rate": 1.467770662528083e-06, + "loss": 0.3631, + "step": 41250 + }, + { + "epoch": 2.32, + "grad_norm": 4.545525192552546, + "learning_rate": 1.4666108135874851e-06, + "loss": 0.3592, + "step": 41255 + }, + { + "epoch": 2.32, + "grad_norm": 4.242304682894049, + "learning_rate": 1.465451344325151e-06, + "loss": 0.3602, + "step": 41260 + }, + { + "epoch": 2.33, + "grad_norm": 4.713111656574008, + "learning_rate": 1.4642922548656695e-06, + "loss": 0.3545, + "step": 41265 + }, + { + "epoch": 2.33, + "grad_norm": 5.309981303294159, + "learning_rate": 1.4631335453335877e-06, + "loss": 0.3438, + "step": 41270 + }, + { + "epoch": 2.33, + "grad_norm": 5.687516209370226, + "learning_rate": 1.4619752158534174e-06, + "loss": 0.3656, + "step": 41275 + }, + { + "epoch": 2.33, + "grad_norm": 4.786537559134805, + "learning_rate": 1.460817266549623e-06, + "loss": 0.3797, + "step": 41280 + }, + { + "epoch": 2.33, + "grad_norm": 4.569979725037328, + "learning_rate": 1.4596596975466342e-06, + "loss": 0.374, + "step": 41285 + }, + { + "epoch": 2.33, + "grad_norm": 4.478371640377967, + "learning_rate": 1.4585025089688366e-06, + "loss": 0.3376, + "step": 41290 + }, + { + "epoch": 2.33, + "grad_norm": 7.089197001967615, + "learning_rate": 1.4573457009405716e-06, + "loss": 0.4028, + "step": 41295 + }, + { + "epoch": 2.33, + "grad_norm": 5.8026464978663075, + "learning_rate": 1.4561892735861482e-06, + "loss": 0.3503, + "step": 41300 + }, + { + "epoch": 2.33, + "grad_norm": 6.550690911035344, + "learning_rate": 1.455033227029825e-06, + "loss": 0.3742, + "step": 41305 + }, + { + "epoch": 2.33, + "grad_norm": 4.776837430310707, + "learning_rate": 1.4538775613958289e-06, + "loss": 0.3591, + "step": 41310 + }, + { + "epoch": 2.33, + "grad_norm": 6.620114119199254, + "learning_rate": 1.4527222768083377e-06, + "loss": 0.3714, + "step": 41315 + }, + { + "epoch": 2.33, + "grad_norm": 5.524204452624631, + "learning_rate": 1.4515673733914926e-06, + "loss": 0.3547, + "step": 41320 + }, + { + "epoch": 2.33, + "grad_norm": 5.215791621823948, + "learning_rate": 1.4504128512693955e-06, + "loss": 0.381, + "step": 41325 + }, + { + "epoch": 2.33, + "grad_norm": 4.630587571043699, + "learning_rate": 1.4492587105661015e-06, + "loss": 0.3636, + "step": 41330 + }, + { + "epoch": 2.33, + "grad_norm": 4.358019452151076, + "learning_rate": 1.4481049514056316e-06, + "loss": 0.3746, + "step": 41335 + }, + { + "epoch": 2.33, + "grad_norm": 5.4237005159013565, + "learning_rate": 1.4469515739119583e-06, + "loss": 0.3608, + "step": 41340 + }, + { + "epoch": 2.33, + "grad_norm": 4.7422580007666, + "learning_rate": 1.4457985782090205e-06, + "loss": 0.3603, + "step": 41345 + }, + { + "epoch": 2.33, + "grad_norm": 4.715583950427968, + "learning_rate": 1.4446459644207124e-06, + "loss": 0.3657, + "step": 41350 + }, + { + "epoch": 2.33, + "grad_norm": 4.698222572958804, + "learning_rate": 1.4434937326708837e-06, + "loss": 0.3499, + "step": 41355 + }, + { + "epoch": 2.33, + "grad_norm": 5.201807539958746, + "learning_rate": 1.4423418830833518e-06, + "loss": 0.3744, + "step": 41360 + }, + { + "epoch": 2.33, + "grad_norm": 6.3381142221138305, + "learning_rate": 1.4411904157818835e-06, + "loss": 0.4119, + "step": 41365 + }, + { + "epoch": 2.33, + "grad_norm": 4.969531506545102, + "learning_rate": 1.4400393308902133e-06, + "loss": 0.391, + "step": 41370 + }, + { + "epoch": 2.33, + "grad_norm": 4.737253772410425, + "learning_rate": 1.4388886285320281e-06, + "loss": 0.3734, + "step": 41375 + }, + { + "epoch": 2.33, + "grad_norm": 4.717335145515903, + "learning_rate": 1.437738308830975e-06, + "loss": 0.3761, + "step": 41380 + }, + { + "epoch": 2.33, + "grad_norm": 4.156294678426971, + "learning_rate": 1.4365883719106633e-06, + "loss": 0.3746, + "step": 41385 + }, + { + "epoch": 2.33, + "grad_norm": 5.3731152947031, + "learning_rate": 1.435438817894656e-06, + "loss": 0.3878, + "step": 41390 + }, + { + "epoch": 2.33, + "grad_norm": 6.342736889782638, + "learning_rate": 1.434289646906481e-06, + "loss": 0.3422, + "step": 41395 + }, + { + "epoch": 2.33, + "grad_norm": 4.5571586886572755, + "learning_rate": 1.4331408590696182e-06, + "loss": 0.3709, + "step": 41400 + }, + { + "epoch": 2.33, + "grad_norm": 7.298640611879569, + "learning_rate": 1.4319924545075137e-06, + "loss": 0.3907, + "step": 41405 + }, + { + "epoch": 2.33, + "grad_norm": 5.493926763384604, + "learning_rate": 1.430844433343565e-06, + "loss": 0.3889, + "step": 41410 + }, + { + "epoch": 2.33, + "grad_norm": 4.963244884218475, + "learning_rate": 1.4296967957011342e-06, + "loss": 0.3839, + "step": 41415 + }, + { + "epoch": 2.33, + "grad_norm": 4.535090327382197, + "learning_rate": 1.4285495417035411e-06, + "loss": 0.3572, + "step": 41420 + }, + { + "epoch": 2.33, + "grad_norm": 4.426326852781701, + "learning_rate": 1.4274026714740601e-06, + "loss": 0.3797, + "step": 41425 + }, + { + "epoch": 2.33, + "grad_norm": 4.7193713564748005, + "learning_rate": 1.4262561851359308e-06, + "loss": 0.3802, + "step": 41430 + }, + { + "epoch": 2.33, + "grad_norm": 5.6806080297893615, + "learning_rate": 1.425110082812346e-06, + "loss": 0.3822, + "step": 41435 + }, + { + "epoch": 2.33, + "grad_norm": 5.334496719452402, + "learning_rate": 1.423964364626459e-06, + "loss": 0.3882, + "step": 41440 + }, + { + "epoch": 2.34, + "grad_norm": 4.7277683705123525, + "learning_rate": 1.4228190307013845e-06, + "loss": 0.388, + "step": 41445 + }, + { + "epoch": 2.34, + "grad_norm": 5.897797861548727, + "learning_rate": 1.4216740811601915e-06, + "loss": 0.3658, + "step": 41450 + }, + { + "epoch": 2.34, + "grad_norm": 4.040401471109189, + "learning_rate": 1.420529516125912e-06, + "loss": 0.3491, + "step": 41455 + }, + { + "epoch": 2.34, + "grad_norm": 4.665000912640641, + "learning_rate": 1.4193853357215331e-06, + "loss": 0.3443, + "step": 41460 + }, + { + "epoch": 2.34, + "grad_norm": 7.097019856409633, + "learning_rate": 1.418241540070001e-06, + "loss": 0.3956, + "step": 41465 + }, + { + "epoch": 2.34, + "grad_norm": 4.362517713058213, + "learning_rate": 1.4170981292942248e-06, + "loss": 0.3568, + "step": 41470 + }, + { + "epoch": 2.34, + "grad_norm": 4.506807738596816, + "learning_rate": 1.4159551035170655e-06, + "loss": 0.3554, + "step": 41475 + }, + { + "epoch": 2.34, + "grad_norm": 4.9957157494220095, + "learning_rate": 1.4148124628613492e-06, + "loss": 0.4186, + "step": 41480 + }, + { + "epoch": 2.34, + "grad_norm": 4.587804492736562, + "learning_rate": 1.413670207449855e-06, + "loss": 0.3728, + "step": 41485 + }, + { + "epoch": 2.34, + "grad_norm": 4.705940252804655, + "learning_rate": 1.412528337405326e-06, + "loss": 0.3681, + "step": 41490 + }, + { + "epoch": 2.34, + "grad_norm": 4.83490997795641, + "learning_rate": 1.4113868528504587e-06, + "loss": 0.3701, + "step": 41495 + }, + { + "epoch": 2.34, + "grad_norm": 4.60418297060718, + "learning_rate": 1.410245753907913e-06, + "loss": 0.3548, + "step": 41500 + }, + { + "epoch": 2.34, + "grad_norm": 4.825715126402115, + "learning_rate": 1.4091050407003025e-06, + "loss": 0.339, + "step": 41505 + }, + { + "epoch": 2.34, + "grad_norm": 5.057393212416262, + "learning_rate": 1.4079647133502033e-06, + "loss": 0.3736, + "step": 41510 + }, + { + "epoch": 2.34, + "grad_norm": 5.4553613565875505, + "learning_rate": 1.4068247719801497e-06, + "loss": 0.3824, + "step": 41515 + }, + { + "epoch": 2.34, + "grad_norm": 4.533306894948522, + "learning_rate": 1.4056852167126328e-06, + "loss": 0.351, + "step": 41520 + }, + { + "epoch": 2.34, + "grad_norm": 4.715541749177123, + "learning_rate": 1.4045460476701006e-06, + "loss": 0.3851, + "step": 41525 + }, + { + "epoch": 2.34, + "grad_norm": 4.603344085842678, + "learning_rate": 1.4034072649749646e-06, + "loss": 0.3716, + "step": 41530 + }, + { + "epoch": 2.34, + "grad_norm": 4.403149084826972, + "learning_rate": 1.4022688687495901e-06, + "loss": 0.3641, + "step": 41535 + }, + { + "epoch": 2.34, + "grad_norm": 4.895832540303241, + "learning_rate": 1.4011308591163047e-06, + "loss": 0.4189, + "step": 41540 + }, + { + "epoch": 2.34, + "grad_norm": 6.169701701755714, + "learning_rate": 1.3999932361973917e-06, + "loss": 0.3623, + "step": 41545 + }, + { + "epoch": 2.34, + "grad_norm": 6.516106282935076, + "learning_rate": 1.398856000115092e-06, + "loss": 0.3828, + "step": 41550 + }, + { + "epoch": 2.34, + "grad_norm": 4.4977017376881125, + "learning_rate": 1.3977191509916094e-06, + "loss": 0.3622, + "step": 41555 + }, + { + "epoch": 2.34, + "grad_norm": 4.846541545398783, + "learning_rate": 1.3965826889491008e-06, + "loss": 0.3712, + "step": 41560 + }, + { + "epoch": 2.34, + "grad_norm": 4.629694430996607, + "learning_rate": 1.3954466141096867e-06, + "loss": 0.3738, + "step": 41565 + }, + { + "epoch": 2.34, + "grad_norm": 5.315466466695631, + "learning_rate": 1.3943109265954408e-06, + "loss": 0.359, + "step": 41570 + }, + { + "epoch": 2.34, + "grad_norm": 4.559927034532552, + "learning_rate": 1.3931756265284003e-06, + "loss": 0.3858, + "step": 41575 + }, + { + "epoch": 2.34, + "grad_norm": 4.773281738613931, + "learning_rate": 1.392040714030557e-06, + "loss": 0.3779, + "step": 41580 + }, + { + "epoch": 2.34, + "grad_norm": 5.5318569459830655, + "learning_rate": 1.3909061892238602e-06, + "loss": 0.3428, + "step": 41585 + }, + { + "epoch": 2.34, + "grad_norm": 6.394046415543049, + "learning_rate": 1.3897720522302215e-06, + "loss": 0.3631, + "step": 41590 + }, + { + "epoch": 2.34, + "grad_norm": 4.794106531694925, + "learning_rate": 1.3886383031715111e-06, + "loss": 0.381, + "step": 41595 + }, + { + "epoch": 2.34, + "grad_norm": 4.5524649568518125, + "learning_rate": 1.3875049421695514e-06, + "loss": 0.3627, + "step": 41600 + }, + { + "epoch": 2.34, + "grad_norm": 4.6490462069269185, + "learning_rate": 1.3863719693461303e-06, + "loss": 0.3322, + "step": 41605 + }, + { + "epoch": 2.34, + "grad_norm": 4.8040181101337005, + "learning_rate": 1.3852393848229872e-06, + "loss": 0.4208, + "step": 41610 + }, + { + "epoch": 2.34, + "grad_norm": 6.121062145125661, + "learning_rate": 1.3841071887218278e-06, + "loss": 0.3813, + "step": 41615 + }, + { + "epoch": 2.35, + "grad_norm": 4.4776921560623855, + "learning_rate": 1.3829753811643071e-06, + "loss": 0.3607, + "step": 41620 + }, + { + "epoch": 2.35, + "grad_norm": 4.479672731586354, + "learning_rate": 1.3818439622720465e-06, + "loss": 0.3466, + "step": 41625 + }, + { + "epoch": 2.35, + "grad_norm": 4.910570954227758, + "learning_rate": 1.3807129321666201e-06, + "loss": 0.3706, + "step": 41630 + }, + { + "epoch": 2.35, + "grad_norm": 4.818472317059777, + "learning_rate": 1.3795822909695605e-06, + "loss": 0.3632, + "step": 41635 + }, + { + "epoch": 2.35, + "grad_norm": 5.126721980157568, + "learning_rate": 1.3784520388023641e-06, + "loss": 0.3274, + "step": 41640 + }, + { + "epoch": 2.35, + "grad_norm": 4.6296388314821355, + "learning_rate": 1.3773221757864775e-06, + "loss": 0.3488, + "step": 41645 + }, + { + "epoch": 2.35, + "grad_norm": 4.765091515487751, + "learning_rate": 1.3761927020433124e-06, + "loss": 0.3731, + "step": 41650 + }, + { + "epoch": 2.35, + "grad_norm": 5.604923886368388, + "learning_rate": 1.3750636176942333e-06, + "loss": 0.3734, + "step": 41655 + }, + { + "epoch": 2.35, + "grad_norm": 5.116447850524022, + "learning_rate": 1.373934922860568e-06, + "loss": 0.346, + "step": 41660 + }, + { + "epoch": 2.35, + "grad_norm": 5.244332453409364, + "learning_rate": 1.372806617663598e-06, + "loss": 0.3485, + "step": 41665 + }, + { + "epoch": 2.35, + "grad_norm": 4.352084726186808, + "learning_rate": 1.3716787022245636e-06, + "loss": 0.3894, + "step": 41670 + }, + { + "epoch": 2.35, + "grad_norm": 4.9576958499730415, + "learning_rate": 1.3705511766646674e-06, + "loss": 0.3756, + "step": 41675 + }, + { + "epoch": 2.35, + "grad_norm": 5.806080884421692, + "learning_rate": 1.3694240411050635e-06, + "loss": 0.3728, + "step": 41680 + }, + { + "epoch": 2.35, + "grad_norm": 4.464414698598367, + "learning_rate": 1.3682972956668694e-06, + "loss": 0.3868, + "step": 41685 + }, + { + "epoch": 2.35, + "grad_norm": 4.753367671992034, + "learning_rate": 1.3671709404711602e-06, + "loss": 0.3959, + "step": 41690 + }, + { + "epoch": 2.35, + "grad_norm": 8.959482104429004, + "learning_rate": 1.3660449756389654e-06, + "loss": 0.3848, + "step": 41695 + }, + { + "epoch": 2.35, + "grad_norm": 5.327636594853386, + "learning_rate": 1.364919401291277e-06, + "loss": 0.3926, + "step": 41700 + }, + { + "epoch": 2.35, + "grad_norm": 5.787583437400583, + "learning_rate": 1.3637942175490398e-06, + "loss": 0.3739, + "step": 41705 + }, + { + "epoch": 2.35, + "grad_norm": 4.607871008060517, + "learning_rate": 1.362669424533164e-06, + "loss": 0.3586, + "step": 41710 + }, + { + "epoch": 2.35, + "grad_norm": 5.217382721853331, + "learning_rate": 1.361545022364511e-06, + "loss": 0.3853, + "step": 41715 + }, + { + "epoch": 2.35, + "grad_norm": 4.999931616537445, + "learning_rate": 1.360421011163901e-06, + "loss": 0.3564, + "step": 41720 + }, + { + "epoch": 2.35, + "grad_norm": 6.009381263935578, + "learning_rate": 1.3592973910521184e-06, + "loss": 0.3762, + "step": 41725 + }, + { + "epoch": 2.35, + "grad_norm": 5.060872407200973, + "learning_rate": 1.3581741621498967e-06, + "loss": 0.3938, + "step": 41730 + }, + { + "epoch": 2.35, + "grad_norm": 7.482587453727809, + "learning_rate": 1.357051324577936e-06, + "loss": 0.3568, + "step": 41735 + }, + { + "epoch": 2.35, + "grad_norm": 5.901812743155264, + "learning_rate": 1.3559288784568857e-06, + "loss": 0.3674, + "step": 41740 + }, + { + "epoch": 2.35, + "grad_norm": 5.506271378926924, + "learning_rate": 1.354806823907362e-06, + "loss": 0.3599, + "step": 41745 + }, + { + "epoch": 2.35, + "grad_norm": 4.988373502308444, + "learning_rate": 1.3536851610499325e-06, + "loss": 0.3564, + "step": 41750 + }, + { + "epoch": 2.35, + "grad_norm": 4.160118358919035, + "learning_rate": 1.352563890005123e-06, + "loss": 0.3844, + "step": 41755 + }, + { + "epoch": 2.35, + "grad_norm": 4.456918901709329, + "learning_rate": 1.3514430108934229e-06, + "loss": 0.3634, + "step": 41760 + }, + { + "epoch": 2.35, + "grad_norm": 4.720142489661282, + "learning_rate": 1.3503225238352718e-06, + "loss": 0.3621, + "step": 41765 + }, + { + "epoch": 2.35, + "grad_norm": 4.6730796976704845, + "learning_rate": 1.349202428951074e-06, + "loss": 0.3849, + "step": 41770 + }, + { + "epoch": 2.35, + "grad_norm": 4.708039666936624, + "learning_rate": 1.3480827263611862e-06, + "loss": 0.3784, + "step": 41775 + }, + { + "epoch": 2.35, + "grad_norm": 4.5775020472229, + "learning_rate": 1.346963416185927e-06, + "loss": 0.3731, + "step": 41780 + }, + { + "epoch": 2.35, + "grad_norm": 4.427022226127237, + "learning_rate": 1.3458444985455715e-06, + "loss": 0.3848, + "step": 41785 + }, + { + "epoch": 2.35, + "grad_norm": 5.0727974789767805, + "learning_rate": 1.3447259735603507e-06, + "loss": 0.381, + "step": 41790 + }, + { + "epoch": 2.35, + "grad_norm": 4.394027541147416, + "learning_rate": 1.3436078413504572e-06, + "loss": 0.3441, + "step": 41795 + }, + { + "epoch": 2.36, + "grad_norm": 4.810623074537778, + "learning_rate": 1.3424901020360382e-06, + "loss": 0.3603, + "step": 41800 + }, + { + "epoch": 2.36, + "grad_norm": 4.560463698130751, + "learning_rate": 1.3413727557371981e-06, + "loss": 0.3579, + "step": 41805 + }, + { + "epoch": 2.36, + "grad_norm": 4.778712992367557, + "learning_rate": 1.3402558025740036e-06, + "loss": 0.3502, + "step": 41810 + }, + { + "epoch": 2.36, + "grad_norm": 4.442525354266272, + "learning_rate": 1.3391392426664735e-06, + "loss": 0.3906, + "step": 41815 + }, + { + "epoch": 2.36, + "grad_norm": 5.152634030087905, + "learning_rate": 1.338023076134589e-06, + "loss": 0.3877, + "step": 41820 + }, + { + "epoch": 2.36, + "grad_norm": 5.66621143335817, + "learning_rate": 1.336907303098286e-06, + "loss": 0.363, + "step": 41825 + }, + { + "epoch": 2.36, + "grad_norm": 4.210877694280694, + "learning_rate": 1.3357919236774608e-06, + "loss": 0.3703, + "step": 41830 + }, + { + "epoch": 2.36, + "grad_norm": 4.542877284587468, + "learning_rate": 1.334676937991965e-06, + "loss": 0.3564, + "step": 41835 + }, + { + "epoch": 2.36, + "grad_norm": 4.586788209250504, + "learning_rate": 1.3335623461616071e-06, + "loss": 0.3687, + "step": 41840 + }, + { + "epoch": 2.36, + "grad_norm": 4.78880284551674, + "learning_rate": 1.3324481483061574e-06, + "loss": 0.3739, + "step": 41845 + }, + { + "epoch": 2.36, + "grad_norm": 4.514647892992655, + "learning_rate": 1.3313343445453392e-06, + "loss": 0.3399, + "step": 41850 + }, + { + "epoch": 2.36, + "grad_norm": 4.693777487639411, + "learning_rate": 1.330220934998838e-06, + "loss": 0.3743, + "step": 41855 + }, + { + "epoch": 2.36, + "grad_norm": 5.255976843985964, + "learning_rate": 1.3291079197862915e-06, + "loss": 0.3721, + "step": 41860 + }, + { + "epoch": 2.36, + "grad_norm": 4.006070376437593, + "learning_rate": 1.327995299027302e-06, + "loss": 0.3686, + "step": 41865 + }, + { + "epoch": 2.36, + "grad_norm": 4.40479363120874, + "learning_rate": 1.3268830728414217e-06, + "loss": 0.3541, + "step": 41870 + }, + { + "epoch": 2.36, + "grad_norm": 5.312797149746925, + "learning_rate": 1.325771241348166e-06, + "loss": 0.3582, + "step": 41875 + }, + { + "epoch": 2.36, + "grad_norm": 4.891479836302079, + "learning_rate": 1.3246598046670072e-06, + "loss": 0.379, + "step": 41880 + }, + { + "epoch": 2.36, + "grad_norm": 4.884039559814739, + "learning_rate": 1.3235487629173733e-06, + "loss": 0.3524, + "step": 41885 + }, + { + "epoch": 2.36, + "grad_norm": 5.228085868933899, + "learning_rate": 1.3224381162186489e-06, + "loss": 0.3638, + "step": 41890 + }, + { + "epoch": 2.36, + "grad_norm": 6.482258460260676, + "learning_rate": 1.32132786469018e-06, + "loss": 0.3572, + "step": 41895 + }, + { + "epoch": 2.36, + "grad_norm": 7.428868936913378, + "learning_rate": 1.3202180084512667e-06, + "loss": 0.3888, + "step": 41900 + }, + { + "epoch": 2.36, + "grad_norm": 4.441922147118252, + "learning_rate": 1.319108547621169e-06, + "loss": 0.3555, + "step": 41905 + }, + { + "epoch": 2.36, + "grad_norm": 4.682905296226707, + "learning_rate": 1.3179994823191022e-06, + "loss": 0.3633, + "step": 41910 + }, + { + "epoch": 2.36, + "grad_norm": 7.472129864605422, + "learning_rate": 1.3168908126642421e-06, + "loss": 0.3779, + "step": 41915 + }, + { + "epoch": 2.36, + "grad_norm": 6.180004486006848, + "learning_rate": 1.3157825387757189e-06, + "loss": 0.3689, + "step": 41920 + }, + { + "epoch": 2.36, + "grad_norm": 4.778681656948636, + "learning_rate": 1.3146746607726201e-06, + "loss": 0.3672, + "step": 41925 + }, + { + "epoch": 2.36, + "grad_norm": 4.808843573270111, + "learning_rate": 1.3135671787739952e-06, + "loss": 0.3742, + "step": 41930 + }, + { + "epoch": 2.36, + "grad_norm": 4.802187639254322, + "learning_rate": 1.3124600928988445e-06, + "loss": 0.3818, + "step": 41935 + }, + { + "epoch": 2.36, + "grad_norm": 6.360056543214213, + "learning_rate": 1.3113534032661334e-06, + "loss": 0.3608, + "step": 41940 + }, + { + "epoch": 2.36, + "grad_norm": 15.847509537782647, + "learning_rate": 1.3102471099947778e-06, + "loss": 0.3572, + "step": 41945 + }, + { + "epoch": 2.36, + "grad_norm": 4.841931709990855, + "learning_rate": 1.309141213203653e-06, + "loss": 0.3387, + "step": 41950 + }, + { + "epoch": 2.36, + "grad_norm": 5.41016421912756, + "learning_rate": 1.3080357130115938e-06, + "loss": 0.3792, + "step": 41955 + }, + { + "epoch": 2.36, + "grad_norm": 5.187559739697147, + "learning_rate": 1.306930609537393e-06, + "loss": 0.3448, + "step": 41960 + }, + { + "epoch": 2.36, + "grad_norm": 4.495795948164244, + "learning_rate": 1.3058259028997948e-06, + "loss": 0.3605, + "step": 41965 + }, + { + "epoch": 2.36, + "grad_norm": 6.382753309694544, + "learning_rate": 1.3047215932175077e-06, + "loss": 0.3483, + "step": 41970 + }, + { + "epoch": 2.37, + "grad_norm": 4.5716181643549465, + "learning_rate": 1.3036176806091949e-06, + "loss": 0.3419, + "step": 41975 + }, + { + "epoch": 2.37, + "grad_norm": 5.150721114938997, + "learning_rate": 1.302514165193476e-06, + "loss": 0.396, + "step": 41980 + }, + { + "epoch": 2.37, + "grad_norm": 5.307574294717373, + "learning_rate": 1.301411047088927e-06, + "loss": 0.3588, + "step": 41985 + }, + { + "epoch": 2.37, + "grad_norm": 4.761829431456866, + "learning_rate": 1.3003083264140848e-06, + "loss": 0.354, + "step": 41990 + }, + { + "epoch": 2.37, + "grad_norm": 5.2638307749607565, + "learning_rate": 1.2992060032874398e-06, + "loss": 0.374, + "step": 41995 + }, + { + "epoch": 2.37, + "grad_norm": 5.443351903348739, + "learning_rate": 1.2981040778274444e-06, + "loss": 0.3523, + "step": 42000 + }, + { + "epoch": 2.37, + "grad_norm": 5.314166116795517, + "learning_rate": 1.2970025501525036e-06, + "loss": 0.3494, + "step": 42005 + }, + { + "epoch": 2.37, + "grad_norm": 4.805614747845596, + "learning_rate": 1.2959014203809793e-06, + "loss": 0.3483, + "step": 42010 + }, + { + "epoch": 2.37, + "grad_norm": 4.563221109534187, + "learning_rate": 1.2948006886311965e-06, + "loss": 0.3607, + "step": 42015 + }, + { + "epoch": 2.37, + "grad_norm": 4.639028664449478, + "learning_rate": 1.2937003550214305e-06, + "loss": 0.3999, + "step": 42020 + }, + { + "epoch": 2.37, + "grad_norm": 4.933649264612827, + "learning_rate": 1.2926004196699205e-06, + "loss": 0.3736, + "step": 42025 + }, + { + "epoch": 2.37, + "grad_norm": 4.600465104816654, + "learning_rate": 1.2915008826948566e-06, + "loss": 0.3534, + "step": 42030 + }, + { + "epoch": 2.37, + "grad_norm": 4.642752716311966, + "learning_rate": 1.2904017442143884e-06, + "loss": 0.382, + "step": 42035 + }, + { + "epoch": 2.37, + "grad_norm": 4.383477090892518, + "learning_rate": 1.2893030043466264e-06, + "loss": 0.3316, + "step": 42040 + }, + { + "epoch": 2.37, + "grad_norm": 4.6475135081138195, + "learning_rate": 1.2882046632096318e-06, + "loss": 0.3855, + "step": 42045 + }, + { + "epoch": 2.37, + "grad_norm": 4.308433393615727, + "learning_rate": 1.2871067209214272e-06, + "loss": 0.3531, + "step": 42050 + }, + { + "epoch": 2.37, + "grad_norm": 4.649384180267416, + "learning_rate": 1.2860091775999928e-06, + "loss": 0.3349, + "step": 42055 + }, + { + "epoch": 2.37, + "grad_norm": 5.66288589554288, + "learning_rate": 1.2849120333632625e-06, + "loss": 0.3916, + "step": 42060 + }, + { + "epoch": 2.37, + "grad_norm": 4.964130849795638, + "learning_rate": 1.283815288329131e-06, + "loss": 0.3871, + "step": 42065 + }, + { + "epoch": 2.37, + "grad_norm": 4.8521532342133025, + "learning_rate": 1.2827189426154468e-06, + "loss": 0.3182, + "step": 42070 + }, + { + "epoch": 2.37, + "grad_norm": 5.67941770433278, + "learning_rate": 1.281622996340019e-06, + "loss": 0.359, + "step": 42075 + }, + { + "epoch": 2.37, + "grad_norm": 5.951347083426389, + "learning_rate": 1.2805274496206088e-06, + "loss": 0.3873, + "step": 42080 + }, + { + "epoch": 2.37, + "grad_norm": 4.778512172297239, + "learning_rate": 1.279432302574941e-06, + "loss": 0.3235, + "step": 42085 + }, + { + "epoch": 2.37, + "grad_norm": 4.637048999459916, + "learning_rate": 1.2783375553206929e-06, + "loss": 0.3845, + "step": 42090 + }, + { + "epoch": 2.37, + "grad_norm": 13.564738367310575, + "learning_rate": 1.2772432079754975e-06, + "loss": 0.4089, + "step": 42095 + }, + { + "epoch": 2.37, + "grad_norm": 5.802261635220572, + "learning_rate": 1.2761492606569508e-06, + "loss": 0.3561, + "step": 42100 + }, + { + "epoch": 2.37, + "grad_norm": 6.420038409930823, + "learning_rate": 1.2750557134825996e-06, + "loss": 0.3524, + "step": 42105 + }, + { + "epoch": 2.37, + "grad_norm": 5.055511029770161, + "learning_rate": 1.2739625665699523e-06, + "loss": 0.3548, + "step": 42110 + }, + { + "epoch": 2.37, + "grad_norm": 4.911610320868343, + "learning_rate": 1.2728698200364719e-06, + "loss": 0.3913, + "step": 42115 + }, + { + "epoch": 2.37, + "grad_norm": 5.824163423306069, + "learning_rate": 1.271777473999577e-06, + "loss": 0.3643, + "step": 42120 + }, + { + "epoch": 2.37, + "grad_norm": 4.496457440085291, + "learning_rate": 1.2706855285766473e-06, + "loss": 0.3672, + "step": 42125 + }, + { + "epoch": 2.37, + "grad_norm": 4.7146890653985585, + "learning_rate": 1.2695939838850151e-06, + "loss": 0.3628, + "step": 42130 + }, + { + "epoch": 2.37, + "grad_norm": 4.628729247949617, + "learning_rate": 1.2685028400419747e-06, + "loss": 0.3682, + "step": 42135 + }, + { + "epoch": 2.37, + "grad_norm": 4.454871913383951, + "learning_rate": 1.2674120971647709e-06, + "loss": 0.3438, + "step": 42140 + }, + { + "epoch": 2.37, + "grad_norm": 4.98463705586402, + "learning_rate": 1.2663217553706098e-06, + "loss": 0.3374, + "step": 42145 + }, + { + "epoch": 2.37, + "grad_norm": 5.012009429245551, + "learning_rate": 1.2652318147766563e-06, + "loss": 0.3671, + "step": 42150 + }, + { + "epoch": 2.38, + "grad_norm": 4.376290256788171, + "learning_rate": 1.264142275500025e-06, + "loss": 0.3498, + "step": 42155 + }, + { + "epoch": 2.38, + "grad_norm": 19.3603246234026, + "learning_rate": 1.2630531376577954e-06, + "loss": 0.4071, + "step": 42160 + }, + { + "epoch": 2.38, + "grad_norm": 7.572495194724955, + "learning_rate": 1.2619644013669974e-06, + "loss": 0.3641, + "step": 42165 + }, + { + "epoch": 2.38, + "grad_norm": 5.77345672789725, + "learning_rate": 1.260876066744623e-06, + "loss": 0.3684, + "step": 42170 + }, + { + "epoch": 2.38, + "grad_norm": 4.530893686920582, + "learning_rate": 1.2597881339076174e-06, + "loss": 0.369, + "step": 42175 + }, + { + "epoch": 2.38, + "grad_norm": 5.588971650663712, + "learning_rate": 1.2587006029728815e-06, + "loss": 0.3582, + "step": 42180 + }, + { + "epoch": 2.38, + "grad_norm": 5.895544639275465, + "learning_rate": 1.2576134740572798e-06, + "loss": 0.3235, + "step": 42185 + }, + { + "epoch": 2.38, + "grad_norm": 4.902353707970118, + "learning_rate": 1.2565267472776249e-06, + "loss": 0.3758, + "step": 42190 + }, + { + "epoch": 2.38, + "grad_norm": 6.275755681550319, + "learning_rate": 1.2554404227506933e-06, + "loss": 0.3716, + "step": 42195 + }, + { + "epoch": 2.38, + "grad_norm": 6.474480660736474, + "learning_rate": 1.2543545005932146e-06, + "loss": 0.3747, + "step": 42200 + }, + { + "epoch": 2.38, + "grad_norm": 4.420762975329743, + "learning_rate": 1.2532689809218745e-06, + "loss": 0.378, + "step": 42205 + }, + { + "epoch": 2.38, + "grad_norm": 5.217549343063801, + "learning_rate": 1.252183863853319e-06, + "loss": 0.388, + "step": 42210 + }, + { + "epoch": 2.38, + "grad_norm": 4.467735935719349, + "learning_rate": 1.2510991495041463e-06, + "loss": 0.3626, + "step": 42215 + }, + { + "epoch": 2.38, + "grad_norm": 4.850374283615647, + "learning_rate": 1.2500148379909166e-06, + "loss": 0.3696, + "step": 42220 + }, + { + "epoch": 2.38, + "grad_norm": 4.496383422687956, + "learning_rate": 1.248930929430141e-06, + "loss": 0.3719, + "step": 42225 + }, + { + "epoch": 2.38, + "grad_norm": 6.242307280006708, + "learning_rate": 1.2478474239382932e-06, + "loss": 0.3088, + "step": 42230 + }, + { + "epoch": 2.38, + "grad_norm": 5.058077876542059, + "learning_rate": 1.2467643216317982e-06, + "loss": 0.3775, + "step": 42235 + }, + { + "epoch": 2.38, + "grad_norm": 4.471919999541183, + "learning_rate": 1.2456816226270407e-06, + "loss": 0.3331, + "step": 42240 + }, + { + "epoch": 2.38, + "grad_norm": 4.659201725476392, + "learning_rate": 1.2445993270403634e-06, + "loss": 0.3609, + "step": 42245 + }, + { + "epoch": 2.38, + "grad_norm": 4.942208712849299, + "learning_rate": 1.2435174349880607e-06, + "loss": 0.3591, + "step": 42250 + }, + { + "epoch": 2.38, + "grad_norm": 4.523505832121196, + "learning_rate": 1.24243594658639e-06, + "loss": 0.3843, + "step": 42255 + }, + { + "epoch": 2.38, + "grad_norm": 4.947283233247577, + "learning_rate": 1.2413548619515603e-06, + "loss": 0.3588, + "step": 42260 + }, + { + "epoch": 2.38, + "grad_norm": 4.292315925839984, + "learning_rate": 1.240274181199737e-06, + "loss": 0.3512, + "step": 42265 + }, + { + "epoch": 2.38, + "grad_norm": 4.743756549305206, + "learning_rate": 1.2391939044470475e-06, + "loss": 0.3754, + "step": 42270 + }, + { + "epoch": 2.38, + "grad_norm": 4.826997907589478, + "learning_rate": 1.2381140318095691e-06, + "loss": 0.3956, + "step": 42275 + }, + { + "epoch": 2.38, + "grad_norm": 4.690877213456252, + "learning_rate": 1.2370345634033421e-06, + "loss": 0.3751, + "step": 42280 + }, + { + "epoch": 2.38, + "grad_norm": 4.609682999814088, + "learning_rate": 1.2359554993443584e-06, + "loss": 0.3395, + "step": 42285 + }, + { + "epoch": 2.38, + "grad_norm": 4.598513227330498, + "learning_rate": 1.2348768397485666e-06, + "loss": 0.3683, + "step": 42290 + }, + { + "epoch": 2.38, + "grad_norm": 4.662917186226731, + "learning_rate": 1.2337985847318772e-06, + "loss": 0.3472, + "step": 42295 + }, + { + "epoch": 2.38, + "grad_norm": 4.422588702274017, + "learning_rate": 1.2327207344101493e-06, + "loss": 0.381, + "step": 42300 + }, + { + "epoch": 2.38, + "grad_norm": 4.505366246396598, + "learning_rate": 1.2316432888992069e-06, + "loss": 0.3559, + "step": 42305 + }, + { + "epoch": 2.38, + "grad_norm": 4.3381788087830255, + "learning_rate": 1.2305662483148217e-06, + "loss": 0.3448, + "step": 42310 + }, + { + "epoch": 2.38, + "grad_norm": 5.294779684309665, + "learning_rate": 1.2294896127727313e-06, + "loss": 0.3684, + "step": 42315 + }, + { + "epoch": 2.38, + "grad_norm": 5.446079138501898, + "learning_rate": 1.2284133823886206e-06, + "loss": 0.37, + "step": 42320 + }, + { + "epoch": 2.38, + "grad_norm": 4.9445942664675675, + "learning_rate": 1.2273375572781388e-06, + "loss": 0.3701, + "step": 42325 + }, + { + "epoch": 2.39, + "grad_norm": 4.498016661428263, + "learning_rate": 1.226262137556885e-06, + "loss": 0.3868, + "step": 42330 + }, + { + "epoch": 2.39, + "grad_norm": 4.422679423110836, + "learning_rate": 1.2251871233404195e-06, + "loss": 0.3589, + "step": 42335 + }, + { + "epoch": 2.39, + "grad_norm": 4.371199858577176, + "learning_rate": 1.2241125147442583e-06, + "loss": 0.3624, + "step": 42340 + }, + { + "epoch": 2.39, + "grad_norm": 4.884361530712702, + "learning_rate": 1.2230383118838718e-06, + "loss": 0.3552, + "step": 42345 + }, + { + "epoch": 2.39, + "grad_norm": 5.562749827350988, + "learning_rate": 1.2219645148746862e-06, + "loss": 0.3586, + "step": 42350 + }, + { + "epoch": 2.39, + "grad_norm": 4.968413991488184, + "learning_rate": 1.2208911238320887e-06, + "loss": 0.3702, + "step": 42355 + }, + { + "epoch": 2.39, + "grad_norm": 4.38240302692401, + "learning_rate": 1.219818138871417e-06, + "loss": 0.3436, + "step": 42360 + }, + { + "epoch": 2.39, + "grad_norm": 4.866090970297154, + "learning_rate": 1.2187455601079706e-06, + "loss": 0.3522, + "step": 42365 + }, + { + "epoch": 2.39, + "grad_norm": 4.71774457805276, + "learning_rate": 1.2176733876570008e-06, + "loss": 0.3305, + "step": 42370 + }, + { + "epoch": 2.39, + "grad_norm": 4.715950757583577, + "learning_rate": 1.2166016216337195e-06, + "loss": 0.3383, + "step": 42375 + }, + { + "epoch": 2.39, + "grad_norm": 4.780255750910164, + "learning_rate": 1.215530262153291e-06, + "loss": 0.3639, + "step": 42380 + }, + { + "epoch": 2.39, + "grad_norm": 7.649557384828715, + "learning_rate": 1.2144593093308365e-06, + "loss": 0.3456, + "step": 42385 + }, + { + "epoch": 2.39, + "grad_norm": 6.658998179623315, + "learning_rate": 1.2133887632814378e-06, + "loss": 0.3543, + "step": 42390 + }, + { + "epoch": 2.39, + "grad_norm": 5.047139095782406, + "learning_rate": 1.2123186241201269e-06, + "loss": 0.3914, + "step": 42395 + }, + { + "epoch": 2.39, + "grad_norm": 6.460150286641404, + "learning_rate": 1.2112488919618975e-06, + "loss": 0.3773, + "step": 42400 + }, + { + "epoch": 2.39, + "grad_norm": 4.863819087309987, + "learning_rate": 1.2101795669216954e-06, + "loss": 0.3417, + "step": 42405 + }, + { + "epoch": 2.39, + "grad_norm": 4.497138328333464, + "learning_rate": 1.2091106491144233e-06, + "loss": 0.3681, + "step": 42410 + }, + { + "epoch": 2.39, + "grad_norm": 4.901559440697807, + "learning_rate": 1.2080421386549423e-06, + "loss": 0.3704, + "step": 42415 + }, + { + "epoch": 2.39, + "grad_norm": 4.626243185632536, + "learning_rate": 1.20697403565807e-06, + "loss": 0.3946, + "step": 42420 + }, + { + "epoch": 2.39, + "grad_norm": 6.25988784988983, + "learning_rate": 1.2059063402385762e-06, + "loss": 0.3673, + "step": 42425 + }, + { + "epoch": 2.39, + "grad_norm": 5.092003009439223, + "learning_rate": 1.2048390525111925e-06, + "loss": 0.3741, + "step": 42430 + }, + { + "epoch": 2.39, + "grad_norm": 4.992085822155388, + "learning_rate": 1.2037721725906e-06, + "loss": 0.3554, + "step": 42435 + }, + { + "epoch": 2.39, + "grad_norm": 4.586761164031409, + "learning_rate": 1.2027057005914434e-06, + "loss": 0.3566, + "step": 42440 + }, + { + "epoch": 2.39, + "grad_norm": 4.92405306572773, + "learning_rate": 1.2016396366283162e-06, + "loss": 0.3755, + "step": 42445 + }, + { + "epoch": 2.39, + "grad_norm": 4.7923894461247025, + "learning_rate": 1.2005739808157751e-06, + "loss": 0.3504, + "step": 42450 + }, + { + "epoch": 2.39, + "grad_norm": 5.044599357681824, + "learning_rate": 1.199508733268327e-06, + "loss": 0.3821, + "step": 42455 + }, + { + "epoch": 2.39, + "grad_norm": 4.839607446144359, + "learning_rate": 1.1984438941004389e-06, + "loss": 0.3442, + "step": 42460 + }, + { + "epoch": 2.39, + "grad_norm": 5.089035579665456, + "learning_rate": 1.1973794634265322e-06, + "loss": 0.3651, + "step": 42465 + }, + { + "epoch": 2.39, + "grad_norm": 4.748395692327808, + "learning_rate": 1.1963154413609835e-06, + "loss": 0.3343, + "step": 42470 + }, + { + "epoch": 2.39, + "grad_norm": 4.693467743332488, + "learning_rate": 1.1952518280181286e-06, + "loss": 0.3694, + "step": 42475 + }, + { + "epoch": 2.39, + "grad_norm": 5.399528781148906, + "learning_rate": 1.1941886235122547e-06, + "loss": 0.3695, + "step": 42480 + }, + { + "epoch": 2.39, + "grad_norm": 5.092356045785655, + "learning_rate": 1.1931258279576112e-06, + "loss": 0.3758, + "step": 42485 + }, + { + "epoch": 2.39, + "grad_norm": 8.137382948152087, + "learning_rate": 1.1920634414683985e-06, + "loss": 0.3444, + "step": 42490 + }, + { + "epoch": 2.39, + "grad_norm": 6.251183036824891, + "learning_rate": 1.1910014641587737e-06, + "loss": 0.3453, + "step": 42495 + }, + { + "epoch": 2.39, + "grad_norm": 4.391307992928254, + "learning_rate": 1.1899398961428527e-06, + "loss": 0.3776, + "step": 42500 + }, + { + "epoch": 2.39, + "grad_norm": 4.725950315746725, + "learning_rate": 1.1888787375347037e-06, + "loss": 0.3646, + "step": 42505 + }, + { + "epoch": 2.4, + "grad_norm": 5.096875154747406, + "learning_rate": 1.1878179884483544e-06, + "loss": 0.3476, + "step": 42510 + }, + { + "epoch": 2.4, + "grad_norm": 7.9398674542331715, + "learning_rate": 1.1867576489977878e-06, + "loss": 0.3553, + "step": 42515 + }, + { + "epoch": 2.4, + "grad_norm": 4.314723165258239, + "learning_rate": 1.18569771929694e-06, + "loss": 0.365, + "step": 42520 + }, + { + "epoch": 2.4, + "grad_norm": 4.729494773691787, + "learning_rate": 1.1846381994597073e-06, + "loss": 0.3549, + "step": 42525 + }, + { + "epoch": 2.4, + "grad_norm": 5.281703371643079, + "learning_rate": 1.1835790895999365e-06, + "loss": 0.3382, + "step": 42530 + }, + { + "epoch": 2.4, + "grad_norm": 4.897607832496858, + "learning_rate": 1.182520389831438e-06, + "loss": 0.3529, + "step": 42535 + }, + { + "epoch": 2.4, + "grad_norm": 4.540451706205725, + "learning_rate": 1.1814621002679694e-06, + "loss": 0.3613, + "step": 42540 + }, + { + "epoch": 2.4, + "grad_norm": 5.224883443222026, + "learning_rate": 1.1804042210232525e-06, + "loss": 0.3803, + "step": 42545 + }, + { + "epoch": 2.4, + "grad_norm": 5.624150968078994, + "learning_rate": 1.1793467522109587e-06, + "loss": 0.3707, + "step": 42550 + }, + { + "epoch": 2.4, + "grad_norm": 5.059456294473843, + "learning_rate": 1.178289693944717e-06, + "loss": 0.3825, + "step": 42555 + }, + { + "epoch": 2.4, + "grad_norm": 7.0517341329358665, + "learning_rate": 1.1772330463381154e-06, + "loss": 0.3544, + "step": 42560 + }, + { + "epoch": 2.4, + "grad_norm": 4.6496531285395495, + "learning_rate": 1.1761768095046933e-06, + "loss": 0.3919, + "step": 42565 + }, + { + "epoch": 2.4, + "grad_norm": 7.2775145645304, + "learning_rate": 1.1751209835579497e-06, + "loss": 0.3636, + "step": 42570 + }, + { + "epoch": 2.4, + "grad_norm": 5.936966101273284, + "learning_rate": 1.1740655686113373e-06, + "loss": 0.3707, + "step": 42575 + }, + { + "epoch": 2.4, + "grad_norm": 5.170511625514858, + "learning_rate": 1.173010564778263e-06, + "loss": 0.3662, + "step": 42580 + }, + { + "epoch": 2.4, + "grad_norm": 5.469860799493985, + "learning_rate": 1.1719559721720948e-06, + "loss": 0.3556, + "step": 42585 + }, + { + "epoch": 2.4, + "grad_norm": 5.509544621385367, + "learning_rate": 1.1709017909061504e-06, + "loss": 0.3343, + "step": 42590 + }, + { + "epoch": 2.4, + "grad_norm": 4.839685983344473, + "learning_rate": 1.1698480210937092e-06, + "loss": 0.3686, + "step": 42595 + }, + { + "epoch": 2.4, + "grad_norm": 5.258152205207002, + "learning_rate": 1.168794662848e-06, + "loss": 0.3275, + "step": 42600 + }, + { + "epoch": 2.4, + "grad_norm": 4.706633011505221, + "learning_rate": 1.167741716282213e-06, + "loss": 0.3826, + "step": 42605 + }, + { + "epoch": 2.4, + "grad_norm": 4.916007626626051, + "learning_rate": 1.1666891815094933e-06, + "loss": 0.3816, + "step": 42610 + }, + { + "epoch": 2.4, + "grad_norm": 6.4945402078139, + "learning_rate": 1.1656370586429366e-06, + "loss": 0.3805, + "step": 42615 + }, + { + "epoch": 2.4, + "grad_norm": 5.840101890634254, + "learning_rate": 1.1645853477956025e-06, + "loss": 0.3644, + "step": 42620 + }, + { + "epoch": 2.4, + "grad_norm": 4.07723512998222, + "learning_rate": 1.1635340490804981e-06, + "loss": 0.3758, + "step": 42625 + }, + { + "epoch": 2.4, + "grad_norm": 5.013491122624717, + "learning_rate": 1.1624831626105932e-06, + "loss": 0.3643, + "step": 42630 + }, + { + "epoch": 2.4, + "grad_norm": 4.7561952960882845, + "learning_rate": 1.1614326884988086e-06, + "loss": 0.3508, + "step": 42635 + }, + { + "epoch": 2.4, + "grad_norm": 6.28442183659094, + "learning_rate": 1.1603826268580209e-06, + "loss": 0.3568, + "step": 42640 + }, + { + "epoch": 2.4, + "grad_norm": 5.271824491240048, + "learning_rate": 1.159332977801067e-06, + "loss": 0.3935, + "step": 42645 + }, + { + "epoch": 2.4, + "grad_norm": 4.488640991513405, + "learning_rate": 1.1582837414407332e-06, + "loss": 0.3315, + "step": 42650 + }, + { + "epoch": 2.4, + "grad_norm": 8.378671217242998, + "learning_rate": 1.1572349178897686e-06, + "loss": 0.4071, + "step": 42655 + }, + { + "epoch": 2.4, + "grad_norm": 4.853148764565508, + "learning_rate": 1.1561865072608702e-06, + "loss": 0.3498, + "step": 42660 + }, + { + "epoch": 2.4, + "grad_norm": 5.1490141479757385, + "learning_rate": 1.155138509666695e-06, + "loss": 0.3656, + "step": 42665 + }, + { + "epoch": 2.4, + "grad_norm": 5.224691882158689, + "learning_rate": 1.1540909252198568e-06, + "loss": 0.3624, + "step": 42670 + }, + { + "epoch": 2.4, + "grad_norm": 5.185636218938329, + "learning_rate": 1.1530437540329203e-06, + "loss": 0.3915, + "step": 42675 + }, + { + "epoch": 2.4, + "grad_norm": 4.770407006739393, + "learning_rate": 1.1519969962184124e-06, + "loss": 0.3896, + "step": 42680 + }, + { + "epoch": 2.41, + "grad_norm": 5.192529260782055, + "learning_rate": 1.1509506518888074e-06, + "loss": 0.3236, + "step": 42685 + }, + { + "epoch": 2.41, + "grad_norm": 4.517305542649535, + "learning_rate": 1.1499047211565445e-06, + "loss": 0.3724, + "step": 42690 + }, + { + "epoch": 2.41, + "grad_norm": 5.4512329579728265, + "learning_rate": 1.148859204134009e-06, + "loss": 0.3724, + "step": 42695 + }, + { + "epoch": 2.41, + "grad_norm": 7.542600508957431, + "learning_rate": 1.1478141009335486e-06, + "loss": 0.3566, + "step": 42700 + }, + { + "epoch": 2.41, + "grad_norm": 4.840329621827059, + "learning_rate": 1.1467694116674654e-06, + "loss": 0.3592, + "step": 42705 + }, + { + "epoch": 2.41, + "grad_norm": 6.726700809548532, + "learning_rate": 1.145725136448013e-06, + "loss": 0.3916, + "step": 42710 + }, + { + "epoch": 2.41, + "grad_norm": 4.823408757694335, + "learning_rate": 1.1446812753874066e-06, + "loss": 0.3442, + "step": 42715 + }, + { + "epoch": 2.41, + "grad_norm": 4.880849935296445, + "learning_rate": 1.1436378285978117e-06, + "loss": 0.3552, + "step": 42720 + }, + { + "epoch": 2.41, + "grad_norm": 4.581109594625088, + "learning_rate": 1.1425947961913503e-06, + "loss": 0.3475, + "step": 42725 + }, + { + "epoch": 2.41, + "grad_norm": 5.275388164638105, + "learning_rate": 1.1415521782801036e-06, + "loss": 0.3282, + "step": 42730 + }, + { + "epoch": 2.41, + "grad_norm": 4.677484122563242, + "learning_rate": 1.1405099749761022e-06, + "loss": 0.3294, + "step": 42735 + }, + { + "epoch": 2.41, + "grad_norm": 4.947584927636766, + "learning_rate": 1.1394681863913388e-06, + "loss": 0.3728, + "step": 42740 + }, + { + "epoch": 2.41, + "grad_norm": 5.102466767534395, + "learning_rate": 1.1384268126377568e-06, + "loss": 0.3935, + "step": 42745 + }, + { + "epoch": 2.41, + "grad_norm": 5.320546329072317, + "learning_rate": 1.1373858538272547e-06, + "loss": 0.3589, + "step": 42750 + }, + { + "epoch": 2.41, + "grad_norm": 4.789518230043431, + "learning_rate": 1.1363453100716908e-06, + "loss": 0.3497, + "step": 42755 + }, + { + "epoch": 2.41, + "grad_norm": 4.505961748227482, + "learning_rate": 1.1353051814828736e-06, + "loss": 0.3812, + "step": 42760 + }, + { + "epoch": 2.41, + "grad_norm": 4.541489597493219, + "learning_rate": 1.1342654681725724e-06, + "loss": 0.3666, + "step": 42765 + }, + { + "epoch": 2.41, + "grad_norm": 5.79090840679571, + "learning_rate": 1.1332261702525072e-06, + "loss": 0.3465, + "step": 42770 + }, + { + "epoch": 2.41, + "grad_norm": 4.566615663460663, + "learning_rate": 1.1321872878343542e-06, + "loss": 0.3235, + "step": 42775 + }, + { + "epoch": 2.41, + "grad_norm": 5.302393540097486, + "learning_rate": 1.1311488210297472e-06, + "loss": 0.3756, + "step": 42780 + }, + { + "epoch": 2.41, + "grad_norm": 5.282567677613329, + "learning_rate": 1.130110769950275e-06, + "loss": 0.35, + "step": 42785 + }, + { + "epoch": 2.41, + "grad_norm": 5.50860637000532, + "learning_rate": 1.1290731347074784e-06, + "loss": 0.3585, + "step": 42790 + }, + { + "epoch": 2.41, + "grad_norm": 4.8996798469259755, + "learning_rate": 1.1280359154128572e-06, + "loss": 0.3395, + "step": 42795 + }, + { + "epoch": 2.41, + "grad_norm": 4.606964117917527, + "learning_rate": 1.1269991121778667e-06, + "loss": 0.3148, + "step": 42800 + }, + { + "epoch": 2.41, + "grad_norm": 4.733248327210414, + "learning_rate": 1.1259627251139144e-06, + "loss": 0.3563, + "step": 42805 + }, + { + "epoch": 2.41, + "grad_norm": 4.947891824404803, + "learning_rate": 1.1249267543323633e-06, + "loss": 0.35, + "step": 42810 + }, + { + "epoch": 2.41, + "grad_norm": 5.490153591450526, + "learning_rate": 1.1238911999445356e-06, + "loss": 0.3395, + "step": 42815 + }, + { + "epoch": 2.41, + "grad_norm": 4.539950352977154, + "learning_rate": 1.122856062061704e-06, + "loss": 0.3653, + "step": 42820 + }, + { + "epoch": 2.41, + "grad_norm": 4.6516196029149315, + "learning_rate": 1.1218213407951006e-06, + "loss": 0.3541, + "step": 42825 + }, + { + "epoch": 2.41, + "grad_norm": 4.396347971692736, + "learning_rate": 1.1207870362559104e-06, + "loss": 0.3231, + "step": 42830 + }, + { + "epoch": 2.41, + "grad_norm": 5.001218368092357, + "learning_rate": 1.119753148555272e-06, + "loss": 0.3597, + "step": 42835 + }, + { + "epoch": 2.41, + "grad_norm": 5.1681320005852225, + "learning_rate": 1.1187196778042835e-06, + "loss": 0.3669, + "step": 42840 + }, + { + "epoch": 2.41, + "grad_norm": 4.593220039690747, + "learning_rate": 1.1176866241139938e-06, + "loss": 0.3589, + "step": 42845 + }, + { + "epoch": 2.41, + "grad_norm": 4.039168848888276, + "learning_rate": 1.116653987595412e-06, + "loss": 0.3171, + "step": 42850 + }, + { + "epoch": 2.41, + "grad_norm": 4.952587473166305, + "learning_rate": 1.1156217683594962e-06, + "loss": 0.3621, + "step": 42855 + }, + { + "epoch": 2.41, + "grad_norm": 5.259426043426545, + "learning_rate": 1.114589966517166e-06, + "loss": 0.3396, + "step": 42860 + }, + { + "epoch": 2.42, + "grad_norm": 5.18436919600694, + "learning_rate": 1.113558582179291e-06, + "loss": 0.3954, + "step": 42865 + }, + { + "epoch": 2.42, + "grad_norm": 4.869072471229186, + "learning_rate": 1.1125276154566978e-06, + "loss": 0.357, + "step": 42870 + }, + { + "epoch": 2.42, + "grad_norm": 5.491125796378438, + "learning_rate": 1.1114970664601687e-06, + "loss": 0.3881, + "step": 42875 + }, + { + "epoch": 2.42, + "grad_norm": 4.614701961113036, + "learning_rate": 1.1104669353004426e-06, + "loss": 0.3985, + "step": 42880 + }, + { + "epoch": 2.42, + "grad_norm": 5.400489516627011, + "learning_rate": 1.1094372220882082e-06, + "loss": 0.3709, + "step": 42885 + }, + { + "epoch": 2.42, + "grad_norm": 6.574379662364799, + "learning_rate": 1.1084079269341168e-06, + "loss": 0.3359, + "step": 42890 + }, + { + "epoch": 2.42, + "grad_norm": 6.713964177768655, + "learning_rate": 1.1073790499487668e-06, + "loss": 0.3641, + "step": 42895 + }, + { + "epoch": 2.42, + "grad_norm": 4.612098728645514, + "learning_rate": 1.1063505912427187e-06, + "loss": 0.3511, + "step": 42900 + }, + { + "epoch": 2.42, + "grad_norm": 4.362508971976225, + "learning_rate": 1.105322550926482e-06, + "loss": 0.3225, + "step": 42905 + }, + { + "epoch": 2.42, + "grad_norm": 5.2506411846191146, + "learning_rate": 1.1042949291105275e-06, + "loss": 0.3573, + "step": 42910 + }, + { + "epoch": 2.42, + "grad_norm": 4.973191182475268, + "learning_rate": 1.103267725905276e-06, + "loss": 0.3522, + "step": 42915 + }, + { + "epoch": 2.42, + "grad_norm": 4.209285931134533, + "learning_rate": 1.1022409414211033e-06, + "loss": 0.3415, + "step": 42920 + }, + { + "epoch": 2.42, + "grad_norm": 4.143845947568642, + "learning_rate": 1.1012145757683446e-06, + "loss": 0.3852, + "step": 42925 + }, + { + "epoch": 2.42, + "grad_norm": 5.335144342818536, + "learning_rate": 1.1001886290572855e-06, + "loss": 0.3492, + "step": 42930 + }, + { + "epoch": 2.42, + "grad_norm": 4.484389042032344, + "learning_rate": 1.0991631013981708e-06, + "loss": 0.3546, + "step": 42935 + }, + { + "epoch": 2.42, + "grad_norm": 4.884075111752374, + "learning_rate": 1.0981379929011949e-06, + "loss": 0.3869, + "step": 42940 + }, + { + "epoch": 2.42, + "grad_norm": 5.038709618640821, + "learning_rate": 1.097113303676513e-06, + "loss": 0.3774, + "step": 42945 + }, + { + "epoch": 2.42, + "grad_norm": 4.694482807404403, + "learning_rate": 1.0960890338342323e-06, + "loss": 0.3453, + "step": 42950 + }, + { + "epoch": 2.42, + "grad_norm": 4.90117682929359, + "learning_rate": 1.0950651834844123e-06, + "loss": 0.3636, + "step": 42955 + }, + { + "epoch": 2.42, + "grad_norm": 5.266971892132762, + "learning_rate": 1.0940417527370733e-06, + "loss": 0.3496, + "step": 42960 + }, + { + "epoch": 2.42, + "grad_norm": 5.075111698053483, + "learning_rate": 1.0930187417021849e-06, + "loss": 0.3943, + "step": 42965 + }, + { + "epoch": 2.42, + "grad_norm": 5.645309782701889, + "learning_rate": 1.091996150489676e-06, + "loss": 0.3845, + "step": 42970 + }, + { + "epoch": 2.42, + "grad_norm": 4.473858431297266, + "learning_rate": 1.0909739792094286e-06, + "loss": 0.3612, + "step": 42975 + }, + { + "epoch": 2.42, + "grad_norm": 4.901206521594583, + "learning_rate": 1.0899522279712782e-06, + "loss": 0.3569, + "step": 42980 + }, + { + "epoch": 2.42, + "grad_norm": 5.357958252896021, + "learning_rate": 1.0889308968850187e-06, + "loss": 0.3739, + "step": 42985 + }, + { + "epoch": 2.42, + "grad_norm": 6.188909256230952, + "learning_rate": 1.0879099860603937e-06, + "loss": 0.3809, + "step": 42990 + }, + { + "epoch": 2.42, + "grad_norm": 4.366863968183281, + "learning_rate": 1.0868894956071075e-06, + "loss": 0.3083, + "step": 42995 + }, + { + "epoch": 2.42, + "grad_norm": 7.3366329068196645, + "learning_rate": 1.0858694256348152e-06, + "loss": 0.3376, + "step": 43000 + }, + { + "epoch": 2.42, + "grad_norm": 6.066228333290048, + "learning_rate": 1.0848497762531251e-06, + "loss": 0.3336, + "step": 43005 + }, + { + "epoch": 2.42, + "grad_norm": 4.498027883393406, + "learning_rate": 1.0838305475716078e-06, + "loss": 0.3453, + "step": 43010 + }, + { + "epoch": 2.42, + "grad_norm": 5.302131309750226, + "learning_rate": 1.0828117396997794e-06, + "loss": 0.382, + "step": 43015 + }, + { + "epoch": 2.42, + "grad_norm": 6.5950376168562626, + "learning_rate": 1.0817933527471197e-06, + "loss": 0.36, + "step": 43020 + }, + { + "epoch": 2.42, + "grad_norm": 8.522134318094178, + "learning_rate": 1.0807753868230547e-06, + "loss": 0.3649, + "step": 43025 + }, + { + "epoch": 2.42, + "grad_norm": 7.83539842348374, + "learning_rate": 1.0797578420369725e-06, + "loss": 0.3991, + "step": 43030 + }, + { + "epoch": 2.42, + "grad_norm": 5.378807767473629, + "learning_rate": 1.078740718498212e-06, + "loss": 0.3962, + "step": 43035 + }, + { + "epoch": 2.43, + "grad_norm": 4.726310380418322, + "learning_rate": 1.0777240163160652e-06, + "loss": 0.3541, + "step": 43040 + }, + { + "epoch": 2.43, + "grad_norm": 5.589454089631747, + "learning_rate": 1.0767077355997847e-06, + "loss": 0.3663, + "step": 43045 + }, + { + "epoch": 2.43, + "grad_norm": 4.553776969200796, + "learning_rate": 1.0756918764585717e-06, + "loss": 0.3674, + "step": 43050 + }, + { + "epoch": 2.43, + "grad_norm": 4.684727431490814, + "learning_rate": 1.0746764390015867e-06, + "loss": 0.368, + "step": 43055 + }, + { + "epoch": 2.43, + "grad_norm": 7.246213698396501, + "learning_rate": 1.073661423337941e-06, + "loss": 0.3716, + "step": 43060 + }, + { + "epoch": 2.43, + "grad_norm": 4.579392698898941, + "learning_rate": 1.072646829576704e-06, + "loss": 0.3986, + "step": 43065 + }, + { + "epoch": 2.43, + "grad_norm": 4.698311915325066, + "learning_rate": 1.0716326578268988e-06, + "loss": 0.3397, + "step": 43070 + }, + { + "epoch": 2.43, + "grad_norm": 7.239743051282497, + "learning_rate": 1.0706189081975005e-06, + "loss": 0.3646, + "step": 43075 + }, + { + "epoch": 2.43, + "grad_norm": 4.668251711338613, + "learning_rate": 1.0696055807974436e-06, + "loss": 0.3549, + "step": 43080 + }, + { + "epoch": 2.43, + "grad_norm": 4.869417168100637, + "learning_rate": 1.0685926757356136e-06, + "loss": 0.3339, + "step": 43085 + }, + { + "epoch": 2.43, + "grad_norm": 4.660513286141073, + "learning_rate": 1.067580193120849e-06, + "loss": 0.3446, + "step": 43090 + }, + { + "epoch": 2.43, + "grad_norm": 4.536550325931741, + "learning_rate": 1.0665681330619499e-06, + "loss": 0.3698, + "step": 43095 + }, + { + "epoch": 2.43, + "grad_norm": 4.4121793911309295, + "learning_rate": 1.0655564956676628e-06, + "loss": 0.3514, + "step": 43100 + }, + { + "epoch": 2.43, + "grad_norm": 4.550953081210822, + "learning_rate": 1.0645452810466956e-06, + "loss": 0.3483, + "step": 43105 + }, + { + "epoch": 2.43, + "grad_norm": 6.033125702994512, + "learning_rate": 1.063534489307705e-06, + "loss": 0.3503, + "step": 43110 + }, + { + "epoch": 2.43, + "grad_norm": 4.810283334327166, + "learning_rate": 1.062524120559308e-06, + "loss": 0.3596, + "step": 43115 + }, + { + "epoch": 2.43, + "grad_norm": 5.9444574105968435, + "learning_rate": 1.0615141749100711e-06, + "loss": 0.3638, + "step": 43120 + }, + { + "epoch": 2.43, + "grad_norm": 4.680441825295457, + "learning_rate": 1.060504652468517e-06, + "loss": 0.341, + "step": 43125 + }, + { + "epoch": 2.43, + "grad_norm": 4.313650136760696, + "learning_rate": 1.059495553343126e-06, + "loss": 0.352, + "step": 43130 + }, + { + "epoch": 2.43, + "grad_norm": 4.542408747405393, + "learning_rate": 1.0584868776423267e-06, + "loss": 0.3736, + "step": 43135 + }, + { + "epoch": 2.43, + "grad_norm": 4.532313524525178, + "learning_rate": 1.0574786254745095e-06, + "loss": 0.3849, + "step": 43140 + }, + { + "epoch": 2.43, + "grad_norm": 4.9027632410283095, + "learning_rate": 1.056470796948012e-06, + "loss": 0.3601, + "step": 43145 + }, + { + "epoch": 2.43, + "grad_norm": 4.405975228332189, + "learning_rate": 1.0554633921711327e-06, + "loss": 0.3281, + "step": 43150 + }, + { + "epoch": 2.43, + "grad_norm": 4.789522903631732, + "learning_rate": 1.0544564112521193e-06, + "loss": 0.3376, + "step": 43155 + }, + { + "epoch": 2.43, + "grad_norm": 5.0475364087497745, + "learning_rate": 1.0534498542991773e-06, + "loss": 0.3674, + "step": 43160 + }, + { + "epoch": 2.43, + "grad_norm": 4.560707161998586, + "learning_rate": 1.0524437214204675e-06, + "loss": 0.3125, + "step": 43165 + }, + { + "epoch": 2.43, + "grad_norm": 4.614296182908391, + "learning_rate": 1.051438012724102e-06, + "loss": 0.3427, + "step": 43170 + }, + { + "epoch": 2.43, + "grad_norm": 4.488209880985004, + "learning_rate": 1.0504327283181465e-06, + "loss": 0.3493, + "step": 43175 + }, + { + "epoch": 2.43, + "grad_norm": 4.8567613946857975, + "learning_rate": 1.0494278683106268e-06, + "loss": 0.359, + "step": 43180 + }, + { + "epoch": 2.43, + "grad_norm": 5.397052687821712, + "learning_rate": 1.048423432809516e-06, + "loss": 0.3337, + "step": 43185 + }, + { + "epoch": 2.43, + "grad_norm": 4.629733818602774, + "learning_rate": 1.047419421922749e-06, + "loss": 0.3174, + "step": 43190 + }, + { + "epoch": 2.43, + "grad_norm": 4.634847597091976, + "learning_rate": 1.0464158357582078e-06, + "loss": 0.35, + "step": 43195 + }, + { + "epoch": 2.43, + "grad_norm": 4.876626716703233, + "learning_rate": 1.0454126744237342e-06, + "loss": 0.3542, + "step": 43200 + }, + { + "epoch": 2.43, + "grad_norm": 4.902056100936468, + "learning_rate": 1.0444099380271222e-06, + "loss": 0.3596, + "step": 43205 + }, + { + "epoch": 2.43, + "grad_norm": 4.506384279963405, + "learning_rate": 1.0434076266761184e-06, + "loss": 0.3707, + "step": 43210 + }, + { + "epoch": 2.43, + "grad_norm": 5.040252743715373, + "learning_rate": 1.0424057404784282e-06, + "loss": 0.3674, + "step": 43215 + }, + { + "epoch": 2.44, + "grad_norm": 5.047683764095478, + "learning_rate": 1.0414042795417056e-06, + "loss": 0.3452, + "step": 43220 + }, + { + "epoch": 2.44, + "grad_norm": 5.0903219258364345, + "learning_rate": 1.0404032439735655e-06, + "loss": 0.3472, + "step": 43225 + }, + { + "epoch": 2.44, + "grad_norm": 4.8555499240650635, + "learning_rate": 1.0394026338815716e-06, + "loss": 0.3105, + "step": 43230 + }, + { + "epoch": 2.44, + "grad_norm": 5.215914172669765, + "learning_rate": 1.0384024493732431e-06, + "loss": 0.3762, + "step": 43235 + }, + { + "epoch": 2.44, + "grad_norm": 4.543103488290535, + "learning_rate": 1.0374026905560553e-06, + "loss": 0.3377, + "step": 43240 + }, + { + "epoch": 2.44, + "grad_norm": 4.7548177995340914, + "learning_rate": 1.0364033575374377e-06, + "loss": 0.3668, + "step": 43245 + }, + { + "epoch": 2.44, + "grad_norm": 5.892691110099438, + "learning_rate": 1.0354044504247713e-06, + "loss": 0.3272, + "step": 43250 + }, + { + "epoch": 2.44, + "grad_norm": 4.67963894133362, + "learning_rate": 1.0344059693253944e-06, + "loss": 0.327, + "step": 43255 + }, + { + "epoch": 2.44, + "grad_norm": 4.5946017789824625, + "learning_rate": 1.033407914346597e-06, + "loss": 0.3309, + "step": 43260 + }, + { + "epoch": 2.44, + "grad_norm": 4.9379160496499654, + "learning_rate": 1.032410285595626e-06, + "loss": 0.3529, + "step": 43265 + }, + { + "epoch": 2.44, + "grad_norm": 4.580929557843822, + "learning_rate": 1.0314130831796787e-06, + "loss": 0.3578, + "step": 43270 + }, + { + "epoch": 2.44, + "grad_norm": 5.522582007485534, + "learning_rate": 1.0304163072059115e-06, + "loss": 0.3405, + "step": 43275 + }, + { + "epoch": 2.44, + "grad_norm": 5.714784217768867, + "learning_rate": 1.0294199577814302e-06, + "loss": 0.3355, + "step": 43280 + }, + { + "epoch": 2.44, + "grad_norm": 4.726055189950499, + "learning_rate": 1.028424035013299e-06, + "loss": 0.3629, + "step": 43285 + }, + { + "epoch": 2.44, + "grad_norm": 5.863915755657142, + "learning_rate": 1.0274285390085336e-06, + "loss": 0.3264, + "step": 43290 + }, + { + "epoch": 2.44, + "grad_norm": 5.136164250849754, + "learning_rate": 1.0264334698741024e-06, + "loss": 0.3601, + "step": 43295 + }, + { + "epoch": 2.44, + "grad_norm": 7.512042974519487, + "learning_rate": 1.025438827716933e-06, + "loss": 0.3655, + "step": 43300 + }, + { + "epoch": 2.44, + "grad_norm": 4.788476683195161, + "learning_rate": 1.0244446126439012e-06, + "loss": 0.364, + "step": 43305 + }, + { + "epoch": 2.44, + "grad_norm": 4.477570487488166, + "learning_rate": 1.0234508247618423e-06, + "loss": 0.3542, + "step": 43310 + }, + { + "epoch": 2.44, + "grad_norm": 5.142502088098034, + "learning_rate": 1.0224574641775426e-06, + "loss": 0.3746, + "step": 43315 + }, + { + "epoch": 2.44, + "grad_norm": 5.543256946603856, + "learning_rate": 1.021464530997741e-06, + "loss": 0.36, + "step": 43320 + }, + { + "epoch": 2.44, + "grad_norm": 4.352514628843138, + "learning_rate": 1.0204720253291357e-06, + "loss": 0.3744, + "step": 43325 + }, + { + "epoch": 2.44, + "grad_norm": 5.53314883612779, + "learning_rate": 1.0194799472783729e-06, + "loss": 0.3563, + "step": 43330 + }, + { + "epoch": 2.44, + "grad_norm": 4.780028672157098, + "learning_rate": 1.0184882969520565e-06, + "loss": 0.3811, + "step": 43335 + }, + { + "epoch": 2.44, + "grad_norm": 4.707034646085166, + "learning_rate": 1.0174970744567464e-06, + "loss": 0.3633, + "step": 43340 + }, + { + "epoch": 2.44, + "grad_norm": 4.552000652056377, + "learning_rate": 1.01650627989895e-06, + "loss": 0.368, + "step": 43345 + }, + { + "epoch": 2.44, + "grad_norm": 4.884540408630439, + "learning_rate": 1.015515913385136e-06, + "loss": 0.3549, + "step": 43350 + }, + { + "epoch": 2.44, + "grad_norm": 4.836787043370082, + "learning_rate": 1.0145259750217208e-06, + "loss": 0.3502, + "step": 43355 + }, + { + "epoch": 2.44, + "grad_norm": 4.974463732444184, + "learning_rate": 1.0135364649150798e-06, + "loss": 0.3524, + "step": 43360 + }, + { + "epoch": 2.44, + "grad_norm": 4.463646137280299, + "learning_rate": 1.0125473831715382e-06, + "loss": 0.3175, + "step": 43365 + }, + { + "epoch": 2.44, + "grad_norm": 4.77740823679646, + "learning_rate": 1.0115587298973794e-06, + "loss": 0.3332, + "step": 43370 + }, + { + "epoch": 2.44, + "grad_norm": 6.965352720357074, + "learning_rate": 1.0105705051988373e-06, + "loss": 0.3415, + "step": 43375 + }, + { + "epoch": 2.44, + "grad_norm": 5.1761805683383075, + "learning_rate": 1.0095827091820997e-06, + "loss": 0.3741, + "step": 43380 + }, + { + "epoch": 2.44, + "grad_norm": 5.311466247664548, + "learning_rate": 1.0085953419533128e-06, + "loss": 0.3586, + "step": 43385 + }, + { + "epoch": 2.44, + "grad_norm": 4.618416334591783, + "learning_rate": 1.0076084036185701e-06, + "loss": 0.3567, + "step": 43390 + }, + { + "epoch": 2.45, + "grad_norm": 4.466280404764819, + "learning_rate": 1.0066218942839257e-06, + "loss": 0.3581, + "step": 43395 + }, + { + "epoch": 2.45, + "grad_norm": 4.638259497898554, + "learning_rate": 1.0056358140553829e-06, + "loss": 0.3376, + "step": 43400 + }, + { + "epoch": 2.45, + "grad_norm": 4.555066038395794, + "learning_rate": 1.0046501630388988e-06, + "loss": 0.341, + "step": 43405 + }, + { + "epoch": 2.45, + "grad_norm": 5.192399290954197, + "learning_rate": 1.0036649413403893e-06, + "loss": 0.3805, + "step": 43410 + }, + { + "epoch": 2.45, + "grad_norm": 4.6710061054899805, + "learning_rate": 1.0026801490657173e-06, + "loss": 0.3746, + "step": 43415 + }, + { + "epoch": 2.45, + "grad_norm": 4.931779119046791, + "learning_rate": 1.001695786320706e-06, + "loss": 0.3754, + "step": 43420 + }, + { + "epoch": 2.45, + "grad_norm": 4.848174812797477, + "learning_rate": 1.0007118532111275e-06, + "loss": 0.3312, + "step": 43425 + }, + { + "epoch": 2.45, + "grad_norm": 4.034035453384377, + "learning_rate": 9.9972834984271e-07, + "loss": 0.3609, + "step": 43430 + }, + { + "epoch": 2.45, + "grad_norm": 4.634366268733935, + "learning_rate": 9.987452763211381e-07, + "loss": 0.3452, + "step": 43435 + }, + { + "epoch": 2.45, + "grad_norm": 5.415788569573445, + "learning_rate": 9.977626327520428e-07, + "loss": 0.363, + "step": 43440 + }, + { + "epoch": 2.45, + "grad_norm": 5.055785616882905, + "learning_rate": 9.96780419241018e-07, + "loss": 0.3263, + "step": 43445 + }, + { + "epoch": 2.45, + "grad_norm": 5.897312973020388, + "learning_rate": 9.95798635893603e-07, + "loss": 0.3471, + "step": 43450 + }, + { + "epoch": 2.45, + "grad_norm": 4.506357394633795, + "learning_rate": 9.948172828152974e-07, + "loss": 0.3296, + "step": 43455 + }, + { + "epoch": 2.45, + "grad_norm": 4.7375468390477105, + "learning_rate": 9.938363601115514e-07, + "loss": 0.3517, + "step": 43460 + }, + { + "epoch": 2.45, + "grad_norm": 5.676075268107502, + "learning_rate": 9.928558678877676e-07, + "loss": 0.3679, + "step": 43465 + }, + { + "epoch": 2.45, + "grad_norm": 5.4475776117003, + "learning_rate": 9.918758062493073e-07, + "loss": 0.348, + "step": 43470 + }, + { + "epoch": 2.45, + "grad_norm": 4.395459201622998, + "learning_rate": 9.90896175301479e-07, + "loss": 0.3621, + "step": 43475 + }, + { + "epoch": 2.45, + "grad_norm": 4.848568076893303, + "learning_rate": 9.899169751495518e-07, + "loss": 0.3494, + "step": 43480 + }, + { + "epoch": 2.45, + "grad_norm": 4.583999888517346, + "learning_rate": 9.889382058987434e-07, + "loss": 0.3519, + "step": 43485 + }, + { + "epoch": 2.45, + "grad_norm": 4.531939939673199, + "learning_rate": 9.879598676542257e-07, + "loss": 0.3785, + "step": 43490 + }, + { + "epoch": 2.45, + "grad_norm": 4.210142755797226, + "learning_rate": 9.869819605211277e-07, + "loss": 0.3531, + "step": 43495 + }, + { + "epoch": 2.45, + "grad_norm": 4.396586341580197, + "learning_rate": 9.86004484604528e-07, + "loss": 0.3488, + "step": 43500 + }, + { + "epoch": 2.45, + "grad_norm": 5.882581756833667, + "learning_rate": 9.850274400094627e-07, + "loss": 0.3387, + "step": 43505 + }, + { + "epoch": 2.45, + "grad_norm": 4.7304096470247705, + "learning_rate": 9.840508268409171e-07, + "loss": 0.3542, + "step": 43510 + }, + { + "epoch": 2.45, + "grad_norm": 5.190917960802614, + "learning_rate": 9.830746452038353e-07, + "loss": 0.3603, + "step": 43515 + }, + { + "epoch": 2.45, + "grad_norm": 5.0149739070574775, + "learning_rate": 9.820988952031092e-07, + "loss": 0.3486, + "step": 43520 + }, + { + "epoch": 2.45, + "grad_norm": 5.474595154363621, + "learning_rate": 9.811235769435896e-07, + "loss": 0.3539, + "step": 43525 + }, + { + "epoch": 2.45, + "grad_norm": 6.8841178221262185, + "learning_rate": 9.801486905300795e-07, + "loss": 0.3623, + "step": 43530 + }, + { + "epoch": 2.45, + "grad_norm": 4.820060284256322, + "learning_rate": 9.791742360673317e-07, + "loss": 0.3544, + "step": 43535 + }, + { + "epoch": 2.45, + "grad_norm": 4.496545564226054, + "learning_rate": 9.782002136600594e-07, + "loss": 0.3895, + "step": 43540 + }, + { + "epoch": 2.45, + "grad_norm": 5.524870034904284, + "learning_rate": 9.772266234129235e-07, + "loss": 0.3465, + "step": 43545 + }, + { + "epoch": 2.45, + "grad_norm": 4.7122840063230935, + "learning_rate": 9.7625346543054e-07, + "loss": 0.363, + "step": 43550 + }, + { + "epoch": 2.45, + "grad_norm": 4.824248073000432, + "learning_rate": 9.7528073981748e-07, + "loss": 0.3412, + "step": 43555 + }, + { + "epoch": 2.45, + "grad_norm": 4.7221990418797715, + "learning_rate": 9.743084466782666e-07, + "loss": 0.3534, + "step": 43560 + }, + { + "epoch": 2.45, + "grad_norm": 4.7464187992370315, + "learning_rate": 9.733365861173782e-07, + "loss": 0.3491, + "step": 43565 + }, + { + "epoch": 2.45, + "grad_norm": 4.658102500981284, + "learning_rate": 9.723651582392451e-07, + "loss": 0.3698, + "step": 43570 + }, + { + "epoch": 2.46, + "grad_norm": 4.818336991693816, + "learning_rate": 9.713941631482492e-07, + "loss": 0.343, + "step": 43575 + }, + { + "epoch": 2.46, + "grad_norm": 5.260098728332438, + "learning_rate": 9.704236009487317e-07, + "loss": 0.3443, + "step": 43580 + }, + { + "epoch": 2.46, + "grad_norm": 4.329658662759802, + "learning_rate": 9.694534717449806e-07, + "loss": 0.3523, + "step": 43585 + }, + { + "epoch": 2.46, + "grad_norm": 4.324974830145578, + "learning_rate": 9.684837756412436e-07, + "loss": 0.3357, + "step": 43590 + }, + { + "epoch": 2.46, + "grad_norm": 4.2250847620165395, + "learning_rate": 9.675145127417162e-07, + "loss": 0.3165, + "step": 43595 + }, + { + "epoch": 2.46, + "grad_norm": 4.629797728082159, + "learning_rate": 9.66545683150552e-07, + "loss": 0.374, + "step": 43600 + }, + { + "epoch": 2.46, + "grad_norm": 4.7039192606979325, + "learning_rate": 9.655772869718544e-07, + "loss": 0.3613, + "step": 43605 + }, + { + "epoch": 2.46, + "grad_norm": 5.408365979438267, + "learning_rate": 9.646093243096833e-07, + "loss": 0.3613, + "step": 43610 + }, + { + "epoch": 2.46, + "grad_norm": 4.898897090792314, + "learning_rate": 9.636417952680487e-07, + "loss": 0.3187, + "step": 43615 + }, + { + "epoch": 2.46, + "grad_norm": 4.784863823967556, + "learning_rate": 9.626746999509174e-07, + "loss": 0.363, + "step": 43620 + }, + { + "epoch": 2.46, + "grad_norm": 4.4615621138050425, + "learning_rate": 9.617080384622085e-07, + "loss": 0.3229, + "step": 43625 + }, + { + "epoch": 2.46, + "grad_norm": 4.79519809907856, + "learning_rate": 9.607418109057931e-07, + "loss": 0.3611, + "step": 43630 + }, + { + "epoch": 2.46, + "grad_norm": 5.463654097369577, + "learning_rate": 9.59776017385496e-07, + "loss": 0.3488, + "step": 43635 + }, + { + "epoch": 2.46, + "grad_norm": 4.5246864291427515, + "learning_rate": 9.588106580050976e-07, + "loss": 0.3139, + "step": 43640 + }, + { + "epoch": 2.46, + "grad_norm": 5.453200078871238, + "learning_rate": 9.578457328683276e-07, + "loss": 0.3497, + "step": 43645 + }, + { + "epoch": 2.46, + "grad_norm": 4.539786815358693, + "learning_rate": 9.568812420788747e-07, + "loss": 0.3647, + "step": 43650 + }, + { + "epoch": 2.46, + "grad_norm": 4.725255156121965, + "learning_rate": 9.55917185740376e-07, + "loss": 0.3636, + "step": 43655 + }, + { + "epoch": 2.46, + "grad_norm": 4.178215742944267, + "learning_rate": 9.549535639564223e-07, + "loss": 0.3581, + "step": 43660 + }, + { + "epoch": 2.46, + "grad_norm": 5.845248663914356, + "learning_rate": 9.539903768305615e-07, + "loss": 0.3925, + "step": 43665 + }, + { + "epoch": 2.46, + "grad_norm": 4.202138617898354, + "learning_rate": 9.530276244662895e-07, + "loss": 0.3565, + "step": 43670 + }, + { + "epoch": 2.46, + "grad_norm": 4.81832249558518, + "learning_rate": 9.520653069670615e-07, + "loss": 0.3339, + "step": 43675 + }, + { + "epoch": 2.46, + "grad_norm": 4.623950232561397, + "learning_rate": 9.511034244362799e-07, + "loss": 0.3553, + "step": 43680 + }, + { + "epoch": 2.46, + "grad_norm": 4.995389542139731, + "learning_rate": 9.501419769773057e-07, + "loss": 0.3518, + "step": 43685 + }, + { + "epoch": 2.46, + "grad_norm": 5.3223302996123385, + "learning_rate": 9.491809646934497e-07, + "loss": 0.3577, + "step": 43690 + }, + { + "epoch": 2.46, + "grad_norm": 5.011617296634846, + "learning_rate": 9.482203876879748e-07, + "loss": 0.322, + "step": 43695 + }, + { + "epoch": 2.46, + "grad_norm": 5.148526822984768, + "learning_rate": 9.472602460641017e-07, + "loss": 0.353, + "step": 43700 + }, + { + "epoch": 2.46, + "grad_norm": 4.801389982432906, + "learning_rate": 9.463005399250024e-07, + "loss": 0.3303, + "step": 43705 + }, + { + "epoch": 2.46, + "grad_norm": 4.734765831681855, + "learning_rate": 9.45341269373799e-07, + "loss": 0.3153, + "step": 43710 + }, + { + "epoch": 2.46, + "grad_norm": 4.798953351536035, + "learning_rate": 9.443824345135721e-07, + "loss": 0.3712, + "step": 43715 + }, + { + "epoch": 2.46, + "grad_norm": 4.577369443166325, + "learning_rate": 9.434240354473501e-07, + "loss": 0.3678, + "step": 43720 + }, + { + "epoch": 2.46, + "grad_norm": 5.8039294527983705, + "learning_rate": 9.424660722781198e-07, + "loss": 0.3272, + "step": 43725 + }, + { + "epoch": 2.46, + "grad_norm": 4.327270569822577, + "learning_rate": 9.415085451088163e-07, + "loss": 0.3406, + "step": 43730 + }, + { + "epoch": 2.46, + "grad_norm": 4.402462600693131, + "learning_rate": 9.405514540423322e-07, + "loss": 0.3424, + "step": 43735 + }, + { + "epoch": 2.46, + "grad_norm": 5.453115794429658, + "learning_rate": 9.395947991815085e-07, + "loss": 0.353, + "step": 43740 + }, + { + "epoch": 2.46, + "grad_norm": 4.687035856849718, + "learning_rate": 9.386385806291453e-07, + "loss": 0.3757, + "step": 43745 + }, + { + "epoch": 2.47, + "grad_norm": 4.541579300240977, + "learning_rate": 9.376827984879905e-07, + "loss": 0.3504, + "step": 43750 + }, + { + "epoch": 2.47, + "grad_norm": 5.024964366744509, + "learning_rate": 9.367274528607462e-07, + "loss": 0.3591, + "step": 43755 + }, + { + "epoch": 2.47, + "grad_norm": 4.805346917594688, + "learning_rate": 9.357725438500714e-07, + "loss": 0.356, + "step": 43760 + }, + { + "epoch": 2.47, + "grad_norm": 4.4208678309441, + "learning_rate": 9.348180715585714e-07, + "loss": 0.3488, + "step": 43765 + }, + { + "epoch": 2.47, + "grad_norm": 4.99069494095584, + "learning_rate": 9.338640360888124e-07, + "loss": 0.336, + "step": 43770 + }, + { + "epoch": 2.47, + "grad_norm": 4.563176922512066, + "learning_rate": 9.32910437543308e-07, + "loss": 0.3356, + "step": 43775 + }, + { + "epoch": 2.47, + "grad_norm": 4.340789279829952, + "learning_rate": 9.319572760245255e-07, + "loss": 0.3628, + "step": 43780 + }, + { + "epoch": 2.47, + "grad_norm": 11.775398930130766, + "learning_rate": 9.310045516348881e-07, + "loss": 0.3846, + "step": 43785 + }, + { + "epoch": 2.47, + "grad_norm": 7.451004191355735, + "learning_rate": 9.300522644767684e-07, + "loss": 0.3944, + "step": 43790 + }, + { + "epoch": 2.47, + "grad_norm": 16.889612762272115, + "learning_rate": 9.291004146524945e-07, + "loss": 0.3807, + "step": 43795 + }, + { + "epoch": 2.47, + "grad_norm": 17.221536250743267, + "learning_rate": 9.281490022643491e-07, + "loss": 0.3498, + "step": 43800 + }, + { + "epoch": 2.47, + "grad_norm": 7.738300720330171, + "learning_rate": 9.271980274145625e-07, + "loss": 0.3382, + "step": 43805 + }, + { + "epoch": 2.47, + "grad_norm": 6.644940660433678, + "learning_rate": 9.262474902053237e-07, + "loss": 0.3232, + "step": 43810 + }, + { + "epoch": 2.47, + "grad_norm": 7.737625667943532, + "learning_rate": 9.252973907387697e-07, + "loss": 0.3598, + "step": 43815 + }, + { + "epoch": 2.47, + "grad_norm": 9.977930268125835, + "learning_rate": 9.24347729116995e-07, + "loss": 0.3683, + "step": 43820 + }, + { + "epoch": 2.47, + "grad_norm": 5.329929468250905, + "learning_rate": 9.233985054420435e-07, + "loss": 0.3515, + "step": 43825 + }, + { + "epoch": 2.47, + "grad_norm": 4.791546485178549, + "learning_rate": 9.22449719815915e-07, + "loss": 0.3869, + "step": 43830 + }, + { + "epoch": 2.47, + "grad_norm": 6.678452082940854, + "learning_rate": 9.215013723405591e-07, + "loss": 0.3919, + "step": 43835 + }, + { + "epoch": 2.47, + "grad_norm": 11.55156797863798, + "learning_rate": 9.205534631178798e-07, + "loss": 0.3425, + "step": 43840 + }, + { + "epoch": 2.47, + "grad_norm": 4.507942225396343, + "learning_rate": 9.196059922497363e-07, + "loss": 0.3336, + "step": 43845 + }, + { + "epoch": 2.47, + "grad_norm": 9.686081776468555, + "learning_rate": 9.186589598379353e-07, + "loss": 0.3502, + "step": 43850 + }, + { + "epoch": 2.47, + "grad_norm": 8.422982866386748, + "learning_rate": 9.17712365984243e-07, + "loss": 0.3744, + "step": 43855 + }, + { + "epoch": 2.47, + "grad_norm": 7.827116610585081, + "learning_rate": 9.167662107903735e-07, + "loss": 0.3364, + "step": 43860 + }, + { + "epoch": 2.47, + "grad_norm": 5.033505566965495, + "learning_rate": 9.15820494357994e-07, + "loss": 0.3962, + "step": 43865 + }, + { + "epoch": 2.47, + "grad_norm": 4.539734137289886, + "learning_rate": 9.148752167887287e-07, + "loss": 0.3561, + "step": 43870 + }, + { + "epoch": 2.47, + "grad_norm": 4.110277854754629, + "learning_rate": 9.139303781841486e-07, + "loss": 0.3245, + "step": 43875 + }, + { + "epoch": 2.47, + "grad_norm": 5.7338676785625475, + "learning_rate": 9.12985978645784e-07, + "loss": 0.3622, + "step": 43880 + }, + { + "epoch": 2.47, + "grad_norm": 4.4537484441987, + "learning_rate": 9.120420182751128e-07, + "loss": 0.3496, + "step": 43885 + }, + { + "epoch": 2.47, + "grad_norm": 5.760842851071135, + "learning_rate": 9.110984971735675e-07, + "loss": 0.3346, + "step": 43890 + }, + { + "epoch": 2.47, + "grad_norm": 5.29210792447318, + "learning_rate": 9.101554154425357e-07, + "loss": 0.3356, + "step": 43895 + }, + { + "epoch": 2.47, + "grad_norm": 4.277067057868431, + "learning_rate": 9.092127731833533e-07, + "loss": 0.3405, + "step": 43900 + }, + { + "epoch": 2.47, + "grad_norm": 4.983623319622154, + "learning_rate": 9.082705704973138e-07, + "loss": 0.3732, + "step": 43905 + }, + { + "epoch": 2.47, + "grad_norm": 4.998600476909276, + "learning_rate": 9.073288074856584e-07, + "loss": 0.3385, + "step": 43910 + }, + { + "epoch": 2.47, + "grad_norm": 4.397915903607824, + "learning_rate": 9.063874842495857e-07, + "loss": 0.3521, + "step": 43915 + }, + { + "epoch": 2.47, + "grad_norm": 4.399127011999149, + "learning_rate": 9.054466008902452e-07, + "loss": 0.3443, + "step": 43920 + }, + { + "epoch": 2.47, + "grad_norm": 5.076140472896828, + "learning_rate": 9.04506157508736e-07, + "loss": 0.3478, + "step": 43925 + }, + { + "epoch": 2.48, + "grad_norm": 4.771040874518689, + "learning_rate": 9.035661542061164e-07, + "loss": 0.3282, + "step": 43930 + }, + { + "epoch": 2.48, + "grad_norm": 4.764495449884524, + "learning_rate": 9.026265910833915e-07, + "loss": 0.3307, + "step": 43935 + }, + { + "epoch": 2.48, + "grad_norm": 4.894289259401423, + "learning_rate": 9.016874682415239e-07, + "loss": 0.343, + "step": 43940 + }, + { + "epoch": 2.48, + "grad_norm": 4.6520390140199215, + "learning_rate": 9.007487857814246e-07, + "loss": 0.3389, + "step": 43945 + }, + { + "epoch": 2.48, + "grad_norm": 4.605151237190835, + "learning_rate": 8.998105438039584e-07, + "loss": 0.3908, + "step": 43950 + }, + { + "epoch": 2.48, + "grad_norm": 5.787140948496046, + "learning_rate": 8.988727424099464e-07, + "loss": 0.3341, + "step": 43955 + }, + { + "epoch": 2.48, + "grad_norm": 5.053377741736455, + "learning_rate": 8.979353817001563e-07, + "loss": 0.369, + "step": 43960 + }, + { + "epoch": 2.48, + "grad_norm": 4.875304711403996, + "learning_rate": 8.969984617753146e-07, + "loss": 0.3375, + "step": 43965 + }, + { + "epoch": 2.48, + "grad_norm": 5.005540197045185, + "learning_rate": 8.960619827360951e-07, + "loss": 0.3607, + "step": 43970 + }, + { + "epoch": 2.48, + "grad_norm": 4.7801687630338945, + "learning_rate": 8.951259446831284e-07, + "loss": 0.3476, + "step": 43975 + }, + { + "epoch": 2.48, + "grad_norm": 4.530451062249858, + "learning_rate": 8.941903477169944e-07, + "loss": 0.3494, + "step": 43980 + }, + { + "epoch": 2.48, + "grad_norm": 4.664961379976749, + "learning_rate": 8.932551919382277e-07, + "loss": 0.3471, + "step": 43985 + }, + { + "epoch": 2.48, + "grad_norm": 5.196541035106309, + "learning_rate": 8.92320477447316e-07, + "loss": 0.3411, + "step": 43990 + }, + { + "epoch": 2.48, + "grad_norm": 4.69161782574326, + "learning_rate": 8.913862043446963e-07, + "loss": 0.3694, + "step": 43995 + }, + { + "epoch": 2.48, + "grad_norm": 4.813935139318951, + "learning_rate": 8.904523727307635e-07, + "loss": 0.3726, + "step": 44000 + }, + { + "epoch": 2.48, + "grad_norm": 4.449693510106746, + "learning_rate": 8.895189827058598e-07, + "loss": 0.3252, + "step": 44005 + }, + { + "epoch": 2.48, + "grad_norm": 6.144595709635144, + "learning_rate": 8.885860343702807e-07, + "loss": 0.3758, + "step": 44010 + }, + { + "epoch": 2.48, + "grad_norm": 4.499951839589321, + "learning_rate": 8.876535278242788e-07, + "loss": 0.369, + "step": 44015 + }, + { + "epoch": 2.48, + "grad_norm": 4.64946762625513, + "learning_rate": 8.867214631680532e-07, + "loss": 0.3473, + "step": 44020 + }, + { + "epoch": 2.48, + "grad_norm": 4.440712374897632, + "learning_rate": 8.857898405017612e-07, + "loss": 0.36, + "step": 44025 + }, + { + "epoch": 2.48, + "grad_norm": 4.373268530531762, + "learning_rate": 8.848586599255076e-07, + "loss": 0.3513, + "step": 44030 + }, + { + "epoch": 2.48, + "grad_norm": 4.732680096831105, + "learning_rate": 8.83927921539352e-07, + "loss": 0.3456, + "step": 44035 + }, + { + "epoch": 2.48, + "grad_norm": 6.303962304466678, + "learning_rate": 8.829976254433076e-07, + "loss": 0.3408, + "step": 44040 + }, + { + "epoch": 2.48, + "grad_norm": 4.901822957771553, + "learning_rate": 8.820677717373372e-07, + "loss": 0.366, + "step": 44045 + }, + { + "epoch": 2.48, + "grad_norm": 6.761613390083313, + "learning_rate": 8.811383605213591e-07, + "loss": 0.3643, + "step": 44050 + }, + { + "epoch": 2.48, + "grad_norm": 5.090882910730315, + "learning_rate": 8.802093918952426e-07, + "loss": 0.3384, + "step": 44055 + }, + { + "epoch": 2.48, + "grad_norm": 4.560597953357254, + "learning_rate": 8.792808659588076e-07, + "loss": 0.358, + "step": 44060 + }, + { + "epoch": 2.48, + "grad_norm": 8.106390771733917, + "learning_rate": 8.783527828118299e-07, + "loss": 0.3393, + "step": 44065 + }, + { + "epoch": 2.48, + "grad_norm": 5.213727936338028, + "learning_rate": 8.774251425540369e-07, + "loss": 0.3574, + "step": 44070 + }, + { + "epoch": 2.48, + "grad_norm": 4.538798588504677, + "learning_rate": 8.764979452851058e-07, + "loss": 0.3298, + "step": 44075 + }, + { + "epoch": 2.48, + "grad_norm": 5.664709588768496, + "learning_rate": 8.755711911046682e-07, + "loss": 0.3632, + "step": 44080 + }, + { + "epoch": 2.48, + "grad_norm": 4.620657667558471, + "learning_rate": 8.746448801123103e-07, + "loss": 0.3218, + "step": 44085 + }, + { + "epoch": 2.48, + "grad_norm": 5.057409884261421, + "learning_rate": 8.737190124075668e-07, + "loss": 0.3441, + "step": 44090 + }, + { + "epoch": 2.48, + "grad_norm": 5.105871557587499, + "learning_rate": 8.72793588089924e-07, + "loss": 0.3323, + "step": 44095 + }, + { + "epoch": 2.48, + "grad_norm": 4.995818818511657, + "learning_rate": 8.718686072588272e-07, + "loss": 0.3182, + "step": 44100 + }, + { + "epoch": 2.49, + "grad_norm": 5.697528507573346, + "learning_rate": 8.709440700136657e-07, + "loss": 0.3698, + "step": 44105 + }, + { + "epoch": 2.49, + "grad_norm": 4.827599048255153, + "learning_rate": 8.700199764537876e-07, + "loss": 0.3809, + "step": 44110 + }, + { + "epoch": 2.49, + "grad_norm": 4.558622120621011, + "learning_rate": 8.690963266784907e-07, + "loss": 0.3368, + "step": 44115 + }, + { + "epoch": 2.49, + "grad_norm": 4.491501574733111, + "learning_rate": 8.681731207870231e-07, + "loss": 0.3596, + "step": 44120 + }, + { + "epoch": 2.49, + "grad_norm": 4.766531291782006, + "learning_rate": 8.672503588785897e-07, + "loss": 0.3342, + "step": 44125 + }, + { + "epoch": 2.49, + "grad_norm": 4.626581899254411, + "learning_rate": 8.663280410523434e-07, + "loss": 0.3654, + "step": 44130 + }, + { + "epoch": 2.49, + "grad_norm": 4.2621047605774915, + "learning_rate": 8.654061674073938e-07, + "loss": 0.3356, + "step": 44135 + }, + { + "epoch": 2.49, + "grad_norm": 4.749185535655736, + "learning_rate": 8.644847380427989e-07, + "loss": 0.3313, + "step": 44140 + }, + { + "epoch": 2.49, + "grad_norm": 5.8056168870327065, + "learning_rate": 8.635637530575691e-07, + "loss": 0.352, + "step": 44145 + }, + { + "epoch": 2.49, + "grad_norm": 5.676457696084985, + "learning_rate": 8.626432125506706e-07, + "loss": 0.3579, + "step": 44150 + }, + { + "epoch": 2.49, + "grad_norm": 4.7567903095705235, + "learning_rate": 8.617231166210172e-07, + "loss": 0.3419, + "step": 44155 + }, + { + "epoch": 2.49, + "grad_norm": 4.889250251464815, + "learning_rate": 8.608034653674785e-07, + "loss": 0.338, + "step": 44160 + }, + { + "epoch": 2.49, + "grad_norm": 4.603592082293575, + "learning_rate": 8.598842588888761e-07, + "loss": 0.3331, + "step": 44165 + }, + { + "epoch": 2.49, + "grad_norm": 5.202106032016938, + "learning_rate": 8.589654972839812e-07, + "loss": 0.3673, + "step": 44170 + }, + { + "epoch": 2.49, + "grad_norm": 4.103395208552594, + "learning_rate": 8.580471806515201e-07, + "loss": 0.3186, + "step": 44175 + }, + { + "epoch": 2.49, + "grad_norm": 5.630736793045195, + "learning_rate": 8.571293090901684e-07, + "loss": 0.3619, + "step": 44180 + }, + { + "epoch": 2.49, + "grad_norm": 4.541640109764103, + "learning_rate": 8.562118826985571e-07, + "loss": 0.3229, + "step": 44185 + }, + { + "epoch": 2.49, + "grad_norm": 4.441121404906713, + "learning_rate": 8.552949015752665e-07, + "loss": 0.3524, + "step": 44190 + }, + { + "epoch": 2.49, + "grad_norm": 4.957926598428815, + "learning_rate": 8.543783658188315e-07, + "loss": 0.3456, + "step": 44195 + }, + { + "epoch": 2.49, + "grad_norm": 4.335333781799094, + "learning_rate": 8.534622755277366e-07, + "loss": 0.3566, + "step": 44200 + }, + { + "epoch": 2.49, + "grad_norm": 5.55555271440869, + "learning_rate": 8.525466308004199e-07, + "loss": 0.3391, + "step": 44205 + }, + { + "epoch": 2.49, + "grad_norm": 4.929445293932128, + "learning_rate": 8.516314317352725e-07, + "loss": 0.3436, + "step": 44210 + }, + { + "epoch": 2.49, + "grad_norm": 5.190184008181839, + "learning_rate": 8.507166784306347e-07, + "loss": 0.3461, + "step": 44215 + }, + { + "epoch": 2.49, + "grad_norm": 4.66077158709162, + "learning_rate": 8.498023709848035e-07, + "loss": 0.3665, + "step": 44220 + }, + { + "epoch": 2.49, + "grad_norm": 5.250552212017445, + "learning_rate": 8.488885094960226e-07, + "loss": 0.3729, + "step": 44225 + }, + { + "epoch": 2.49, + "grad_norm": 4.953751773413122, + "learning_rate": 8.479750940624925e-07, + "loss": 0.3484, + "step": 44230 + }, + { + "epoch": 2.49, + "grad_norm": 4.580191792202555, + "learning_rate": 8.47062124782363e-07, + "loss": 0.3594, + "step": 44235 + }, + { + "epoch": 2.49, + "grad_norm": 4.372847104021162, + "learning_rate": 8.461496017537352e-07, + "loss": 0.3262, + "step": 44240 + }, + { + "epoch": 2.49, + "grad_norm": 4.4175859248973435, + "learning_rate": 8.452375250746658e-07, + "loss": 0.3549, + "step": 44245 + }, + { + "epoch": 2.49, + "grad_norm": 5.072502246571446, + "learning_rate": 8.443258948431604e-07, + "loss": 0.3317, + "step": 44250 + }, + { + "epoch": 2.49, + "grad_norm": 4.925112630434505, + "learning_rate": 8.434147111571772e-07, + "loss": 0.3783, + "step": 44255 + }, + { + "epoch": 2.49, + "grad_norm": 4.795085259124459, + "learning_rate": 8.425039741146296e-07, + "loss": 0.345, + "step": 44260 + }, + { + "epoch": 2.49, + "grad_norm": 4.693133935750688, + "learning_rate": 8.415936838133765e-07, + "loss": 0.3455, + "step": 44265 + }, + { + "epoch": 2.49, + "grad_norm": 4.667041642741462, + "learning_rate": 8.406838403512363e-07, + "loss": 0.3606, + "step": 44270 + }, + { + "epoch": 2.49, + "grad_norm": 4.55027899950803, + "learning_rate": 8.397744438259725e-07, + "loss": 0.3062, + "step": 44275 + }, + { + "epoch": 2.49, + "grad_norm": 5.587479111705073, + "learning_rate": 8.388654943353064e-07, + "loss": 0.3793, + "step": 44280 + }, + { + "epoch": 2.5, + "grad_norm": 4.4443166706002355, + "learning_rate": 8.37956991976907e-07, + "loss": 0.3528, + "step": 44285 + }, + { + "epoch": 2.5, + "grad_norm": 4.750046861597566, + "learning_rate": 8.370489368483969e-07, + "loss": 0.3295, + "step": 44290 + }, + { + "epoch": 2.5, + "grad_norm": 4.2278896348195545, + "learning_rate": 8.361413290473519e-07, + "loss": 0.3173, + "step": 44295 + }, + { + "epoch": 2.5, + "grad_norm": 5.352502786694757, + "learning_rate": 8.352341686712962e-07, + "loss": 0.3529, + "step": 44300 + }, + { + "epoch": 2.5, + "grad_norm": 5.331734081914186, + "learning_rate": 8.343274558177105e-07, + "loss": 0.3595, + "step": 44305 + }, + { + "epoch": 2.5, + "grad_norm": 4.723387677421115, + "learning_rate": 8.33421190584024e-07, + "loss": 0.3273, + "step": 44310 + }, + { + "epoch": 2.5, + "grad_norm": 4.64324574200726, + "learning_rate": 8.325153730676194e-07, + "loss": 0.3286, + "step": 44315 + }, + { + "epoch": 2.5, + "grad_norm": 4.404223866217227, + "learning_rate": 8.316100033658309e-07, + "loss": 0.3163, + "step": 44320 + }, + { + "epoch": 2.5, + "grad_norm": 4.65372281204677, + "learning_rate": 8.307050815759426e-07, + "loss": 0.3414, + "step": 44325 + }, + { + "epoch": 2.5, + "grad_norm": 4.729993214905292, + "learning_rate": 8.298006077951953e-07, + "loss": 0.359, + "step": 44330 + }, + { + "epoch": 2.5, + "grad_norm": 4.269714409863494, + "learning_rate": 8.288965821207756e-07, + "loss": 0.3457, + "step": 44335 + }, + { + "epoch": 2.5, + "grad_norm": 4.542042151469218, + "learning_rate": 8.27993004649828e-07, + "loss": 0.3673, + "step": 44340 + }, + { + "epoch": 2.5, + "grad_norm": 4.7856233782340345, + "learning_rate": 8.270898754794432e-07, + "loss": 0.3841, + "step": 44345 + }, + { + "epoch": 2.5, + "grad_norm": 5.061868233838201, + "learning_rate": 8.26187194706668e-07, + "loss": 0.3367, + "step": 44350 + }, + { + "epoch": 2.5, + "grad_norm": 4.294338768524923, + "learning_rate": 8.252849624285004e-07, + "loss": 0.3368, + "step": 44355 + }, + { + "epoch": 2.5, + "grad_norm": 4.725613968867999, + "learning_rate": 8.243831787418866e-07, + "loss": 0.339, + "step": 44360 + }, + { + "epoch": 2.5, + "grad_norm": 4.93660765090371, + "learning_rate": 8.234818437437298e-07, + "loss": 0.3973, + "step": 44365 + }, + { + "epoch": 2.5, + "grad_norm": 4.914833726032178, + "learning_rate": 8.225809575308818e-07, + "loss": 0.3575, + "step": 44370 + }, + { + "epoch": 2.5, + "grad_norm": 4.60223427380691, + "learning_rate": 8.216805202001443e-07, + "loss": 0.3312, + "step": 44375 + }, + { + "epoch": 2.5, + "grad_norm": 4.865653198684339, + "learning_rate": 8.20780531848277e-07, + "loss": 0.3528, + "step": 44380 + }, + { + "epoch": 2.5, + "grad_norm": 5.373911080635319, + "learning_rate": 8.198809925719841e-07, + "loss": 0.378, + "step": 44385 + }, + { + "epoch": 2.5, + "grad_norm": 4.762112520158262, + "learning_rate": 8.189819024679285e-07, + "loss": 0.3459, + "step": 44390 + }, + { + "epoch": 2.5, + "grad_norm": 4.567877158922223, + "learning_rate": 8.180832616327184e-07, + "loss": 0.3714, + "step": 44395 + }, + { + "epoch": 2.5, + "grad_norm": 4.805688174382899, + "learning_rate": 8.171850701629191e-07, + "loss": 0.318, + "step": 44400 + }, + { + "epoch": 2.5, + "grad_norm": 4.475666983940325, + "learning_rate": 8.162873281550448e-07, + "loss": 0.337, + "step": 44405 + }, + { + "epoch": 2.5, + "grad_norm": 4.543551938126569, + "learning_rate": 8.153900357055594e-07, + "loss": 0.3198, + "step": 44410 + }, + { + "epoch": 2.5, + "grad_norm": 4.541585476119786, + "learning_rate": 8.144931929108841e-07, + "loss": 0.3159, + "step": 44415 + }, + { + "epoch": 2.5, + "grad_norm": 4.953033082317601, + "learning_rate": 8.13596799867386e-07, + "loss": 0.3171, + "step": 44420 + }, + { + "epoch": 2.5, + "grad_norm": 4.845871126148948, + "learning_rate": 8.12700856671389e-07, + "loss": 0.3366, + "step": 44425 + }, + { + "epoch": 2.5, + "grad_norm": 4.487990737133694, + "learning_rate": 8.118053634191637e-07, + "loss": 0.3813, + "step": 44430 + }, + { + "epoch": 2.5, + "grad_norm": 4.829448555182738, + "learning_rate": 8.109103202069373e-07, + "loss": 0.3039, + "step": 44435 + }, + { + "epoch": 2.5, + "grad_norm": 4.185810975738731, + "learning_rate": 8.100157271308834e-07, + "loss": 0.3346, + "step": 44440 + }, + { + "epoch": 2.5, + "grad_norm": 4.312857772746139, + "learning_rate": 8.09121584287132e-07, + "loss": 0.3331, + "step": 44445 + }, + { + "epoch": 2.5, + "grad_norm": 5.0701202927130815, + "learning_rate": 8.082278917717629e-07, + "loss": 0.3664, + "step": 44450 + }, + { + "epoch": 2.5, + "grad_norm": 4.357899393176421, + "learning_rate": 8.073346496808065e-07, + "loss": 0.3089, + "step": 44455 + }, + { + "epoch": 2.51, + "grad_norm": 4.1138905776558925, + "learning_rate": 8.064418581102445e-07, + "loss": 0.3335, + "step": 44460 + }, + { + "epoch": 2.51, + "grad_norm": 4.849758504003736, + "learning_rate": 8.055495171560135e-07, + "loss": 0.3541, + "step": 44465 + }, + { + "epoch": 2.51, + "grad_norm": 4.7556108709580585, + "learning_rate": 8.046576269139977e-07, + "loss": 0.3219, + "step": 44470 + }, + { + "epoch": 2.51, + "grad_norm": 5.328591524710588, + "learning_rate": 8.037661874800362e-07, + "loss": 0.3348, + "step": 44475 + }, + { + "epoch": 2.51, + "grad_norm": 5.198723147603352, + "learning_rate": 8.028751989499162e-07, + "loss": 0.328, + "step": 44480 + }, + { + "epoch": 2.51, + "grad_norm": 5.840623655667464, + "learning_rate": 8.019846614193799e-07, + "loss": 0.3743, + "step": 44485 + }, + { + "epoch": 2.51, + "grad_norm": 4.873340058137853, + "learning_rate": 8.010945749841198e-07, + "loss": 0.3417, + "step": 44490 + }, + { + "epoch": 2.51, + "grad_norm": 4.588646299405075, + "learning_rate": 8.002049397397776e-07, + "loss": 0.3209, + "step": 44495 + }, + { + "epoch": 2.51, + "grad_norm": 4.345110140095459, + "learning_rate": 7.993157557819503e-07, + "loss": 0.3321, + "step": 44500 + }, + { + "epoch": 2.51, + "grad_norm": 4.517240457719273, + "learning_rate": 7.984270232061836e-07, + "loss": 0.3311, + "step": 44505 + }, + { + "epoch": 2.51, + "grad_norm": 5.426640436410661, + "learning_rate": 7.975387421079767e-07, + "loss": 0.3442, + "step": 44510 + }, + { + "epoch": 2.51, + "grad_norm": 4.566775949798058, + "learning_rate": 7.966509125827793e-07, + "loss": 0.3243, + "step": 44515 + }, + { + "epoch": 2.51, + "grad_norm": 4.8314126750494415, + "learning_rate": 7.957635347259907e-07, + "loss": 0.3396, + "step": 44520 + }, + { + "epoch": 2.51, + "grad_norm": 5.074945417485195, + "learning_rate": 7.948766086329651e-07, + "loss": 0.3391, + "step": 44525 + }, + { + "epoch": 2.51, + "grad_norm": 6.219482845702791, + "learning_rate": 7.93990134399008e-07, + "loss": 0.3647, + "step": 44530 + }, + { + "epoch": 2.51, + "grad_norm": 4.562744980177607, + "learning_rate": 7.931041121193722e-07, + "loss": 0.3357, + "step": 44535 + }, + { + "epoch": 2.51, + "grad_norm": 4.787700461660074, + "learning_rate": 7.922185418892669e-07, + "loss": 0.3676, + "step": 44540 + }, + { + "epoch": 2.51, + "grad_norm": 5.402010946003515, + "learning_rate": 7.913334238038484e-07, + "loss": 0.3563, + "step": 44545 + }, + { + "epoch": 2.51, + "grad_norm": 5.21840161300093, + "learning_rate": 7.904487579582292e-07, + "loss": 0.3512, + "step": 44550 + }, + { + "epoch": 2.51, + "grad_norm": 4.4085572660437915, + "learning_rate": 7.895645444474681e-07, + "loss": 0.3544, + "step": 44555 + }, + { + "epoch": 2.51, + "grad_norm": 4.494106842057671, + "learning_rate": 7.886807833665799e-07, + "loss": 0.3337, + "step": 44560 + }, + { + "epoch": 2.51, + "grad_norm": 4.7858159887065765, + "learning_rate": 7.877974748105261e-07, + "loss": 0.3295, + "step": 44565 + }, + { + "epoch": 2.51, + "grad_norm": 5.846625073743843, + "learning_rate": 7.869146188742244e-07, + "loss": 0.3348, + "step": 44570 + }, + { + "epoch": 2.51, + "grad_norm": 4.9417794468001235, + "learning_rate": 7.860322156525413e-07, + "loss": 0.3383, + "step": 44575 + }, + { + "epoch": 2.51, + "grad_norm": 4.562123002643166, + "learning_rate": 7.851502652402926e-07, + "loss": 0.331, + "step": 44580 + }, + { + "epoch": 2.51, + "grad_norm": 4.828027079116812, + "learning_rate": 7.842687677322508e-07, + "loss": 0.3243, + "step": 44585 + }, + { + "epoch": 2.51, + "grad_norm": 4.518395277358132, + "learning_rate": 7.833877232231347e-07, + "loss": 0.3342, + "step": 44590 + }, + { + "epoch": 2.51, + "grad_norm": 5.133451535257724, + "learning_rate": 7.825071318076177e-07, + "loss": 0.3567, + "step": 44595 + }, + { + "epoch": 2.51, + "grad_norm": 4.636806447381211, + "learning_rate": 7.816269935803222e-07, + "loss": 0.3658, + "step": 44600 + }, + { + "epoch": 2.51, + "grad_norm": 4.569085031167726, + "learning_rate": 7.80747308635823e-07, + "loss": 0.3542, + "step": 44605 + }, + { + "epoch": 2.51, + "grad_norm": 4.529129237943756, + "learning_rate": 7.798680770686473e-07, + "loss": 0.3382, + "step": 44610 + }, + { + "epoch": 2.51, + "grad_norm": 5.081965625152124, + "learning_rate": 7.7898929897327e-07, + "loss": 0.347, + "step": 44615 + }, + { + "epoch": 2.51, + "grad_norm": 4.3032046904492685, + "learning_rate": 7.781109744441217e-07, + "loss": 0.3072, + "step": 44620 + }, + { + "epoch": 2.51, + "grad_norm": 4.243785155354974, + "learning_rate": 7.772331035755831e-07, + "loss": 0.347, + "step": 44625 + }, + { + "epoch": 2.51, + "grad_norm": 4.241055760001078, + "learning_rate": 7.763556864619825e-07, + "loss": 0.3502, + "step": 44630 + }, + { + "epoch": 2.51, + "grad_norm": 5.356448601790181, + "learning_rate": 7.754787231976052e-07, + "loss": 0.3243, + "step": 44635 + }, + { + "epoch": 2.52, + "grad_norm": 5.031269602851849, + "learning_rate": 7.746022138766823e-07, + "loss": 0.3346, + "step": 44640 + }, + { + "epoch": 2.52, + "grad_norm": 4.53741830317873, + "learning_rate": 7.737261585934003e-07, + "loss": 0.3324, + "step": 44645 + }, + { + "epoch": 2.52, + "grad_norm": 4.628902859046572, + "learning_rate": 7.728505574418938e-07, + "loss": 0.3618, + "step": 44650 + }, + { + "epoch": 2.52, + "grad_norm": 4.442259678807115, + "learning_rate": 7.719754105162514e-07, + "loss": 0.3546, + "step": 44655 + }, + { + "epoch": 2.52, + "grad_norm": 4.405580851793677, + "learning_rate": 7.711007179105113e-07, + "loss": 0.367, + "step": 44660 + }, + { + "epoch": 2.52, + "grad_norm": 4.821365305669165, + "learning_rate": 7.702264797186615e-07, + "loss": 0.3957, + "step": 44665 + }, + { + "epoch": 2.52, + "grad_norm": 6.29747534519174, + "learning_rate": 7.693526960346448e-07, + "loss": 0.3648, + "step": 44670 + }, + { + "epoch": 2.52, + "grad_norm": 4.983836888656855, + "learning_rate": 7.68479366952351e-07, + "loss": 0.3618, + "step": 44675 + }, + { + "epoch": 2.52, + "grad_norm": 5.943605713343998, + "learning_rate": 7.676064925656257e-07, + "loss": 0.33, + "step": 44680 + }, + { + "epoch": 2.52, + "grad_norm": 4.679080666542682, + "learning_rate": 7.667340729682615e-07, + "loss": 0.3202, + "step": 44685 + }, + { + "epoch": 2.52, + "grad_norm": 5.156348424319493, + "learning_rate": 7.65862108254003e-07, + "loss": 0.3289, + "step": 44690 + }, + { + "epoch": 2.52, + "grad_norm": 5.297079294747502, + "learning_rate": 7.649905985165484e-07, + "loss": 0.3693, + "step": 44695 + }, + { + "epoch": 2.52, + "grad_norm": 5.651999008907444, + "learning_rate": 7.641195438495435e-07, + "loss": 0.3369, + "step": 44700 + }, + { + "epoch": 2.52, + "grad_norm": 4.797409387390218, + "learning_rate": 7.632489443465896e-07, + "loss": 0.3401, + "step": 44705 + }, + { + "epoch": 2.52, + "grad_norm": 4.589892122850783, + "learning_rate": 7.623788001012333e-07, + "loss": 0.3656, + "step": 44710 + }, + { + "epoch": 2.52, + "grad_norm": 4.76219234641212, + "learning_rate": 7.615091112069772e-07, + "loss": 0.3618, + "step": 44715 + }, + { + "epoch": 2.52, + "grad_norm": 4.428078517908885, + "learning_rate": 7.606398777572738e-07, + "loss": 0.3447, + "step": 44720 + }, + { + "epoch": 2.52, + "grad_norm": 4.438532826939408, + "learning_rate": 7.597710998455244e-07, + "loss": 0.3425, + "step": 44725 + }, + { + "epoch": 2.52, + "grad_norm": 5.878997266613701, + "learning_rate": 7.589027775650854e-07, + "loss": 0.3581, + "step": 44730 + }, + { + "epoch": 2.52, + "grad_norm": 4.867534225517133, + "learning_rate": 7.580349110092583e-07, + "loss": 0.3405, + "step": 44735 + }, + { + "epoch": 2.52, + "grad_norm": 5.39431457738843, + "learning_rate": 7.57167500271303e-07, + "loss": 0.3456, + "step": 44740 + }, + { + "epoch": 2.52, + "grad_norm": 4.483054982849157, + "learning_rate": 7.563005454444245e-07, + "loss": 0.3244, + "step": 44745 + }, + { + "epoch": 2.52, + "grad_norm": 4.332740413440662, + "learning_rate": 7.554340466217803e-07, + "loss": 0.2978, + "step": 44750 + }, + { + "epoch": 2.52, + "grad_norm": 4.576924334818634, + "learning_rate": 7.54568003896482e-07, + "loss": 0.3411, + "step": 44755 + }, + { + "epoch": 2.52, + "grad_norm": 4.362288551470274, + "learning_rate": 7.537024173615865e-07, + "loss": 0.3488, + "step": 44760 + }, + { + "epoch": 2.52, + "grad_norm": 4.541460978341073, + "learning_rate": 7.528372871101081e-07, + "loss": 0.3459, + "step": 44765 + }, + { + "epoch": 2.52, + "grad_norm": 4.60757482332367, + "learning_rate": 7.519726132350069e-07, + "loss": 0.3405, + "step": 44770 + }, + { + "epoch": 2.52, + "grad_norm": 5.100913946874496, + "learning_rate": 7.511083958291953e-07, + "loss": 0.3524, + "step": 44775 + }, + { + "epoch": 2.52, + "grad_norm": 4.476606384585978, + "learning_rate": 7.502446349855403e-07, + "loss": 0.3519, + "step": 44780 + }, + { + "epoch": 2.52, + "grad_norm": 5.772859445135238, + "learning_rate": 7.493813307968528e-07, + "loss": 0.3498, + "step": 44785 + }, + { + "epoch": 2.52, + "grad_norm": 4.583888825155642, + "learning_rate": 7.485184833559023e-07, + "loss": 0.3384, + "step": 44790 + }, + { + "epoch": 2.52, + "grad_norm": 4.248775581687325, + "learning_rate": 7.476560927554027e-07, + "loss": 0.341, + "step": 44795 + }, + { + "epoch": 2.52, + "grad_norm": 4.481589917895012, + "learning_rate": 7.46794159088024e-07, + "loss": 0.3449, + "step": 44800 + }, + { + "epoch": 2.52, + "grad_norm": 5.139578821681772, + "learning_rate": 7.459326824463831e-07, + "loss": 0.3481, + "step": 44805 + }, + { + "epoch": 2.52, + "grad_norm": 4.941603480871794, + "learning_rate": 7.450716629230498e-07, + "loss": 0.3823, + "step": 44810 + }, + { + "epoch": 2.53, + "grad_norm": 4.373478099554834, + "learning_rate": 7.442111006105457e-07, + "loss": 0.3309, + "step": 44815 + }, + { + "epoch": 2.53, + "grad_norm": 4.671249747303654, + "learning_rate": 7.433509956013402e-07, + "loss": 0.3588, + "step": 44820 + }, + { + "epoch": 2.53, + "grad_norm": 6.264948532509474, + "learning_rate": 7.424913479878571e-07, + "loss": 0.3662, + "step": 44825 + }, + { + "epoch": 2.53, + "grad_norm": 4.885032567916461, + "learning_rate": 7.416321578624685e-07, + "loss": 0.3448, + "step": 44830 + }, + { + "epoch": 2.53, + "grad_norm": 5.939823260071416, + "learning_rate": 7.40773425317497e-07, + "loss": 0.334, + "step": 44835 + }, + { + "epoch": 2.53, + "grad_norm": 4.626916438486954, + "learning_rate": 7.399151504452201e-07, + "loss": 0.3181, + "step": 44840 + }, + { + "epoch": 2.53, + "grad_norm": 4.4233655638848886, + "learning_rate": 7.390573333378598e-07, + "loss": 0.3818, + "step": 44845 + }, + { + "epoch": 2.53, + "grad_norm": 5.402375823574998, + "learning_rate": 7.381999740875951e-07, + "loss": 0.355, + "step": 44850 + }, + { + "epoch": 2.53, + "grad_norm": 5.271480811664558, + "learning_rate": 7.373430727865521e-07, + "loss": 0.3444, + "step": 44855 + }, + { + "epoch": 2.53, + "grad_norm": 4.992301004960722, + "learning_rate": 7.364866295268069e-07, + "loss": 0.3463, + "step": 44860 + }, + { + "epoch": 2.53, + "grad_norm": 4.596595252823135, + "learning_rate": 7.356306444003913e-07, + "loss": 0.3157, + "step": 44865 + }, + { + "epoch": 2.53, + "grad_norm": 4.38779584756225, + "learning_rate": 7.347751174992818e-07, + "loss": 0.3479, + "step": 44870 + }, + { + "epoch": 2.53, + "grad_norm": 4.808865570577694, + "learning_rate": 7.339200489154113e-07, + "loss": 0.3121, + "step": 44875 + }, + { + "epoch": 2.53, + "grad_norm": 4.6246105385618295, + "learning_rate": 7.330654387406577e-07, + "loss": 0.3295, + "step": 44880 + }, + { + "epoch": 2.53, + "grad_norm": 4.800382089908357, + "learning_rate": 7.322112870668552e-07, + "loss": 0.3592, + "step": 44885 + }, + { + "epoch": 2.53, + "grad_norm": 5.050691840892069, + "learning_rate": 7.313575939857843e-07, + "loss": 0.3398, + "step": 44890 + }, + { + "epoch": 2.53, + "grad_norm": 4.5763900688178385, + "learning_rate": 7.305043595891793e-07, + "loss": 0.307, + "step": 44895 + }, + { + "epoch": 2.53, + "grad_norm": 4.690380999095491, + "learning_rate": 7.296515839687229e-07, + "loss": 0.362, + "step": 44900 + }, + { + "epoch": 2.53, + "grad_norm": 5.106088638126591, + "learning_rate": 7.287992672160505e-07, + "loss": 0.3437, + "step": 44905 + }, + { + "epoch": 2.53, + "grad_norm": 4.326280089942625, + "learning_rate": 7.279474094227479e-07, + "loss": 0.3167, + "step": 44910 + }, + { + "epoch": 2.53, + "grad_norm": 5.105439104678032, + "learning_rate": 7.270960106803504e-07, + "loss": 0.3561, + "step": 44915 + }, + { + "epoch": 2.53, + "grad_norm": 4.626778222128296, + "learning_rate": 7.262450710803426e-07, + "loss": 0.3346, + "step": 44920 + }, + { + "epoch": 2.53, + "grad_norm": 4.587701722220041, + "learning_rate": 7.253945907141652e-07, + "loss": 0.3488, + "step": 44925 + }, + { + "epoch": 2.53, + "grad_norm": 5.6224501817734716, + "learning_rate": 7.245445696732028e-07, + "loss": 0.3586, + "step": 44930 + }, + { + "epoch": 2.53, + "grad_norm": 4.42029361158793, + "learning_rate": 7.23695008048797e-07, + "loss": 0.338, + "step": 44935 + }, + { + "epoch": 2.53, + "grad_norm": 5.518060063351799, + "learning_rate": 7.228459059322351e-07, + "loss": 0.32, + "step": 44940 + }, + { + "epoch": 2.53, + "grad_norm": 4.28541057087806, + "learning_rate": 7.21997263414756e-07, + "loss": 0.3068, + "step": 44945 + }, + { + "epoch": 2.53, + "grad_norm": 4.693612685635041, + "learning_rate": 7.21149080587552e-07, + "loss": 0.3371, + "step": 44950 + }, + { + "epoch": 2.53, + "grad_norm": 4.465311875366154, + "learning_rate": 7.203013575417628e-07, + "loss": 0.3203, + "step": 44955 + }, + { + "epoch": 2.53, + "grad_norm": 4.811512264611589, + "learning_rate": 7.194540943684813e-07, + "loss": 0.3669, + "step": 44960 + }, + { + "epoch": 2.53, + "grad_norm": 4.340615069375246, + "learning_rate": 7.186072911587477e-07, + "loss": 0.3521, + "step": 44965 + }, + { + "epoch": 2.53, + "grad_norm": 4.889031936422385, + "learning_rate": 7.177609480035568e-07, + "loss": 0.3477, + "step": 44970 + }, + { + "epoch": 2.53, + "grad_norm": 4.927139064168435, + "learning_rate": 7.169150649938517e-07, + "loss": 0.3436, + "step": 44975 + }, + { + "epoch": 2.53, + "grad_norm": 4.752788813177939, + "learning_rate": 7.160696422205238e-07, + "loss": 0.3466, + "step": 44980 + }, + { + "epoch": 2.53, + "grad_norm": 4.980336606767357, + "learning_rate": 7.152246797744195e-07, + "loss": 0.3097, + "step": 44985 + }, + { + "epoch": 2.53, + "grad_norm": 5.27047234839068, + "learning_rate": 7.143801777463344e-07, + "loss": 0.3529, + "step": 44990 + }, + { + "epoch": 2.54, + "grad_norm": 4.639829298583398, + "learning_rate": 7.135361362270122e-07, + "loss": 0.38, + "step": 44995 + }, + { + "epoch": 2.54, + "grad_norm": 4.845379526215203, + "learning_rate": 7.126925553071506e-07, + "loss": 0.356, + "step": 45000 + }, + { + "epoch": 2.54, + "grad_norm": 6.376427871005003, + "learning_rate": 7.118494350773946e-07, + "loss": 0.3331, + "step": 45005 + }, + { + "epoch": 2.54, + "grad_norm": 4.7883684348232185, + "learning_rate": 7.110067756283429e-07, + "loss": 0.3592, + "step": 45010 + }, + { + "epoch": 2.54, + "grad_norm": 4.481909323704549, + "learning_rate": 7.101645770505406e-07, + "loss": 0.3049, + "step": 45015 + }, + { + "epoch": 2.54, + "grad_norm": 4.777910108906637, + "learning_rate": 7.093228394344881e-07, + "loss": 0.3344, + "step": 45020 + }, + { + "epoch": 2.54, + "grad_norm": 4.281601427551611, + "learning_rate": 7.084815628706327e-07, + "loss": 0.3446, + "step": 45025 + }, + { + "epoch": 2.54, + "grad_norm": 4.5747065573831325, + "learning_rate": 7.076407474493724e-07, + "loss": 0.3424, + "step": 45030 + }, + { + "epoch": 2.54, + "grad_norm": 5.031091193354879, + "learning_rate": 7.068003932610584e-07, + "loss": 0.3299, + "step": 45035 + }, + { + "epoch": 2.54, + "grad_norm": 4.612869879558003, + "learning_rate": 7.05960500395988e-07, + "loss": 0.3158, + "step": 45040 + }, + { + "epoch": 2.54, + "grad_norm": 4.979012364866844, + "learning_rate": 7.051210689444143e-07, + "loss": 0.3145, + "step": 45045 + }, + { + "epoch": 2.54, + "grad_norm": 4.7305499080220414, + "learning_rate": 7.042820989965355e-07, + "loss": 0.3421, + "step": 45050 + }, + { + "epoch": 2.54, + "grad_norm": 4.739869974815494, + "learning_rate": 7.034435906425047e-07, + "loss": 0.3426, + "step": 45055 + }, + { + "epoch": 2.54, + "grad_norm": 4.406403626184369, + "learning_rate": 7.026055439724222e-07, + "loss": 0.3588, + "step": 45060 + }, + { + "epoch": 2.54, + "grad_norm": 4.979411572308106, + "learning_rate": 7.017679590763388e-07, + "loss": 0.3183, + "step": 45065 + }, + { + "epoch": 2.54, + "grad_norm": 4.61023269706424, + "learning_rate": 7.009308360442585e-07, + "loss": 0.336, + "step": 45070 + }, + { + "epoch": 2.54, + "grad_norm": 5.518898134950466, + "learning_rate": 7.000941749661322e-07, + "loss": 0.3602, + "step": 45075 + }, + { + "epoch": 2.54, + "grad_norm": 5.958979467630164, + "learning_rate": 6.992579759318641e-07, + "loss": 0.3445, + "step": 45080 + }, + { + "epoch": 2.54, + "grad_norm": 5.632274520297883, + "learning_rate": 6.984222390313078e-07, + "loss": 0.3817, + "step": 45085 + }, + { + "epoch": 2.54, + "grad_norm": 7.139154209396844, + "learning_rate": 6.975869643542649e-07, + "loss": 0.3878, + "step": 45090 + }, + { + "epoch": 2.54, + "grad_norm": 4.87928049637677, + "learning_rate": 6.967521519904924e-07, + "loss": 0.3593, + "step": 45095 + }, + { + "epoch": 2.54, + "grad_norm": 4.675614618498371, + "learning_rate": 6.959178020296914e-07, + "loss": 0.332, + "step": 45100 + }, + { + "epoch": 2.54, + "grad_norm": 5.034894840758799, + "learning_rate": 6.95083914561519e-07, + "loss": 0.3697, + "step": 45105 + }, + { + "epoch": 2.54, + "grad_norm": 4.402565703964291, + "learning_rate": 6.942504896755781e-07, + "loss": 0.3291, + "step": 45110 + }, + { + "epoch": 2.54, + "grad_norm": 4.702696547043211, + "learning_rate": 6.934175274614258e-07, + "loss": 0.3273, + "step": 45115 + }, + { + "epoch": 2.54, + "grad_norm": 4.606186573893503, + "learning_rate": 6.925850280085666e-07, + "loss": 0.3238, + "step": 45120 + }, + { + "epoch": 2.54, + "grad_norm": 4.794160241758433, + "learning_rate": 6.917529914064547e-07, + "loss": 0.313, + "step": 45125 + }, + { + "epoch": 2.54, + "grad_norm": 4.9532447284861325, + "learning_rate": 6.909214177444995e-07, + "loss": 0.3491, + "step": 45130 + }, + { + "epoch": 2.54, + "grad_norm": 4.643774767180947, + "learning_rate": 6.900903071120535e-07, + "loss": 0.2958, + "step": 45135 + }, + { + "epoch": 2.54, + "grad_norm": 4.963371489528325, + "learning_rate": 6.892596595984269e-07, + "loss": 0.3375, + "step": 45140 + }, + { + "epoch": 2.54, + "grad_norm": 4.498468147812131, + "learning_rate": 6.884294752928738e-07, + "loss": 0.3285, + "step": 45145 + }, + { + "epoch": 2.54, + "grad_norm": 4.9257190081078335, + "learning_rate": 6.87599754284602e-07, + "loss": 0.3377, + "step": 45150 + }, + { + "epoch": 2.54, + "grad_norm": 4.498426066136765, + "learning_rate": 6.867704966627686e-07, + "loss": 0.3332, + "step": 45155 + }, + { + "epoch": 2.54, + "grad_norm": 7.1564111238574, + "learning_rate": 6.859417025164805e-07, + "loss": 0.3594, + "step": 45160 + }, + { + "epoch": 2.54, + "grad_norm": 4.31933813942458, + "learning_rate": 6.851133719347974e-07, + "loss": 0.3317, + "step": 45165 + }, + { + "epoch": 2.55, + "grad_norm": 4.825389142389356, + "learning_rate": 6.842855050067237e-07, + "loss": 0.351, + "step": 45170 + }, + { + "epoch": 2.55, + "grad_norm": 5.154041907647826, + "learning_rate": 6.834581018212195e-07, + "loss": 0.3359, + "step": 45175 + }, + { + "epoch": 2.55, + "grad_norm": 4.944834533182103, + "learning_rate": 6.826311624671938e-07, + "loss": 0.3291, + "step": 45180 + }, + { + "epoch": 2.55, + "grad_norm": 4.1692637406001305, + "learning_rate": 6.818046870335026e-07, + "loss": 0.3538, + "step": 45185 + }, + { + "epoch": 2.55, + "grad_norm": 5.310571888639479, + "learning_rate": 6.809786756089565e-07, + "loss": 0.3653, + "step": 45190 + }, + { + "epoch": 2.55, + "grad_norm": 6.149389655624969, + "learning_rate": 6.801531282823115e-07, + "loss": 0.3619, + "step": 45195 + }, + { + "epoch": 2.55, + "grad_norm": 8.066637430098174, + "learning_rate": 6.793280451422796e-07, + "loss": 0.3104, + "step": 45200 + }, + { + "epoch": 2.55, + "grad_norm": 4.639527242238606, + "learning_rate": 6.785034262775175e-07, + "loss": 0.3288, + "step": 45205 + }, + { + "epoch": 2.55, + "grad_norm": 4.760593620632931, + "learning_rate": 6.776792717766334e-07, + "loss": 0.3352, + "step": 45210 + }, + { + "epoch": 2.55, + "grad_norm": 4.530244222670664, + "learning_rate": 6.76855581728188e-07, + "loss": 0.3404, + "step": 45215 + }, + { + "epoch": 2.55, + "grad_norm": 4.357855873738075, + "learning_rate": 6.760323562206894e-07, + "loss": 0.3138, + "step": 45220 + }, + { + "epoch": 2.55, + "grad_norm": 4.766820495803968, + "learning_rate": 6.75209595342598e-07, + "loss": 0.3868, + "step": 45225 + }, + { + "epoch": 2.55, + "grad_norm": 4.939430013859291, + "learning_rate": 6.743872991823225e-07, + "loss": 0.3612, + "step": 45230 + }, + { + "epoch": 2.55, + "grad_norm": 4.598328199609213, + "learning_rate": 6.735654678282211e-07, + "loss": 0.3406, + "step": 45235 + }, + { + "epoch": 2.55, + "grad_norm": 5.021533912883923, + "learning_rate": 6.727441013686048e-07, + "loss": 0.3353, + "step": 45240 + }, + { + "epoch": 2.55, + "grad_norm": 4.702531077882601, + "learning_rate": 6.719231998917319e-07, + "loss": 0.3375, + "step": 45245 + }, + { + "epoch": 2.55, + "grad_norm": 4.807710449083324, + "learning_rate": 6.711027634858131e-07, + "loss": 0.3356, + "step": 45250 + }, + { + "epoch": 2.55, + "grad_norm": 4.393230274046004, + "learning_rate": 6.702827922390064e-07, + "loss": 0.3259, + "step": 45255 + }, + { + "epoch": 2.55, + "grad_norm": 4.621976352734638, + "learning_rate": 6.694632862394235e-07, + "loss": 0.3388, + "step": 45260 + }, + { + "epoch": 2.55, + "grad_norm": 5.227356045305042, + "learning_rate": 6.686442455751213e-07, + "loss": 0.3844, + "step": 45265 + }, + { + "epoch": 2.55, + "grad_norm": 4.613606798064376, + "learning_rate": 6.678256703341102e-07, + "loss": 0.3353, + "step": 45270 + }, + { + "epoch": 2.55, + "grad_norm": 4.913645735562954, + "learning_rate": 6.670075606043519e-07, + "loss": 0.3289, + "step": 45275 + }, + { + "epoch": 2.55, + "grad_norm": 5.129590843327389, + "learning_rate": 6.661899164737529e-07, + "loss": 0.3203, + "step": 45280 + }, + { + "epoch": 2.55, + "grad_norm": 5.261275238586701, + "learning_rate": 6.65372738030175e-07, + "loss": 0.3607, + "step": 45285 + }, + { + "epoch": 2.55, + "grad_norm": 4.473467542034233, + "learning_rate": 6.645560253614264e-07, + "loss": 0.3347, + "step": 45290 + }, + { + "epoch": 2.55, + "grad_norm": 5.880070134310221, + "learning_rate": 6.637397785552658e-07, + "loss": 0.3564, + "step": 45295 + }, + { + "epoch": 2.55, + "grad_norm": 4.4688638540940016, + "learning_rate": 6.62923997699404e-07, + "loss": 0.3515, + "step": 45300 + }, + { + "epoch": 2.55, + "grad_norm": 5.005617107071605, + "learning_rate": 6.621086828814987e-07, + "loss": 0.354, + "step": 45305 + }, + { + "epoch": 2.55, + "grad_norm": 5.13218954265486, + "learning_rate": 6.612938341891617e-07, + "loss": 0.3215, + "step": 45310 + }, + { + "epoch": 2.55, + "grad_norm": 4.595094142958957, + "learning_rate": 6.604794517099499e-07, + "loss": 0.3391, + "step": 45315 + }, + { + "epoch": 2.55, + "grad_norm": 4.22501493455035, + "learning_rate": 6.59665535531372e-07, + "loss": 0.3268, + "step": 45320 + }, + { + "epoch": 2.55, + "grad_norm": 5.130955010614197, + "learning_rate": 6.588520857408881e-07, + "loss": 0.3543, + "step": 45325 + }, + { + "epoch": 2.55, + "grad_norm": 4.501903308732604, + "learning_rate": 6.580391024259058e-07, + "loss": 0.3237, + "step": 45330 + }, + { + "epoch": 2.55, + "grad_norm": 4.741030478221836, + "learning_rate": 6.572265856737858e-07, + "loss": 0.3297, + "step": 45335 + }, + { + "epoch": 2.55, + "grad_norm": 4.929840417468209, + "learning_rate": 6.564145355718337e-07, + "loss": 0.3576, + "step": 45340 + }, + { + "epoch": 2.55, + "grad_norm": 4.996367194908822, + "learning_rate": 6.556029522073109e-07, + "loss": 0.3457, + "step": 45345 + }, + { + "epoch": 2.56, + "grad_norm": 5.053237158443029, + "learning_rate": 6.547918356674232e-07, + "loss": 0.3318, + "step": 45350 + }, + { + "epoch": 2.56, + "grad_norm": 4.260114867991111, + "learning_rate": 6.539811860393292e-07, + "loss": 0.316, + "step": 45355 + }, + { + "epoch": 2.56, + "grad_norm": 4.586001229471724, + "learning_rate": 6.531710034101385e-07, + "loss": 0.352, + "step": 45360 + }, + { + "epoch": 2.56, + "grad_norm": 4.597369569231543, + "learning_rate": 6.523612878669066e-07, + "loss": 0.3236, + "step": 45365 + }, + { + "epoch": 2.56, + "grad_norm": 4.5669840695910215, + "learning_rate": 6.51552039496643e-07, + "loss": 0.352, + "step": 45370 + }, + { + "epoch": 2.56, + "grad_norm": 3.900673091862434, + "learning_rate": 6.507432583863039e-07, + "loss": 0.342, + "step": 45375 + }, + { + "epoch": 2.56, + "grad_norm": 4.500785366803441, + "learning_rate": 6.499349446227948e-07, + "loss": 0.3356, + "step": 45380 + }, + { + "epoch": 2.56, + "grad_norm": 5.010166337084637, + "learning_rate": 6.491270982929759e-07, + "loss": 0.3354, + "step": 45385 + }, + { + "epoch": 2.56, + "grad_norm": 4.7542688293039514, + "learning_rate": 6.48319719483651e-07, + "loss": 0.3217, + "step": 45390 + }, + { + "epoch": 2.56, + "grad_norm": 4.746637686073376, + "learning_rate": 6.475128082815784e-07, + "loss": 0.3543, + "step": 45395 + }, + { + "epoch": 2.56, + "grad_norm": 4.750444987209024, + "learning_rate": 6.467063647734639e-07, + "loss": 0.328, + "step": 45400 + }, + { + "epoch": 2.56, + "grad_norm": 5.671654842897656, + "learning_rate": 6.45900389045962e-07, + "loss": 0.3354, + "step": 45405 + }, + { + "epoch": 2.56, + "grad_norm": 4.64219554285188, + "learning_rate": 6.450948811856799e-07, + "loss": 0.3465, + "step": 45410 + }, + { + "epoch": 2.56, + "grad_norm": 4.88899211877032, + "learning_rate": 6.44289841279172e-07, + "loss": 0.3179, + "step": 45415 + }, + { + "epoch": 2.56, + "grad_norm": 5.236730341052237, + "learning_rate": 6.434852694129451e-07, + "loss": 0.3394, + "step": 45420 + }, + { + "epoch": 2.56, + "grad_norm": 4.6235005034713454, + "learning_rate": 6.426811656734522e-07, + "loss": 0.3201, + "step": 45425 + }, + { + "epoch": 2.56, + "grad_norm": 4.552357981016617, + "learning_rate": 6.418775301470975e-07, + "loss": 0.339, + "step": 45430 + }, + { + "epoch": 2.56, + "grad_norm": 4.778345841882862, + "learning_rate": 6.410743629202359e-07, + "loss": 0.3385, + "step": 45435 + }, + { + "epoch": 2.56, + "grad_norm": 4.7549072480216825, + "learning_rate": 6.402716640791729e-07, + "loss": 0.3196, + "step": 45440 + }, + { + "epoch": 2.56, + "grad_norm": 4.258553257149767, + "learning_rate": 6.394694337101593e-07, + "loss": 0.3409, + "step": 45445 + }, + { + "epoch": 2.56, + "grad_norm": 4.856390728138945, + "learning_rate": 6.386676718993994e-07, + "loss": 0.3826, + "step": 45450 + }, + { + "epoch": 2.56, + "grad_norm": 4.347289307679359, + "learning_rate": 6.378663787330475e-07, + "loss": 0.3561, + "step": 45455 + }, + { + "epoch": 2.56, + "grad_norm": 4.637035878644347, + "learning_rate": 6.370655542972043e-07, + "loss": 0.333, + "step": 45460 + }, + { + "epoch": 2.56, + "grad_norm": 4.761770152061556, + "learning_rate": 6.362651986779217e-07, + "loss": 0.3456, + "step": 45465 + }, + { + "epoch": 2.56, + "grad_norm": 4.350322669144332, + "learning_rate": 6.354653119612031e-07, + "loss": 0.336, + "step": 45470 + }, + { + "epoch": 2.56, + "grad_norm": 4.55118386582183, + "learning_rate": 6.346658942329981e-07, + "loss": 0.3171, + "step": 45475 + }, + { + "epoch": 2.56, + "grad_norm": 4.181971097774808, + "learning_rate": 6.33866945579209e-07, + "loss": 0.3399, + "step": 45480 + }, + { + "epoch": 2.56, + "grad_norm": 4.9109321842733795, + "learning_rate": 6.330684660856862e-07, + "loss": 0.3514, + "step": 45485 + }, + { + "epoch": 2.56, + "grad_norm": 4.373303427649885, + "learning_rate": 6.322704558382281e-07, + "loss": 0.316, + "step": 45490 + }, + { + "epoch": 2.56, + "grad_norm": 4.326360984640099, + "learning_rate": 6.314729149225868e-07, + "loss": 0.3236, + "step": 45495 + }, + { + "epoch": 2.56, + "grad_norm": 5.740880807805324, + "learning_rate": 6.306758434244597e-07, + "loss": 0.3271, + "step": 45500 + }, + { + "epoch": 2.56, + "grad_norm": 4.865616169432333, + "learning_rate": 6.29879241429498e-07, + "loss": 0.369, + "step": 45505 + }, + { + "epoch": 2.56, + "grad_norm": 4.665183473917409, + "learning_rate": 6.290831090232979e-07, + "loss": 0.3752, + "step": 45510 + }, + { + "epoch": 2.56, + "grad_norm": 4.695214953843375, + "learning_rate": 6.282874462914074e-07, + "loss": 0.3228, + "step": 45515 + }, + { + "epoch": 2.56, + "grad_norm": 4.458721089517722, + "learning_rate": 6.274922533193256e-07, + "loss": 0.3181, + "step": 45520 + }, + { + "epoch": 2.57, + "grad_norm": 4.859867224898973, + "learning_rate": 6.266975301924977e-07, + "loss": 0.3176, + "step": 45525 + }, + { + "epoch": 2.57, + "grad_norm": 4.802227763539425, + "learning_rate": 6.259032769963208e-07, + "loss": 0.3374, + "step": 45530 + }, + { + "epoch": 2.57, + "grad_norm": 4.450749754037471, + "learning_rate": 6.251094938161423e-07, + "loss": 0.3373, + "step": 45535 + }, + { + "epoch": 2.57, + "grad_norm": 5.3726303450315696, + "learning_rate": 6.24316180737255e-07, + "loss": 0.3368, + "step": 45540 + }, + { + "epoch": 2.57, + "grad_norm": 4.149645236851189, + "learning_rate": 6.235233378449074e-07, + "loss": 0.3112, + "step": 45545 + }, + { + "epoch": 2.57, + "grad_norm": 5.2309138674718625, + "learning_rate": 6.227309652242902e-07, + "loss": 0.3453, + "step": 45550 + }, + { + "epoch": 2.57, + "grad_norm": 4.7130040876794315, + "learning_rate": 6.219390629605504e-07, + "loss": 0.3426, + "step": 45555 + }, + { + "epoch": 2.57, + "grad_norm": 6.8412221986098345, + "learning_rate": 6.211476311387793e-07, + "loss": 0.3321, + "step": 45560 + }, + { + "epoch": 2.57, + "grad_norm": 7.156391661882892, + "learning_rate": 6.203566698440216e-07, + "loss": 0.3167, + "step": 45565 + }, + { + "epoch": 2.57, + "grad_norm": 4.849943062979944, + "learning_rate": 6.195661791612684e-07, + "loss": 0.3254, + "step": 45570 + }, + { + "epoch": 2.57, + "grad_norm": 5.223070098014928, + "learning_rate": 6.187761591754604e-07, + "loss": 0.3541, + "step": 45575 + }, + { + "epoch": 2.57, + "grad_norm": 4.569147490751449, + "learning_rate": 6.179866099714915e-07, + "loss": 0.3659, + "step": 45580 + }, + { + "epoch": 2.57, + "grad_norm": 4.605059064041639, + "learning_rate": 6.171975316341993e-07, + "loss": 0.2976, + "step": 45585 + }, + { + "epoch": 2.57, + "grad_norm": 5.320355527719186, + "learning_rate": 6.164089242483762e-07, + "loss": 0.3378, + "step": 45590 + }, + { + "epoch": 2.57, + "grad_norm": 6.076168671196925, + "learning_rate": 6.156207878987591e-07, + "loss": 0.3459, + "step": 45595 + }, + { + "epoch": 2.57, + "grad_norm": 5.171272005921741, + "learning_rate": 6.148331226700394e-07, + "loss": 0.3014, + "step": 45600 + }, + { + "epoch": 2.57, + "grad_norm": 4.822150537689046, + "learning_rate": 6.140459286468537e-07, + "loss": 0.3476, + "step": 45605 + }, + { + "epoch": 2.57, + "grad_norm": 4.424041346320069, + "learning_rate": 6.132592059137887e-07, + "loss": 0.3543, + "step": 45610 + }, + { + "epoch": 2.57, + "grad_norm": 4.40380960558513, + "learning_rate": 6.124729545553837e-07, + "loss": 0.3399, + "step": 45615 + }, + { + "epoch": 2.57, + "grad_norm": 5.354273054194549, + "learning_rate": 6.116871746561215e-07, + "loss": 0.345, + "step": 45620 + }, + { + "epoch": 2.57, + "grad_norm": 4.716864858849761, + "learning_rate": 6.109018663004401e-07, + "loss": 0.3325, + "step": 45625 + }, + { + "epoch": 2.57, + "grad_norm": 4.808751125848432, + "learning_rate": 6.101170295727243e-07, + "loss": 0.3543, + "step": 45630 + }, + { + "epoch": 2.57, + "grad_norm": 4.666354198133943, + "learning_rate": 6.093326645573072e-07, + "loss": 0.3073, + "step": 45635 + }, + { + "epoch": 2.57, + "grad_norm": 4.472781175296722, + "learning_rate": 6.085487713384736e-07, + "loss": 0.3425, + "step": 45640 + }, + { + "epoch": 2.57, + "grad_norm": 4.091290667815048, + "learning_rate": 6.077653500004543e-07, + "loss": 0.3274, + "step": 45645 + }, + { + "epoch": 2.57, + "grad_norm": 4.384955573643156, + "learning_rate": 6.069824006274338e-07, + "loss": 0.3414, + "step": 45650 + }, + { + "epoch": 2.57, + "grad_norm": 4.716092923737294, + "learning_rate": 6.06199923303542e-07, + "loss": 0.3383, + "step": 45655 + }, + { + "epoch": 2.57, + "grad_norm": 4.819062499765147, + "learning_rate": 6.054179181128589e-07, + "loss": 0.348, + "step": 45660 + }, + { + "epoch": 2.57, + "grad_norm": 4.232173449885258, + "learning_rate": 6.046363851394161e-07, + "loss": 0.3337, + "step": 45665 + }, + { + "epoch": 2.57, + "grad_norm": 5.095511459240964, + "learning_rate": 6.038553244671913e-07, + "loss": 0.3311, + "step": 45670 + }, + { + "epoch": 2.57, + "grad_norm": 4.717930865497621, + "learning_rate": 6.030747361801143e-07, + "loss": 0.3674, + "step": 45675 + }, + { + "epoch": 2.57, + "grad_norm": 4.905351903685142, + "learning_rate": 6.022946203620611e-07, + "loss": 0.3819, + "step": 45680 + }, + { + "epoch": 2.57, + "grad_norm": 4.688057256212101, + "learning_rate": 6.015149770968603e-07, + "loss": 0.3344, + "step": 45685 + }, + { + "epoch": 2.57, + "grad_norm": 4.493660416571188, + "learning_rate": 6.007358064682872e-07, + "loss": 0.3496, + "step": 45690 + }, + { + "epoch": 2.57, + "grad_norm": 5.526337792244437, + "learning_rate": 5.999571085600664e-07, + "loss": 0.3367, + "step": 45695 + }, + { + "epoch": 2.57, + "grad_norm": 4.418054059865418, + "learning_rate": 5.991788834558737e-07, + "loss": 0.3227, + "step": 45700 + }, + { + "epoch": 2.58, + "grad_norm": 4.405627532145009, + "learning_rate": 5.984011312393317e-07, + "loss": 0.3698, + "step": 45705 + }, + { + "epoch": 2.58, + "grad_norm": 4.778899022311874, + "learning_rate": 5.976238519940148e-07, + "loss": 0.357, + "step": 45710 + }, + { + "epoch": 2.58, + "grad_norm": 4.8896373824742, + "learning_rate": 5.968470458034431e-07, + "loss": 0.349, + "step": 45715 + }, + { + "epoch": 2.58, + "grad_norm": 4.529989689394601, + "learning_rate": 5.96070712751089e-07, + "loss": 0.3348, + "step": 45720 + }, + { + "epoch": 2.58, + "grad_norm": 4.7401925352563685, + "learning_rate": 5.952948529203739e-07, + "loss": 0.3746, + "step": 45725 + }, + { + "epoch": 2.58, + "grad_norm": 4.399194053655546, + "learning_rate": 5.945194663946651e-07, + "loss": 0.3388, + "step": 45730 + }, + { + "epoch": 2.58, + "grad_norm": 4.894700584365297, + "learning_rate": 5.937445532572839e-07, + "loss": 0.3295, + "step": 45735 + }, + { + "epoch": 2.58, + "grad_norm": 4.381968416611394, + "learning_rate": 5.929701135914961e-07, + "loss": 0.3339, + "step": 45740 + }, + { + "epoch": 2.58, + "grad_norm": 4.822640331371169, + "learning_rate": 5.921961474805187e-07, + "loss": 0.3622, + "step": 45745 + }, + { + "epoch": 2.58, + "grad_norm": 4.263638447804075, + "learning_rate": 5.914226550075192e-07, + "loss": 0.2818, + "step": 45750 + }, + { + "epoch": 2.58, + "grad_norm": 5.371105809226048, + "learning_rate": 5.906496362556113e-07, + "loss": 0.3414, + "step": 45755 + }, + { + "epoch": 2.58, + "grad_norm": 4.874934446850226, + "learning_rate": 5.898770913078606e-07, + "loss": 0.3285, + "step": 45760 + }, + { + "epoch": 2.58, + "grad_norm": 4.50674168631367, + "learning_rate": 5.891050202472787e-07, + "loss": 0.3216, + "step": 45765 + }, + { + "epoch": 2.58, + "grad_norm": 4.499476756010894, + "learning_rate": 5.883334231568305e-07, + "loss": 0.3397, + "step": 45770 + }, + { + "epoch": 2.58, + "grad_norm": 5.310003041718294, + "learning_rate": 5.875623001194259e-07, + "loss": 0.3647, + "step": 45775 + }, + { + "epoch": 2.58, + "grad_norm": 5.961249246450765, + "learning_rate": 5.86791651217925e-07, + "loss": 0.3353, + "step": 45780 + }, + { + "epoch": 2.58, + "grad_norm": 4.32262916158411, + "learning_rate": 5.860214765351391e-07, + "loss": 0.2882, + "step": 45785 + }, + { + "epoch": 2.58, + "grad_norm": 4.421705242837742, + "learning_rate": 5.852517761538246e-07, + "loss": 0.3322, + "step": 45790 + }, + { + "epoch": 2.58, + "grad_norm": 4.605838844262771, + "learning_rate": 5.84482550156692e-07, + "loss": 0.3156, + "step": 45795 + }, + { + "epoch": 2.58, + "grad_norm": 4.452032822707266, + "learning_rate": 5.837137986263953e-07, + "loss": 0.3134, + "step": 45800 + }, + { + "epoch": 2.58, + "grad_norm": 4.021876932768459, + "learning_rate": 5.82945521645542e-07, + "loss": 0.2999, + "step": 45805 + }, + { + "epoch": 2.58, + "grad_norm": 5.067206429559478, + "learning_rate": 5.82177719296686e-07, + "loss": 0.3067, + "step": 45810 + }, + { + "epoch": 2.58, + "grad_norm": 5.987951345683465, + "learning_rate": 5.814103916623309e-07, + "loss": 0.3455, + "step": 45815 + }, + { + "epoch": 2.58, + "grad_norm": 4.57652436217481, + "learning_rate": 5.806435388249315e-07, + "loss": 0.3402, + "step": 45820 + }, + { + "epoch": 2.58, + "grad_norm": 4.812825272638316, + "learning_rate": 5.798771608668879e-07, + "loss": 0.3352, + "step": 45825 + }, + { + "epoch": 2.58, + "grad_norm": 4.9354679188340835, + "learning_rate": 5.791112578705493e-07, + "loss": 0.3111, + "step": 45830 + }, + { + "epoch": 2.58, + "grad_norm": 5.645811423727055, + "learning_rate": 5.783458299182182e-07, + "loss": 0.3346, + "step": 45835 + }, + { + "epoch": 2.58, + "grad_norm": 4.400523525358847, + "learning_rate": 5.77580877092141e-07, + "loss": 0.362, + "step": 45840 + }, + { + "epoch": 2.58, + "grad_norm": 4.250070510949164, + "learning_rate": 5.768163994745174e-07, + "loss": 0.3336, + "step": 45845 + }, + { + "epoch": 2.58, + "grad_norm": 4.50392073712273, + "learning_rate": 5.760523971474913e-07, + "loss": 0.3264, + "step": 45850 + }, + { + "epoch": 2.58, + "grad_norm": 4.587159037433273, + "learning_rate": 5.752888701931608e-07, + "loss": 0.3125, + "step": 45855 + }, + { + "epoch": 2.58, + "grad_norm": 4.767630603823105, + "learning_rate": 5.745258186935687e-07, + "loss": 0.3303, + "step": 45860 + }, + { + "epoch": 2.58, + "grad_norm": 5.183068224338198, + "learning_rate": 5.737632427307077e-07, + "loss": 0.3465, + "step": 45865 + }, + { + "epoch": 2.58, + "grad_norm": 4.81252838819064, + "learning_rate": 5.730011423865217e-07, + "loss": 0.3482, + "step": 45870 + }, + { + "epoch": 2.58, + "grad_norm": 5.254997130996933, + "learning_rate": 5.722395177428997e-07, + "loss": 0.3291, + "step": 45875 + }, + { + "epoch": 2.59, + "grad_norm": 4.960604300636512, + "learning_rate": 5.714783688816833e-07, + "loss": 0.3475, + "step": 45880 + }, + { + "epoch": 2.59, + "grad_norm": 5.052054771109202, + "learning_rate": 5.707176958846617e-07, + "loss": 0.3386, + "step": 45885 + }, + { + "epoch": 2.59, + "grad_norm": 4.6240626442632085, + "learning_rate": 5.699574988335699e-07, + "loss": 0.3317, + "step": 45890 + }, + { + "epoch": 2.59, + "grad_norm": 5.774035892221966, + "learning_rate": 5.69197777810096e-07, + "loss": 0.3466, + "step": 45895 + }, + { + "epoch": 2.59, + "grad_norm": 5.057198059861786, + "learning_rate": 5.684385328958769e-07, + "loss": 0.3498, + "step": 45900 + }, + { + "epoch": 2.59, + "grad_norm": 4.172782188468089, + "learning_rate": 5.676797641724941e-07, + "loss": 0.3058, + "step": 45905 + }, + { + "epoch": 2.59, + "grad_norm": 4.372762377500973, + "learning_rate": 5.669214717214833e-07, + "loss": 0.3071, + "step": 45910 + }, + { + "epoch": 2.59, + "grad_norm": 5.518003780721541, + "learning_rate": 5.661636556243238e-07, + "loss": 0.3179, + "step": 45915 + }, + { + "epoch": 2.59, + "grad_norm": 4.696054335029156, + "learning_rate": 5.654063159624484e-07, + "loss": 0.3298, + "step": 45920 + }, + { + "epoch": 2.59, + "grad_norm": 4.643356633079536, + "learning_rate": 5.646494528172347e-07, + "loss": 0.3382, + "step": 45925 + }, + { + "epoch": 2.59, + "grad_norm": 4.561951257751429, + "learning_rate": 5.638930662700132e-07, + "loss": 0.3288, + "step": 45930 + }, + { + "epoch": 2.59, + "grad_norm": 5.067931793655128, + "learning_rate": 5.631371564020588e-07, + "loss": 0.341, + "step": 45935 + }, + { + "epoch": 2.59, + "grad_norm": 4.575033423836401, + "learning_rate": 5.623817232945994e-07, + "loss": 0.3251, + "step": 45940 + }, + { + "epoch": 2.59, + "grad_norm": 4.669388142208612, + "learning_rate": 5.616267670288084e-07, + "loss": 0.2978, + "step": 45945 + }, + { + "epoch": 2.59, + "grad_norm": 4.50147579082452, + "learning_rate": 5.60872287685808e-07, + "loss": 0.3579, + "step": 45950 + }, + { + "epoch": 2.59, + "grad_norm": 4.6900589118001035, + "learning_rate": 5.60118285346673e-07, + "loss": 0.3205, + "step": 45955 + }, + { + "epoch": 2.59, + "grad_norm": 5.023171376558399, + "learning_rate": 5.593647600924218e-07, + "loss": 0.3466, + "step": 45960 + }, + { + "epoch": 2.59, + "grad_norm": 4.471142160791876, + "learning_rate": 5.586117120040263e-07, + "loss": 0.3089, + "step": 45965 + }, + { + "epoch": 2.59, + "grad_norm": 7.117263786411965, + "learning_rate": 5.578591411624034e-07, + "loss": 0.3277, + "step": 45970 + }, + { + "epoch": 2.59, + "grad_norm": 5.581250528514581, + "learning_rate": 5.571070476484197e-07, + "loss": 0.334, + "step": 45975 + }, + { + "epoch": 2.59, + "grad_norm": 4.350734881273643, + "learning_rate": 5.563554315428926e-07, + "loss": 0.3265, + "step": 45980 + }, + { + "epoch": 2.59, + "grad_norm": 4.2934887443968615, + "learning_rate": 5.556042929265848e-07, + "loss": 0.3162, + "step": 45985 + }, + { + "epoch": 2.59, + "grad_norm": 4.192878975313347, + "learning_rate": 5.548536318802106e-07, + "loss": 0.2995, + "step": 45990 + }, + { + "epoch": 2.59, + "grad_norm": 4.691314209883258, + "learning_rate": 5.54103448484432e-07, + "loss": 0.3475, + "step": 45995 + }, + { + "epoch": 2.59, + "grad_norm": 5.148991913722001, + "learning_rate": 5.533537428198582e-07, + "loss": 0.3118, + "step": 46000 + }, + { + "epoch": 2.59, + "grad_norm": 4.674358334706993, + "learning_rate": 5.526045149670506e-07, + "loss": 0.3397, + "step": 46005 + }, + { + "epoch": 2.59, + "grad_norm": 4.8153034598221245, + "learning_rate": 5.518557650065149e-07, + "loss": 0.3446, + "step": 46010 + }, + { + "epoch": 2.59, + "grad_norm": 4.329807125740926, + "learning_rate": 5.511074930187088e-07, + "loss": 0.351, + "step": 46015 + }, + { + "epoch": 2.59, + "grad_norm": 4.416949608034508, + "learning_rate": 5.503596990840359e-07, + "loss": 0.3168, + "step": 46020 + }, + { + "epoch": 2.59, + "grad_norm": 4.514112255790046, + "learning_rate": 5.496123832828526e-07, + "loss": 0.3263, + "step": 46025 + }, + { + "epoch": 2.59, + "grad_norm": 4.7644315304352265, + "learning_rate": 5.4886554569546e-07, + "loss": 0.3599, + "step": 46030 + }, + { + "epoch": 2.59, + "grad_norm": 4.898275421218794, + "learning_rate": 5.481191864021074e-07, + "loss": 0.3232, + "step": 46035 + }, + { + "epoch": 2.59, + "grad_norm": 5.717841109761777, + "learning_rate": 5.473733054829971e-07, + "loss": 0.3429, + "step": 46040 + }, + { + "epoch": 2.59, + "grad_norm": 3.8288551452757296, + "learning_rate": 5.466279030182747e-07, + "loss": 0.2774, + "step": 46045 + }, + { + "epoch": 2.59, + "grad_norm": 5.869427798765506, + "learning_rate": 5.458829790880399e-07, + "loss": 0.3592, + "step": 46050 + }, + { + "epoch": 2.59, + "grad_norm": 7.413220163693086, + "learning_rate": 5.451385337723364e-07, + "loss": 0.3292, + "step": 46055 + }, + { + "epoch": 2.6, + "grad_norm": 4.690746837620549, + "learning_rate": 5.443945671511569e-07, + "loss": 0.3145, + "step": 46060 + }, + { + "epoch": 2.6, + "grad_norm": 4.545457596969626, + "learning_rate": 5.436510793044464e-07, + "loss": 0.3319, + "step": 46065 + }, + { + "epoch": 2.6, + "grad_norm": 4.308901417366394, + "learning_rate": 5.429080703120937e-07, + "loss": 0.3173, + "step": 46070 + }, + { + "epoch": 2.6, + "grad_norm": 4.065855482863909, + "learning_rate": 5.421655402539405e-07, + "loss": 0.3087, + "step": 46075 + }, + { + "epoch": 2.6, + "grad_norm": 5.242778617409167, + "learning_rate": 5.414234892097731e-07, + "loss": 0.3292, + "step": 46080 + }, + { + "epoch": 2.6, + "grad_norm": 4.498659647638434, + "learning_rate": 5.406819172593292e-07, + "loss": 0.3348, + "step": 46085 + }, + { + "epoch": 2.6, + "grad_norm": 4.694945929852806, + "learning_rate": 5.399408244822946e-07, + "loss": 0.3315, + "step": 46090 + }, + { + "epoch": 2.6, + "grad_norm": 4.756773699081069, + "learning_rate": 5.392002109583006e-07, + "loss": 0.33, + "step": 46095 + }, + { + "epoch": 2.6, + "grad_norm": 4.820117598788975, + "learning_rate": 5.384600767669324e-07, + "loss": 0.3182, + "step": 46100 + }, + { + "epoch": 2.6, + "grad_norm": 4.385376533448734, + "learning_rate": 5.377204219877186e-07, + "loss": 0.3074, + "step": 46105 + }, + { + "epoch": 2.6, + "grad_norm": 4.957435969071599, + "learning_rate": 5.369812467001395e-07, + "loss": 0.3437, + "step": 46110 + }, + { + "epoch": 2.6, + "grad_norm": 4.4331366581863065, + "learning_rate": 5.362425509836222e-07, + "loss": 0.3089, + "step": 46115 + }, + { + "epoch": 2.6, + "grad_norm": 4.16092374237919, + "learning_rate": 5.35504334917542e-07, + "loss": 0.3285, + "step": 46120 + }, + { + "epoch": 2.6, + "grad_norm": 7.583005746145502, + "learning_rate": 5.347665985812255e-07, + "loss": 0.3172, + "step": 46125 + }, + { + "epoch": 2.6, + "grad_norm": 4.862785861064724, + "learning_rate": 5.340293420539433e-07, + "loss": 0.317, + "step": 46130 + }, + { + "epoch": 2.6, + "grad_norm": 4.497611114307605, + "learning_rate": 5.332925654149191e-07, + "loss": 0.3123, + "step": 46135 + }, + { + "epoch": 2.6, + "grad_norm": 5.290820388737501, + "learning_rate": 5.325562687433222e-07, + "loss": 0.333, + "step": 46140 + }, + { + "epoch": 2.6, + "grad_norm": 4.978808729520784, + "learning_rate": 5.318204521182691e-07, + "loss": 0.3637, + "step": 46145 + }, + { + "epoch": 2.6, + "grad_norm": 4.884145986032276, + "learning_rate": 5.310851156188296e-07, + "loss": 0.316, + "step": 46150 + }, + { + "epoch": 2.6, + "grad_norm": 4.803269631029532, + "learning_rate": 5.303502593240156e-07, + "loss": 0.3391, + "step": 46155 + }, + { + "epoch": 2.6, + "grad_norm": 5.250440770145616, + "learning_rate": 5.296158833127935e-07, + "loss": 0.3359, + "step": 46160 + }, + { + "epoch": 2.6, + "grad_norm": 4.597864642390691, + "learning_rate": 5.288819876640733e-07, + "loss": 0.3356, + "step": 46165 + }, + { + "epoch": 2.6, + "grad_norm": 5.169458501789328, + "learning_rate": 5.28148572456717e-07, + "loss": 0.3477, + "step": 46170 + }, + { + "epoch": 2.6, + "grad_norm": 4.554852646821307, + "learning_rate": 5.274156377695311e-07, + "loss": 0.3162, + "step": 46175 + }, + { + "epoch": 2.6, + "grad_norm": 4.7438969147168395, + "learning_rate": 5.266831836812742e-07, + "loss": 0.3324, + "step": 46180 + }, + { + "epoch": 2.6, + "grad_norm": 5.004805065972454, + "learning_rate": 5.259512102706521e-07, + "loss": 0.3372, + "step": 46185 + }, + { + "epoch": 2.6, + "grad_norm": 4.724986294140625, + "learning_rate": 5.252197176163176e-07, + "loss": 0.3584, + "step": 46190 + }, + { + "epoch": 2.6, + "grad_norm": 5.030369466747653, + "learning_rate": 5.244887057968739e-07, + "loss": 0.3206, + "step": 46195 + }, + { + "epoch": 2.6, + "grad_norm": 4.764662918969909, + "learning_rate": 5.237581748908705e-07, + "loss": 0.3224, + "step": 46200 + }, + { + "epoch": 2.6, + "grad_norm": 4.597006049270838, + "learning_rate": 5.230281249768055e-07, + "loss": 0.3502, + "step": 46205 + }, + { + "epoch": 2.6, + "grad_norm": 4.372302759834244, + "learning_rate": 5.222985561331279e-07, + "loss": 0.3243, + "step": 46210 + }, + { + "epoch": 2.6, + "grad_norm": 5.488255978863521, + "learning_rate": 5.215694684382305e-07, + "loss": 0.3318, + "step": 46215 + }, + { + "epoch": 2.6, + "grad_norm": 5.2101604116508184, + "learning_rate": 5.208408619704602e-07, + "loss": 0.3551, + "step": 46220 + }, + { + "epoch": 2.6, + "grad_norm": 4.484927201361623, + "learning_rate": 5.201127368081072e-07, + "loss": 0.3254, + "step": 46225 + }, + { + "epoch": 2.6, + "grad_norm": 4.812075795884978, + "learning_rate": 5.193850930294109e-07, + "loss": 0.3521, + "step": 46230 + }, + { + "epoch": 2.61, + "grad_norm": 4.227904412365689, + "learning_rate": 5.186579307125617e-07, + "loss": 0.3233, + "step": 46235 + }, + { + "epoch": 2.61, + "grad_norm": 4.48733536905027, + "learning_rate": 5.179312499356942e-07, + "loss": 0.3517, + "step": 46240 + }, + { + "epoch": 2.61, + "grad_norm": 4.9012095004225715, + "learning_rate": 5.172050507768961e-07, + "loss": 0.3212, + "step": 46245 + }, + { + "epoch": 2.61, + "grad_norm": 5.213389781683783, + "learning_rate": 5.164793333141982e-07, + "loss": 0.3478, + "step": 46250 + }, + { + "epoch": 2.61, + "grad_norm": 4.549484073230738, + "learning_rate": 5.157540976255848e-07, + "loss": 0.3423, + "step": 46255 + }, + { + "epoch": 2.61, + "grad_norm": 5.7062223505257474, + "learning_rate": 5.150293437889825e-07, + "loss": 0.32, + "step": 46260 + }, + { + "epoch": 2.61, + "grad_norm": 4.482824436298924, + "learning_rate": 5.143050718822723e-07, + "loss": 0.3258, + "step": 46265 + }, + { + "epoch": 2.61, + "grad_norm": 5.460343629837303, + "learning_rate": 5.135812819832775e-07, + "loss": 0.308, + "step": 46270 + }, + { + "epoch": 2.61, + "grad_norm": 5.612973732820175, + "learning_rate": 5.128579741697742e-07, + "loss": 0.3001, + "step": 46275 + }, + { + "epoch": 2.61, + "grad_norm": 4.801668871586329, + "learning_rate": 5.121351485194864e-07, + "loss": 0.3431, + "step": 46280 + }, + { + "epoch": 2.61, + "grad_norm": 4.711319975233357, + "learning_rate": 5.114128051100825e-07, + "loss": 0.3386, + "step": 46285 + }, + { + "epoch": 2.61, + "grad_norm": 4.656998448073274, + "learning_rate": 5.10690944019182e-07, + "loss": 0.3205, + "step": 46290 + }, + { + "epoch": 2.61, + "grad_norm": 4.47779574776888, + "learning_rate": 5.099695653243531e-07, + "loss": 0.3251, + "step": 46295 + }, + { + "epoch": 2.61, + "grad_norm": 5.191342791615926, + "learning_rate": 5.092486691031095e-07, + "loss": 0.3344, + "step": 46300 + }, + { + "epoch": 2.61, + "grad_norm": 4.7625385521267605, + "learning_rate": 5.085282554329163e-07, + "loss": 0.3457, + "step": 46305 + }, + { + "epoch": 2.61, + "grad_norm": 4.3258968327760146, + "learning_rate": 5.078083243911846e-07, + "loss": 0.3167, + "step": 46310 + }, + { + "epoch": 2.61, + "grad_norm": 6.728115614650199, + "learning_rate": 5.070888760552733e-07, + "loss": 0.3193, + "step": 46315 + }, + { + "epoch": 2.61, + "grad_norm": 4.710623912333051, + "learning_rate": 5.063699105024916e-07, + "loss": 0.3463, + "step": 46320 + }, + { + "epoch": 2.61, + "grad_norm": 4.868176111442243, + "learning_rate": 5.056514278100943e-07, + "loss": 0.3337, + "step": 46325 + }, + { + "epoch": 2.61, + "grad_norm": 4.897804826606414, + "learning_rate": 5.04933428055287e-07, + "loss": 0.3413, + "step": 46330 + }, + { + "epoch": 2.61, + "grad_norm": 4.48526100218459, + "learning_rate": 5.042159113152201e-07, + "loss": 0.3398, + "step": 46335 + }, + { + "epoch": 2.61, + "grad_norm": 5.179763745121832, + "learning_rate": 5.034988776669958e-07, + "loss": 0.3351, + "step": 46340 + }, + { + "epoch": 2.61, + "grad_norm": 4.451738688720434, + "learning_rate": 5.02782327187662e-07, + "loss": 0.3259, + "step": 46345 + }, + { + "epoch": 2.61, + "grad_norm": 5.330320228509962, + "learning_rate": 5.020662599542137e-07, + "loss": 0.3317, + "step": 46350 + }, + { + "epoch": 2.61, + "grad_norm": 5.0568547250309175, + "learning_rate": 5.013506760435966e-07, + "loss": 0.3104, + "step": 46355 + }, + { + "epoch": 2.61, + "grad_norm": 5.541983116924022, + "learning_rate": 5.006355755327041e-07, + "loss": 0.3397, + "step": 46360 + }, + { + "epoch": 2.61, + "grad_norm": 4.547305181395205, + "learning_rate": 4.999209584983755e-07, + "loss": 0.335, + "step": 46365 + }, + { + "epoch": 2.61, + "grad_norm": 4.776630827040175, + "learning_rate": 4.992068250174014e-07, + "loss": 0.3363, + "step": 46370 + }, + { + "epoch": 2.61, + "grad_norm": 5.068470514595292, + "learning_rate": 4.984931751665167e-07, + "loss": 0.3185, + "step": 46375 + }, + { + "epoch": 2.61, + "grad_norm": 4.344035870284013, + "learning_rate": 4.977800090224078e-07, + "loss": 0.3338, + "step": 46380 + }, + { + "epoch": 2.61, + "grad_norm": 4.781788711941363, + "learning_rate": 4.970673266617055e-07, + "loss": 0.3219, + "step": 46385 + }, + { + "epoch": 2.61, + "grad_norm": 4.230395607717361, + "learning_rate": 4.96355128160993e-07, + "loss": 0.3348, + "step": 46390 + }, + { + "epoch": 2.61, + "grad_norm": 4.450801750708208, + "learning_rate": 4.95643413596798e-07, + "loss": 0.3466, + "step": 46395 + }, + { + "epoch": 2.61, + "grad_norm": 4.648472660225281, + "learning_rate": 4.949321830455966e-07, + "loss": 0.3337, + "step": 46400 + }, + { + "epoch": 2.61, + "grad_norm": 4.438694675231268, + "learning_rate": 4.942214365838155e-07, + "loss": 0.3145, + "step": 46405 + }, + { + "epoch": 2.61, + "grad_norm": 5.364386738114903, + "learning_rate": 4.935111742878257e-07, + "loss": 0.3491, + "step": 46410 + }, + { + "epoch": 2.62, + "grad_norm": 5.074838813178466, + "learning_rate": 4.928013962339495e-07, + "loss": 0.3263, + "step": 46415 + }, + { + "epoch": 2.62, + "grad_norm": 4.774070797456436, + "learning_rate": 4.920921024984543e-07, + "loss": 0.3414, + "step": 46420 + }, + { + "epoch": 2.62, + "grad_norm": 5.153494379702369, + "learning_rate": 4.913832931575585e-07, + "loss": 0.2957, + "step": 46425 + }, + { + "epoch": 2.62, + "grad_norm": 4.746596480095154, + "learning_rate": 4.906749682874257e-07, + "loss": 0.3139, + "step": 46430 + }, + { + "epoch": 2.62, + "grad_norm": 4.9289718278731165, + "learning_rate": 4.899671279641682e-07, + "loss": 0.3287, + "step": 46435 + }, + { + "epoch": 2.62, + "grad_norm": 4.1760119861251415, + "learning_rate": 4.892597722638476e-07, + "loss": 0.344, + "step": 46440 + }, + { + "epoch": 2.62, + "grad_norm": 4.576907565430694, + "learning_rate": 4.885529012624713e-07, + "loss": 0.322, + "step": 46445 + }, + { + "epoch": 2.62, + "grad_norm": 4.912295010945285, + "learning_rate": 4.878465150359962e-07, + "loss": 0.3272, + "step": 46450 + }, + { + "epoch": 2.62, + "grad_norm": 6.02785863344295, + "learning_rate": 4.871406136603279e-07, + "loss": 0.3484, + "step": 46455 + }, + { + "epoch": 2.62, + "grad_norm": 4.395925493871996, + "learning_rate": 4.864351972113163e-07, + "loss": 0.3187, + "step": 46460 + }, + { + "epoch": 2.62, + "grad_norm": 5.405042994822878, + "learning_rate": 4.857302657647633e-07, + "loss": 0.3206, + "step": 46465 + }, + { + "epoch": 2.62, + "grad_norm": 4.565139131974331, + "learning_rate": 4.850258193964152e-07, + "loss": 0.3281, + "step": 46470 + }, + { + "epoch": 2.62, + "grad_norm": 4.404712308416535, + "learning_rate": 4.843218581819703e-07, + "loss": 0.3524, + "step": 46475 + }, + { + "epoch": 2.62, + "grad_norm": 5.0384397446786275, + "learning_rate": 4.836183821970708e-07, + "loss": 0.3484, + "step": 46480 + }, + { + "epoch": 2.62, + "grad_norm": 4.936290176541444, + "learning_rate": 4.829153915173074e-07, + "loss": 0.3112, + "step": 46485 + }, + { + "epoch": 2.62, + "grad_norm": 4.399157552521201, + "learning_rate": 4.822128862182218e-07, + "loss": 0.3077, + "step": 46490 + }, + { + "epoch": 2.62, + "grad_norm": 4.886471784660722, + "learning_rate": 4.815108663752988e-07, + "loss": 0.3339, + "step": 46495 + }, + { + "epoch": 2.62, + "grad_norm": 4.611838625994117, + "learning_rate": 4.808093320639762e-07, + "loss": 0.3332, + "step": 46500 + }, + { + "epoch": 2.62, + "grad_norm": 4.615169741809314, + "learning_rate": 4.801082833596344e-07, + "loss": 0.327, + "step": 46505 + }, + { + "epoch": 2.62, + "grad_norm": 6.705207984017148, + "learning_rate": 4.794077203376063e-07, + "loss": 0.3373, + "step": 46510 + }, + { + "epoch": 2.62, + "grad_norm": 5.54452367481361, + "learning_rate": 4.787076430731696e-07, + "loss": 0.3451, + "step": 46515 + }, + { + "epoch": 2.62, + "grad_norm": 4.528245726811377, + "learning_rate": 4.780080516415497e-07, + "loss": 0.3445, + "step": 46520 + }, + { + "epoch": 2.62, + "grad_norm": 4.643215275404237, + "learning_rate": 4.773089461179226e-07, + "loss": 0.3437, + "step": 46525 + }, + { + "epoch": 2.62, + "grad_norm": 4.265512148486137, + "learning_rate": 4.76610326577408e-07, + "loss": 0.342, + "step": 46530 + }, + { + "epoch": 2.62, + "grad_norm": 4.445016881382162, + "learning_rate": 4.7591219309507886e-07, + "loss": 0.3369, + "step": 46535 + }, + { + "epoch": 2.62, + "grad_norm": 4.721337830790504, + "learning_rate": 4.752145457459495e-07, + "loss": 0.3187, + "step": 46540 + }, + { + "epoch": 2.62, + "grad_norm": 5.354957394363971, + "learning_rate": 4.7451738460498664e-07, + "loss": 0.3251, + "step": 46545 + }, + { + "epoch": 2.62, + "grad_norm": 4.767370799936989, + "learning_rate": 4.7382070974710426e-07, + "loss": 0.3112, + "step": 46550 + }, + { + "epoch": 2.62, + "grad_norm": 4.415702968447362, + "learning_rate": 4.731245212471608e-07, + "loss": 0.3086, + "step": 46555 + }, + { + "epoch": 2.62, + "grad_norm": 4.533344978473302, + "learning_rate": 4.724288191799675e-07, + "loss": 0.3226, + "step": 46560 + }, + { + "epoch": 2.62, + "grad_norm": 5.99179213634612, + "learning_rate": 4.717336036202785e-07, + "loss": 0.3524, + "step": 46565 + }, + { + "epoch": 2.62, + "grad_norm": 4.707278535069128, + "learning_rate": 4.7103887464279907e-07, + "loss": 0.3235, + "step": 46570 + }, + { + "epoch": 2.62, + "grad_norm": 4.751202072124831, + "learning_rate": 4.703446323221811e-07, + "loss": 0.3125, + "step": 46575 + }, + { + "epoch": 2.62, + "grad_norm": 4.664623967429161, + "learning_rate": 4.6965087673302155e-07, + "loss": 0.3242, + "step": 46580 + }, + { + "epoch": 2.62, + "grad_norm": 4.453480376202116, + "learning_rate": 4.689576079498703e-07, + "loss": 0.352, + "step": 46585 + }, + { + "epoch": 2.63, + "grad_norm": 5.418918181892823, + "learning_rate": 4.6826482604722057e-07, + "loss": 0.373, + "step": 46590 + }, + { + "epoch": 2.63, + "grad_norm": 5.493629490574223, + "learning_rate": 4.6757253109951607e-07, + "loss": 0.3284, + "step": 46595 + }, + { + "epoch": 2.63, + "grad_norm": 4.269964245495313, + "learning_rate": 4.668807231811462e-07, + "loss": 0.3288, + "step": 46600 + }, + { + "epoch": 2.63, + "grad_norm": 4.53863055003813, + "learning_rate": 4.661894023664482e-07, + "loss": 0.3379, + "step": 46605 + }, + { + "epoch": 2.63, + "grad_norm": 4.255523569854855, + "learning_rate": 4.654985687297087e-07, + "loss": 0.2973, + "step": 46610 + }, + { + "epoch": 2.63, + "grad_norm": 4.526356223682446, + "learning_rate": 4.648082223451594e-07, + "loss": 0.3266, + "step": 46615 + }, + { + "epoch": 2.63, + "grad_norm": 4.421706598087153, + "learning_rate": 4.641183632869828e-07, + "loss": 0.3049, + "step": 46620 + }, + { + "epoch": 2.63, + "grad_norm": 4.573584685497384, + "learning_rate": 4.6342899162930557e-07, + "loss": 0.3584, + "step": 46625 + }, + { + "epoch": 2.63, + "grad_norm": 4.687356946628921, + "learning_rate": 4.6274010744620567e-07, + "loss": 0.3014, + "step": 46630 + }, + { + "epoch": 2.63, + "grad_norm": 4.136176011071282, + "learning_rate": 4.620517108117045e-07, + "loss": 0.2939, + "step": 46635 + }, + { + "epoch": 2.63, + "grad_norm": 4.316344056456715, + "learning_rate": 4.6136380179977456e-07, + "loss": 0.3301, + "step": 46640 + }, + { + "epoch": 2.63, + "grad_norm": 5.3163159087472724, + "learning_rate": 4.6067638048433616e-07, + "loss": 0.3377, + "step": 46645 + }, + { + "epoch": 2.63, + "grad_norm": 4.459929959955323, + "learning_rate": 4.599894469392524e-07, + "loss": 0.3122, + "step": 46650 + }, + { + "epoch": 2.63, + "grad_norm": 4.416213608940536, + "learning_rate": 4.593030012383409e-07, + "loss": 0.3132, + "step": 46655 + }, + { + "epoch": 2.63, + "grad_norm": 4.687807797740529, + "learning_rate": 4.586170434553611e-07, + "loss": 0.3252, + "step": 46660 + }, + { + "epoch": 2.63, + "grad_norm": 5.107190719012708, + "learning_rate": 4.5793157366402165e-07, + "loss": 0.3048, + "step": 46665 + }, + { + "epoch": 2.63, + "grad_norm": 4.956873437855812, + "learning_rate": 4.572465919379815e-07, + "loss": 0.3359, + "step": 46670 + }, + { + "epoch": 2.63, + "grad_norm": 4.735126849106469, + "learning_rate": 4.565620983508423e-07, + "loss": 0.3672, + "step": 46675 + }, + { + "epoch": 2.63, + "grad_norm": 4.472038017751582, + "learning_rate": 4.558780929761586e-07, + "loss": 0.3004, + "step": 46680 + }, + { + "epoch": 2.63, + "grad_norm": 4.6455324004840675, + "learning_rate": 4.5519457588742876e-07, + "loss": 0.3655, + "step": 46685 + }, + { + "epoch": 2.63, + "grad_norm": 4.675128394429441, + "learning_rate": 4.5451154715809855e-07, + "loss": 0.3308, + "step": 46690 + }, + { + "epoch": 2.63, + "grad_norm": 4.246548711309785, + "learning_rate": 4.538290068615636e-07, + "loss": 0.3331, + "step": 46695 + }, + { + "epoch": 2.63, + "grad_norm": 4.2548599837512375, + "learning_rate": 4.5314695507116535e-07, + "loss": 0.2764, + "step": 46700 + }, + { + "epoch": 2.63, + "grad_norm": 4.7598452175108985, + "learning_rate": 4.5246539186019455e-07, + "loss": 0.3209, + "step": 46705 + }, + { + "epoch": 2.63, + "grad_norm": 4.525337179491603, + "learning_rate": 4.517843173018871e-07, + "loss": 0.3056, + "step": 46710 + }, + { + "epoch": 2.63, + "grad_norm": 5.403329916359623, + "learning_rate": 4.5110373146942666e-07, + "loss": 0.3364, + "step": 46715 + }, + { + "epoch": 2.63, + "grad_norm": 4.586314300497535, + "learning_rate": 4.504236344359458e-07, + "loss": 0.3289, + "step": 46720 + }, + { + "epoch": 2.63, + "grad_norm": 5.279324615183544, + "learning_rate": 4.4974402627452565e-07, + "loss": 0.3424, + "step": 46725 + }, + { + "epoch": 2.63, + "grad_norm": 4.547243659490779, + "learning_rate": 4.490649070581904e-07, + "loss": 0.3025, + "step": 46730 + }, + { + "epoch": 2.63, + "grad_norm": 4.927879511934787, + "learning_rate": 4.4838627685991563e-07, + "loss": 0.3238, + "step": 46735 + }, + { + "epoch": 2.63, + "grad_norm": 4.6418661930447636, + "learning_rate": 4.477081357526242e-07, + "loss": 0.3417, + "step": 46740 + }, + { + "epoch": 2.63, + "grad_norm": 4.328746805770058, + "learning_rate": 4.4703048380918434e-07, + "loss": 0.3315, + "step": 46745 + }, + { + "epoch": 2.63, + "grad_norm": 4.469255223881732, + "learning_rate": 4.463533211024118e-07, + "loss": 0.2829, + "step": 46750 + }, + { + "epoch": 2.63, + "grad_norm": 4.294176882946335, + "learning_rate": 4.456766477050728e-07, + "loss": 0.3192, + "step": 46755 + }, + { + "epoch": 2.63, + "grad_norm": 7.866310515172893, + "learning_rate": 4.450004636898769e-07, + "loss": 0.3148, + "step": 46760 + }, + { + "epoch": 2.63, + "grad_norm": 4.792377233032178, + "learning_rate": 4.4432476912948496e-07, + "loss": 0.3386, + "step": 46765 + }, + { + "epoch": 2.64, + "grad_norm": 4.222320133431159, + "learning_rate": 4.436495640965022e-07, + "loss": 0.36, + "step": 46770 + }, + { + "epoch": 2.64, + "grad_norm": 5.149002208127003, + "learning_rate": 4.429748486634811e-07, + "loss": 0.3481, + "step": 46775 + }, + { + "epoch": 2.64, + "grad_norm": 4.3096026730296835, + "learning_rate": 4.4230062290292586e-07, + "loss": 0.3084, + "step": 46780 + }, + { + "epoch": 2.64, + "grad_norm": 4.3440879972361515, + "learning_rate": 4.4162688688728194e-07, + "loss": 0.3442, + "step": 46785 + }, + { + "epoch": 2.64, + "grad_norm": 4.76826264700687, + "learning_rate": 4.4095364068894755e-07, + "loss": 0.3541, + "step": 46790 + }, + { + "epoch": 2.64, + "grad_norm": 4.85168291242625, + "learning_rate": 4.402808843802653e-07, + "loss": 0.3164, + "step": 46795 + }, + { + "epoch": 2.64, + "grad_norm": 4.88661915370977, + "learning_rate": 4.3960861803352463e-07, + "loss": 0.3345, + "step": 46800 + }, + { + "epoch": 2.64, + "grad_norm": 4.421894520432229, + "learning_rate": 4.389368417209655e-07, + "loss": 0.3303, + "step": 46805 + }, + { + "epoch": 2.64, + "grad_norm": 4.668288951826391, + "learning_rate": 4.3826555551477135e-07, + "loss": 0.3409, + "step": 46810 + }, + { + "epoch": 2.64, + "grad_norm": 4.513252889281303, + "learning_rate": 4.3759475948707554e-07, + "loss": 0.3097, + "step": 46815 + }, + { + "epoch": 2.64, + "grad_norm": 5.114137039029897, + "learning_rate": 4.3692445370995875e-07, + "loss": 0.3519, + "step": 46820 + }, + { + "epoch": 2.64, + "grad_norm": 5.072764221298067, + "learning_rate": 4.3625463825544777e-07, + "loss": 0.292, + "step": 46825 + }, + { + "epoch": 2.64, + "grad_norm": 4.560733578451888, + "learning_rate": 4.355853131955173e-07, + "loss": 0.3347, + "step": 46830 + }, + { + "epoch": 2.64, + "grad_norm": 4.779777260598413, + "learning_rate": 4.3491647860208877e-07, + "loss": 0.321, + "step": 46835 + }, + { + "epoch": 2.64, + "grad_norm": 4.687178481559941, + "learning_rate": 4.3424813454703285e-07, + "loss": 0.3136, + "step": 46840 + }, + { + "epoch": 2.64, + "grad_norm": 4.475830281431158, + "learning_rate": 4.335802811021644e-07, + "loss": 0.2911, + "step": 46845 + }, + { + "epoch": 2.64, + "grad_norm": 4.479512983952609, + "learning_rate": 4.329129183392483e-07, + "loss": 0.2995, + "step": 46850 + }, + { + "epoch": 2.64, + "grad_norm": 6.822087055753844, + "learning_rate": 4.3224604632999543e-07, + "loss": 0.3124, + "step": 46855 + }, + { + "epoch": 2.64, + "grad_norm": 5.655909129757384, + "learning_rate": 4.3157966514606295e-07, + "loss": 0.3021, + "step": 46860 + }, + { + "epoch": 2.64, + "grad_norm": 4.529545803506113, + "learning_rate": 4.309137748590581e-07, + "loss": 0.2858, + "step": 46865 + }, + { + "epoch": 2.64, + "grad_norm": 4.256877250547265, + "learning_rate": 4.3024837554053234e-07, + "loss": 0.3019, + "step": 46870 + }, + { + "epoch": 2.64, + "grad_norm": 4.345462366679429, + "learning_rate": 4.2958346726198754e-07, + "loss": 0.3902, + "step": 46875 + }, + { + "epoch": 2.64, + "grad_norm": 4.630696105087748, + "learning_rate": 4.289190500948698e-07, + "loss": 0.3299, + "step": 46880 + }, + { + "epoch": 2.64, + "grad_norm": 4.8876201973539155, + "learning_rate": 4.282551241105731e-07, + "loss": 0.3342, + "step": 46885 + }, + { + "epoch": 2.64, + "grad_norm": 4.424498315357881, + "learning_rate": 4.2759168938044105e-07, + "loss": 0.336, + "step": 46890 + }, + { + "epoch": 2.64, + "grad_norm": 4.799727148662146, + "learning_rate": 4.269287459757604e-07, + "loss": 0.3308, + "step": 46895 + }, + { + "epoch": 2.64, + "grad_norm": 4.187968502330678, + "learning_rate": 4.262662939677692e-07, + "loss": 0.3175, + "step": 46900 + }, + { + "epoch": 2.64, + "grad_norm": 4.715993452326925, + "learning_rate": 4.256043334276494e-07, + "loss": 0.3065, + "step": 46905 + }, + { + "epoch": 2.64, + "grad_norm": 4.771388550950795, + "learning_rate": 4.2494286442653245e-07, + "loss": 0.3536, + "step": 46910 + }, + { + "epoch": 2.64, + "grad_norm": 4.868514320748043, + "learning_rate": 4.24281887035497e-07, + "loss": 0.3239, + "step": 46915 + }, + { + "epoch": 2.64, + "grad_norm": 4.562426506289905, + "learning_rate": 4.236214013255657e-07, + "loss": 0.3238, + "step": 46920 + }, + { + "epoch": 2.64, + "grad_norm": 4.947891848815056, + "learning_rate": 4.22961407367713e-07, + "loss": 0.3429, + "step": 46925 + }, + { + "epoch": 2.64, + "grad_norm": 4.446925438033762, + "learning_rate": 4.2230190523285586e-07, + "loss": 0.2965, + "step": 46930 + }, + { + "epoch": 2.64, + "grad_norm": 4.79046869026732, + "learning_rate": 4.2164289499186327e-07, + "loss": 0.3009, + "step": 46935 + }, + { + "epoch": 2.64, + "grad_norm": 4.84656612305597, + "learning_rate": 4.209843767155475e-07, + "loss": 0.3524, + "step": 46940 + }, + { + "epoch": 2.65, + "grad_norm": 4.612128800859239, + "learning_rate": 4.2032635047466795e-07, + "loss": 0.3391, + "step": 46945 + }, + { + "epoch": 2.65, + "grad_norm": 5.211833582382217, + "learning_rate": 4.1966881633993485e-07, + "loss": 0.3058, + "step": 46950 + }, + { + "epoch": 2.65, + "grad_norm": 3.996857196530111, + "learning_rate": 4.1901177438200103e-07, + "loss": 0.2945, + "step": 46955 + }, + { + "epoch": 2.65, + "grad_norm": 4.858078115209991, + "learning_rate": 4.1835522467147005e-07, + "loss": 0.3363, + "step": 46960 + }, + { + "epoch": 2.65, + "grad_norm": 4.490565710898016, + "learning_rate": 4.176991672788899e-07, + "loss": 0.2949, + "step": 46965 + }, + { + "epoch": 2.65, + "grad_norm": 4.489166325576985, + "learning_rate": 4.1704360227475814e-07, + "loss": 0.3116, + "step": 46970 + }, + { + "epoch": 2.65, + "grad_norm": 4.51153850421733, + "learning_rate": 4.163885297295178e-07, + "loss": 0.3105, + "step": 46975 + }, + { + "epoch": 2.65, + "grad_norm": 4.807365501389042, + "learning_rate": 4.157339497135582e-07, + "loss": 0.3112, + "step": 46980 + }, + { + "epoch": 2.65, + "grad_norm": 4.516339470155919, + "learning_rate": 4.15079862297218e-07, + "loss": 0.3238, + "step": 46985 + }, + { + "epoch": 2.65, + "grad_norm": 4.889724706848004, + "learning_rate": 4.1442626755078095e-07, + "loss": 0.3237, + "step": 46990 + }, + { + "epoch": 2.65, + "grad_norm": 4.885939649506536, + "learning_rate": 4.137731655444799e-07, + "loss": 0.3177, + "step": 46995 + }, + { + "epoch": 2.65, + "grad_norm": 4.767865505475133, + "learning_rate": 4.1312055634849137e-07, + "loss": 0.3126, + "step": 47000 + }, + { + "epoch": 2.65, + "grad_norm": 5.925559666567569, + "learning_rate": 4.1246844003294317e-07, + "loss": 0.3316, + "step": 47005 + }, + { + "epoch": 2.65, + "grad_norm": 5.789895457295862, + "learning_rate": 4.1181681666790817e-07, + "loss": 0.3124, + "step": 47010 + }, + { + "epoch": 2.65, + "grad_norm": 4.48421426232905, + "learning_rate": 4.1116568632340424e-07, + "loss": 0.3126, + "step": 47015 + }, + { + "epoch": 2.65, + "grad_norm": 4.780515250240848, + "learning_rate": 4.105150490694004e-07, + "loss": 0.3327, + "step": 47020 + }, + { + "epoch": 2.65, + "grad_norm": 5.373610834287886, + "learning_rate": 4.098649049758097e-07, + "loss": 0.3343, + "step": 47025 + }, + { + "epoch": 2.65, + "grad_norm": 4.302592901513993, + "learning_rate": 4.0921525411249164e-07, + "loss": 0.3208, + "step": 47030 + }, + { + "epoch": 2.65, + "grad_norm": 4.92804899036466, + "learning_rate": 4.0856609654925606e-07, + "loss": 0.3551, + "step": 47035 + }, + { + "epoch": 2.65, + "grad_norm": 4.689449107764158, + "learning_rate": 4.0791743235585603e-07, + "loss": 0.334, + "step": 47040 + }, + { + "epoch": 2.65, + "grad_norm": 4.276759902726785, + "learning_rate": 4.072692616019952e-07, + "loss": 0.3155, + "step": 47045 + }, + { + "epoch": 2.65, + "grad_norm": 5.3844193098775355, + "learning_rate": 4.066215843573207e-07, + "loss": 0.3324, + "step": 47050 + }, + { + "epoch": 2.65, + "grad_norm": 4.815219341175224, + "learning_rate": 4.0597440069143014e-07, + "loss": 0.3221, + "step": 47055 + }, + { + "epoch": 2.65, + "grad_norm": 4.692955230893898, + "learning_rate": 4.053277106738651e-07, + "loss": 0.3194, + "step": 47060 + }, + { + "epoch": 2.65, + "grad_norm": 4.6323274112565445, + "learning_rate": 4.0468151437411453e-07, + "loss": 0.3141, + "step": 47065 + }, + { + "epoch": 2.65, + "grad_norm": 4.631325692097944, + "learning_rate": 4.040358118616167e-07, + "loss": 0.3533, + "step": 47070 + }, + { + "epoch": 2.65, + "grad_norm": 4.927335165269176, + "learning_rate": 4.033906032057533e-07, + "loss": 0.3314, + "step": 47075 + }, + { + "epoch": 2.65, + "grad_norm": 4.877498789312842, + "learning_rate": 4.027458884758573e-07, + "loss": 0.3216, + "step": 47080 + }, + { + "epoch": 2.65, + "grad_norm": 4.246272771666314, + "learning_rate": 4.0210166774120374e-07, + "loss": 0.3425, + "step": 47085 + }, + { + "epoch": 2.65, + "grad_norm": 4.5247644234160145, + "learning_rate": 4.01457941071019e-07, + "loss": 0.3153, + "step": 47090 + }, + { + "epoch": 2.65, + "grad_norm": 4.463647383380514, + "learning_rate": 4.008147085344721e-07, + "loss": 0.3311, + "step": 47095 + }, + { + "epoch": 2.65, + "grad_norm": 4.559348588349339, + "learning_rate": 4.0017197020068287e-07, + "loss": 0.3375, + "step": 47100 + }, + { + "epoch": 2.65, + "grad_norm": 5.0138635726505365, + "learning_rate": 3.995297261387171e-07, + "loss": 0.3173, + "step": 47105 + }, + { + "epoch": 2.65, + "grad_norm": 5.303357727529372, + "learning_rate": 3.988879764175857e-07, + "loss": 0.2982, + "step": 47110 + }, + { + "epoch": 2.65, + "grad_norm": 4.298462690324856, + "learning_rate": 3.982467211062463e-07, + "loss": 0.3373, + "step": 47115 + }, + { + "epoch": 2.65, + "grad_norm": 4.304089836229813, + "learning_rate": 3.9760596027360655e-07, + "loss": 0.3076, + "step": 47120 + }, + { + "epoch": 2.66, + "grad_norm": 4.839555906421958, + "learning_rate": 3.9696569398851803e-07, + "loss": 0.3307, + "step": 47125 + }, + { + "epoch": 2.66, + "grad_norm": 4.749661601195171, + "learning_rate": 3.9632592231978075e-07, + "loss": 0.3214, + "step": 47130 + }, + { + "epoch": 2.66, + "grad_norm": 4.55621875569353, + "learning_rate": 3.9568664533614023e-07, + "loss": 0.3002, + "step": 47135 + }, + { + "epoch": 2.66, + "grad_norm": 5.0782440393019375, + "learning_rate": 3.950478631062904e-07, + "loss": 0.3302, + "step": 47140 + }, + { + "epoch": 2.66, + "grad_norm": 4.815075060577531, + "learning_rate": 3.944095756988714e-07, + "loss": 0.3214, + "step": 47145 + }, + { + "epoch": 2.66, + "grad_norm": 5.07546656532758, + "learning_rate": 3.9377178318246833e-07, + "loss": 0.3033, + "step": 47150 + }, + { + "epoch": 2.66, + "grad_norm": 4.256030271952451, + "learning_rate": 3.9313448562561694e-07, + "loss": 0.3452, + "step": 47155 + }, + { + "epoch": 2.66, + "grad_norm": 4.7105927018970934, + "learning_rate": 3.924976830967958e-07, + "loss": 0.3139, + "step": 47160 + }, + { + "epoch": 2.66, + "grad_norm": 5.395540297868007, + "learning_rate": 3.9186137566443406e-07, + "loss": 0.3214, + "step": 47165 + }, + { + "epoch": 2.66, + "grad_norm": 4.68165705261665, + "learning_rate": 3.9122556339690475e-07, + "loss": 0.3125, + "step": 47170 + }, + { + "epoch": 2.66, + "grad_norm": 5.225631421305167, + "learning_rate": 3.905902463625277e-07, + "loss": 0.2875, + "step": 47175 + }, + { + "epoch": 2.66, + "grad_norm": 4.282772827428368, + "learning_rate": 3.899554246295717e-07, + "loss": 0.2889, + "step": 47180 + }, + { + "epoch": 2.66, + "grad_norm": 4.838280264949798, + "learning_rate": 3.893210982662515e-07, + "loss": 0.3209, + "step": 47185 + }, + { + "epoch": 2.66, + "grad_norm": 4.877907268717985, + "learning_rate": 3.8868726734072716e-07, + "loss": 0.3296, + "step": 47190 + }, + { + "epoch": 2.66, + "grad_norm": 4.854364067016355, + "learning_rate": 3.880539319211085e-07, + "loss": 0.3373, + "step": 47195 + }, + { + "epoch": 2.66, + "grad_norm": 6.232484394987799, + "learning_rate": 3.8742109207544787e-07, + "loss": 0.3294, + "step": 47200 + }, + { + "epoch": 2.66, + "grad_norm": 4.679387401102267, + "learning_rate": 3.8678874787174804e-07, + "loss": 0.3355, + "step": 47205 + }, + { + "epoch": 2.66, + "grad_norm": 4.816690183043029, + "learning_rate": 3.861568993779569e-07, + "loss": 0.2978, + "step": 47210 + }, + { + "epoch": 2.66, + "grad_norm": 4.60766726117633, + "learning_rate": 3.855255466619695e-07, + "loss": 0.2869, + "step": 47215 + }, + { + "epoch": 2.66, + "grad_norm": 4.56949189783215, + "learning_rate": 3.848946897916267e-07, + "loss": 0.3089, + "step": 47220 + }, + { + "epoch": 2.66, + "grad_norm": 4.957594418906804, + "learning_rate": 3.8426432883471863e-07, + "loss": 0.314, + "step": 47225 + }, + { + "epoch": 2.66, + "grad_norm": 5.282474940892623, + "learning_rate": 3.8363446385897896e-07, + "loss": 0.3333, + "step": 47230 + }, + { + "epoch": 2.66, + "grad_norm": 5.122244530794561, + "learning_rate": 3.8300509493208906e-07, + "loss": 0.3353, + "step": 47235 + }, + { + "epoch": 2.66, + "grad_norm": 6.846221853812618, + "learning_rate": 3.823762221216787e-07, + "loss": 0.329, + "step": 47240 + }, + { + "epoch": 2.66, + "grad_norm": 5.04568969048718, + "learning_rate": 3.8174784549532163e-07, + "loss": 0.36, + "step": 47245 + }, + { + "epoch": 2.66, + "grad_norm": 4.340629017456971, + "learning_rate": 3.8111996512054104e-07, + "loss": 0.3588, + "step": 47250 + }, + { + "epoch": 2.66, + "grad_norm": 4.644180897846171, + "learning_rate": 3.8049258106480525e-07, + "loss": 0.2957, + "step": 47255 + }, + { + "epoch": 2.66, + "grad_norm": 4.14247660758316, + "learning_rate": 3.7986569339552804e-07, + "loss": 0.3081, + "step": 47260 + }, + { + "epoch": 2.66, + "grad_norm": 4.974849820472255, + "learning_rate": 3.7923930218007334e-07, + "loss": 0.324, + "step": 47265 + }, + { + "epoch": 2.66, + "grad_norm": 4.597716890240932, + "learning_rate": 3.786134074857478e-07, + "loss": 0.3488, + "step": 47270 + }, + { + "epoch": 2.66, + "grad_norm": 5.484337347488252, + "learning_rate": 3.779880093798072e-07, + "loss": 0.3258, + "step": 47275 + }, + { + "epoch": 2.66, + "grad_norm": 6.491127473804733, + "learning_rate": 3.773631079294543e-07, + "loss": 0.3093, + "step": 47280 + }, + { + "epoch": 2.66, + "grad_norm": 4.897306774978777, + "learning_rate": 3.767387032018355e-07, + "loss": 0.3383, + "step": 47285 + }, + { + "epoch": 2.66, + "grad_norm": 4.787247236921363, + "learning_rate": 3.761147952640487e-07, + "loss": 0.3003, + "step": 47290 + }, + { + "epoch": 2.66, + "grad_norm": 4.772062282452134, + "learning_rate": 3.7549138418313257e-07, + "loss": 0.3312, + "step": 47295 + }, + { + "epoch": 2.67, + "grad_norm": 5.361851150668638, + "learning_rate": 3.748684700260774e-07, + "loss": 0.3252, + "step": 47300 + }, + { + "epoch": 2.67, + "grad_norm": 6.3037318819974235, + "learning_rate": 3.742460528598169e-07, + "loss": 0.3248, + "step": 47305 + }, + { + "epoch": 2.67, + "grad_norm": 4.569751016260492, + "learning_rate": 3.7362413275123356e-07, + "loss": 0.3017, + "step": 47310 + }, + { + "epoch": 2.67, + "grad_norm": 5.312452729981813, + "learning_rate": 3.7300270976715514e-07, + "loss": 0.3413, + "step": 47315 + }, + { + "epoch": 2.67, + "grad_norm": 4.298661718228763, + "learning_rate": 3.723817839743554e-07, + "loss": 0.3193, + "step": 47320 + }, + { + "epoch": 2.67, + "grad_norm": 4.563306621629549, + "learning_rate": 3.717613554395566e-07, + "loss": 0.3081, + "step": 47325 + }, + { + "epoch": 2.67, + "grad_norm": 4.366297381680645, + "learning_rate": 3.7114142422942534e-07, + "loss": 0.3139, + "step": 47330 + }, + { + "epoch": 2.67, + "grad_norm": 4.856945936723807, + "learning_rate": 3.705219904105778e-07, + "loss": 0.372, + "step": 47335 + }, + { + "epoch": 2.67, + "grad_norm": 4.350410815432054, + "learning_rate": 3.6990305404957404e-07, + "loss": 0.2971, + "step": 47340 + }, + { + "epoch": 2.67, + "grad_norm": 4.962664741492998, + "learning_rate": 3.6928461521292036e-07, + "loss": 0.316, + "step": 47345 + }, + { + "epoch": 2.67, + "grad_norm": 4.734462988417854, + "learning_rate": 3.686666739670719e-07, + "loss": 0.3232, + "step": 47350 + }, + { + "epoch": 2.67, + "grad_norm": 4.996910036124418, + "learning_rate": 3.680492303784289e-07, + "loss": 0.3275, + "step": 47355 + }, + { + "epoch": 2.67, + "grad_norm": 5.170838149373807, + "learning_rate": 3.674322845133388e-07, + "loss": 0.316, + "step": 47360 + }, + { + "epoch": 2.67, + "grad_norm": 4.105056301647215, + "learning_rate": 3.6681583643809416e-07, + "loss": 0.304, + "step": 47365 + }, + { + "epoch": 2.67, + "grad_norm": 4.798121810378818, + "learning_rate": 3.661998862189359e-07, + "loss": 0.328, + "step": 47370 + }, + { + "epoch": 2.67, + "grad_norm": 4.925917264774166, + "learning_rate": 3.65584433922051e-07, + "loss": 0.3384, + "step": 47375 + }, + { + "epoch": 2.67, + "grad_norm": 4.889806270974378, + "learning_rate": 3.649694796135711e-07, + "loss": 0.3082, + "step": 47380 + }, + { + "epoch": 2.67, + "grad_norm": 4.766925180638513, + "learning_rate": 3.643550233595772e-07, + "loss": 0.3309, + "step": 47385 + }, + { + "epoch": 2.67, + "grad_norm": 6.921202646989476, + "learning_rate": 3.637410652260942e-07, + "loss": 0.3213, + "step": 47390 + }, + { + "epoch": 2.67, + "grad_norm": 4.470605708476463, + "learning_rate": 3.631276052790955e-07, + "loss": 0.3101, + "step": 47395 + }, + { + "epoch": 2.67, + "grad_norm": 6.007220511121349, + "learning_rate": 3.6251464358450005e-07, + "loss": 0.3194, + "step": 47400 + }, + { + "epoch": 2.67, + "grad_norm": 4.285716298893889, + "learning_rate": 3.619021802081724e-07, + "loss": 0.3095, + "step": 47405 + }, + { + "epoch": 2.67, + "grad_norm": 5.376894087725877, + "learning_rate": 3.6129021521592533e-07, + "loss": 0.3388, + "step": 47410 + }, + { + "epoch": 2.67, + "grad_norm": 4.372636904633146, + "learning_rate": 3.606787486735158e-07, + "loss": 0.3347, + "step": 47415 + }, + { + "epoch": 2.67, + "grad_norm": 5.155466949842631, + "learning_rate": 3.600677806466507e-07, + "loss": 0.3194, + "step": 47420 + }, + { + "epoch": 2.67, + "grad_norm": 4.440585903857065, + "learning_rate": 3.594573112009803e-07, + "loss": 0.3292, + "step": 47425 + }, + { + "epoch": 2.67, + "grad_norm": 4.925003167153565, + "learning_rate": 3.5884734040210147e-07, + "loss": 0.34, + "step": 47430 + }, + { + "epoch": 2.67, + "grad_norm": 4.586192778987076, + "learning_rate": 3.5823786831555906e-07, + "loss": 0.337, + "step": 47435 + }, + { + "epoch": 2.67, + "grad_norm": 4.935545151503379, + "learning_rate": 3.5762889500684294e-07, + "loss": 0.3562, + "step": 47440 + }, + { + "epoch": 2.67, + "grad_norm": 4.423750760500127, + "learning_rate": 3.570204205413913e-07, + "loss": 0.284, + "step": 47445 + }, + { + "epoch": 2.67, + "grad_norm": 4.483843025042958, + "learning_rate": 3.564124449845857e-07, + "loss": 0.3167, + "step": 47450 + }, + { + "epoch": 2.67, + "grad_norm": 5.115499035227642, + "learning_rate": 3.558049684017573e-07, + "loss": 0.3542, + "step": 47455 + }, + { + "epoch": 2.67, + "grad_norm": 4.597403206810282, + "learning_rate": 3.551979908581804e-07, + "loss": 0.3213, + "step": 47460 + }, + { + "epoch": 2.67, + "grad_norm": 6.082674894256887, + "learning_rate": 3.5459151241907853e-07, + "loss": 0.3356, + "step": 47465 + }, + { + "epoch": 2.67, + "grad_norm": 3.905002825773987, + "learning_rate": 3.539855331496211e-07, + "loss": 0.3163, + "step": 47470 + }, + { + "epoch": 2.67, + "grad_norm": 5.285027227430392, + "learning_rate": 3.533800531149223e-07, + "loss": 0.3021, + "step": 47475 + }, + { + "epoch": 2.68, + "grad_norm": 4.63205866538077, + "learning_rate": 3.527750723800444e-07, + "loss": 0.3138, + "step": 47480 + }, + { + "epoch": 2.68, + "grad_norm": 4.489272684043733, + "learning_rate": 3.5217059100999487e-07, + "loss": 0.3194, + "step": 47485 + }, + { + "epoch": 2.68, + "grad_norm": 3.9383592562869207, + "learning_rate": 3.5156660906972674e-07, + "loss": 0.2854, + "step": 47490 + }, + { + "epoch": 2.68, + "grad_norm": 4.558018527068866, + "learning_rate": 3.50963126624142e-07, + "loss": 0.3072, + "step": 47495 + }, + { + "epoch": 2.68, + "grad_norm": 4.4583540022292105, + "learning_rate": 3.5036014373808716e-07, + "loss": 0.3087, + "step": 47500 + }, + { + "epoch": 2.68, + "grad_norm": 4.95993238449542, + "learning_rate": 3.497576604763553e-07, + "loss": 0.2955, + "step": 47505 + }, + { + "epoch": 2.68, + "grad_norm": 5.438456015228428, + "learning_rate": 3.4915567690368634e-07, + "loss": 0.3387, + "step": 47510 + }, + { + "epoch": 2.68, + "grad_norm": 4.490596650882147, + "learning_rate": 3.485541930847647e-07, + "loss": 0.3084, + "step": 47515 + }, + { + "epoch": 2.68, + "grad_norm": 4.661525193352011, + "learning_rate": 3.479532090842241e-07, + "loss": 0.3273, + "step": 47520 + }, + { + "epoch": 2.68, + "grad_norm": 4.502288950846946, + "learning_rate": 3.4735272496664197e-07, + "loss": 0.3447, + "step": 47525 + }, + { + "epoch": 2.68, + "grad_norm": 4.2317737559166755, + "learning_rate": 3.4675274079654374e-07, + "loss": 0.3172, + "step": 47530 + }, + { + "epoch": 2.68, + "grad_norm": 4.948338884048343, + "learning_rate": 3.461532566383985e-07, + "loss": 0.3162, + "step": 47535 + }, + { + "epoch": 2.68, + "grad_norm": 4.63747731015127, + "learning_rate": 3.4555427255662633e-07, + "loss": 0.2946, + "step": 47540 + }, + { + "epoch": 2.68, + "grad_norm": 4.812212196188589, + "learning_rate": 3.44955788615588e-07, + "loss": 0.3121, + "step": 47545 + }, + { + "epoch": 2.68, + "grad_norm": 6.128273368307266, + "learning_rate": 3.4435780487959593e-07, + "loss": 0.3157, + "step": 47550 + }, + { + "epoch": 2.68, + "grad_norm": 4.638519535583288, + "learning_rate": 3.4376032141290315e-07, + "loss": 0.3339, + "step": 47555 + }, + { + "epoch": 2.68, + "grad_norm": 5.321372248864944, + "learning_rate": 3.4316333827971336e-07, + "loss": 0.3697, + "step": 47560 + }, + { + "epoch": 2.68, + "grad_norm": 4.509442331848998, + "learning_rate": 3.4256685554417637e-07, + "loss": 0.3068, + "step": 47565 + }, + { + "epoch": 2.68, + "grad_norm": 4.670117243748899, + "learning_rate": 3.4197087327038523e-07, + "loss": 0.3229, + "step": 47570 + }, + { + "epoch": 2.68, + "grad_norm": 4.590114746808976, + "learning_rate": 3.4137539152238053e-07, + "loss": 0.3275, + "step": 47575 + }, + { + "epoch": 2.68, + "grad_norm": 4.2817912196370935, + "learning_rate": 3.4078041036415097e-07, + "loss": 0.3199, + "step": 47580 + }, + { + "epoch": 2.68, + "grad_norm": 4.471639898787827, + "learning_rate": 3.401859298596277e-07, + "loss": 0.3002, + "step": 47585 + }, + { + "epoch": 2.68, + "grad_norm": 4.69528646125724, + "learning_rate": 3.3959195007269297e-07, + "loss": 0.338, + "step": 47590 + }, + { + "epoch": 2.68, + "grad_norm": 4.715543890771803, + "learning_rate": 3.3899847106717063e-07, + "loss": 0.3418, + "step": 47595 + }, + { + "epoch": 2.68, + "grad_norm": 5.217419483804426, + "learning_rate": 3.3840549290683313e-07, + "loss": 0.3275, + "step": 47600 + }, + { + "epoch": 2.68, + "grad_norm": 4.790687169048269, + "learning_rate": 3.378130156553988e-07, + "loss": 0.3228, + "step": 47605 + }, + { + "epoch": 2.68, + "grad_norm": 4.399420151552524, + "learning_rate": 3.372210393765313e-07, + "loss": 0.3275, + "step": 47610 + }, + { + "epoch": 2.68, + "grad_norm": 6.143734271429322, + "learning_rate": 3.3662956413384187e-07, + "loss": 0.3386, + "step": 47615 + }, + { + "epoch": 2.68, + "grad_norm": 4.7673449533142325, + "learning_rate": 3.3603858999088635e-07, + "loss": 0.3118, + "step": 47620 + }, + { + "epoch": 2.68, + "grad_norm": 4.854963445528044, + "learning_rate": 3.354481170111684e-07, + "loss": 0.3279, + "step": 47625 + }, + { + "epoch": 2.68, + "grad_norm": 5.078103501442932, + "learning_rate": 3.3485814525813675e-07, + "loss": 0.3434, + "step": 47630 + }, + { + "epoch": 2.68, + "grad_norm": 4.034977945477632, + "learning_rate": 3.3426867479518563e-07, + "loss": 0.2907, + "step": 47635 + }, + { + "epoch": 2.68, + "grad_norm": 4.383761195192665, + "learning_rate": 3.3367970568565665e-07, + "loss": 0.3415, + "step": 47640 + }, + { + "epoch": 2.68, + "grad_norm": 4.386557697058935, + "learning_rate": 3.3309123799283804e-07, + "loss": 0.3412, + "step": 47645 + }, + { + "epoch": 2.68, + "grad_norm": 4.333341839070483, + "learning_rate": 3.32503271779962e-07, + "loss": 0.3098, + "step": 47650 + }, + { + "epoch": 2.69, + "grad_norm": 4.34362991618549, + "learning_rate": 3.319158071102091e-07, + "loss": 0.3106, + "step": 47655 + }, + { + "epoch": 2.69, + "grad_norm": 6.254660178129415, + "learning_rate": 3.313288440467044e-07, + "loss": 0.3172, + "step": 47660 + }, + { + "epoch": 2.69, + "grad_norm": 5.0917619833565775, + "learning_rate": 3.307423826525202e-07, + "loss": 0.3005, + "step": 47665 + }, + { + "epoch": 2.69, + "grad_norm": 5.117232752981816, + "learning_rate": 3.301564229906729e-07, + "loss": 0.3165, + "step": 47670 + }, + { + "epoch": 2.69, + "grad_norm": 4.57774333667238, + "learning_rate": 3.2957096512412856e-07, + "loss": 0.337, + "step": 47675 + }, + { + "epoch": 2.69, + "grad_norm": 4.408126168209125, + "learning_rate": 3.2898600911579646e-07, + "loss": 0.3365, + "step": 47680 + }, + { + "epoch": 2.69, + "grad_norm": 4.889320483711481, + "learning_rate": 3.2840155502853133e-07, + "loss": 0.3189, + "step": 47685 + }, + { + "epoch": 2.69, + "grad_norm": 4.362285259304799, + "learning_rate": 3.278176029251373e-07, + "loss": 0.354, + "step": 47690 + }, + { + "epoch": 2.69, + "grad_norm": 4.8069990841662324, + "learning_rate": 3.272341528683609e-07, + "loss": 0.3232, + "step": 47695 + }, + { + "epoch": 2.69, + "grad_norm": 5.26464172603452, + "learning_rate": 3.266512049208981e-07, + "loss": 0.32, + "step": 47700 + }, + { + "epoch": 2.69, + "grad_norm": 4.565820319698187, + "learning_rate": 3.260687591453876e-07, + "loss": 0.3212, + "step": 47705 + }, + { + "epoch": 2.69, + "grad_norm": 4.741254672867791, + "learning_rate": 3.2548681560441775e-07, + "loss": 0.3151, + "step": 47710 + }, + { + "epoch": 2.69, + "grad_norm": 4.563889335836407, + "learning_rate": 3.2490537436051905e-07, + "loss": 0.2942, + "step": 47715 + }, + { + "epoch": 2.69, + "grad_norm": 4.133163083253335, + "learning_rate": 3.2432443547617045e-07, + "loss": 0.3355, + "step": 47720 + }, + { + "epoch": 2.69, + "grad_norm": 5.16777636294225, + "learning_rate": 3.2374399901379696e-07, + "loss": 0.348, + "step": 47725 + }, + { + "epoch": 2.69, + "grad_norm": 5.777794135019218, + "learning_rate": 3.2316406503576816e-07, + "loss": 0.3163, + "step": 47730 + }, + { + "epoch": 2.69, + "grad_norm": 4.558942667410501, + "learning_rate": 3.2258463360440083e-07, + "loss": 0.3191, + "step": 47735 + }, + { + "epoch": 2.69, + "grad_norm": 4.955876311305226, + "learning_rate": 3.2200570478195857e-07, + "loss": 0.3727, + "step": 47740 + }, + { + "epoch": 2.69, + "grad_norm": 4.512120144335555, + "learning_rate": 3.214272786306483e-07, + "loss": 0.3147, + "step": 47745 + }, + { + "epoch": 2.69, + "grad_norm": 6.178539157244023, + "learning_rate": 3.208493552126257e-07, + "loss": 0.3429, + "step": 47750 + }, + { + "epoch": 2.69, + "grad_norm": 4.697660185506693, + "learning_rate": 3.202719345899896e-07, + "loss": 0.3188, + "step": 47755 + }, + { + "epoch": 2.69, + "grad_norm": 4.448063930221362, + "learning_rate": 3.1969501682478807e-07, + "loss": 0.3018, + "step": 47760 + }, + { + "epoch": 2.69, + "grad_norm": 4.3512123122337805, + "learning_rate": 3.1911860197901267e-07, + "loss": 0.3118, + "step": 47765 + }, + { + "epoch": 2.69, + "grad_norm": 3.9023805265563696, + "learning_rate": 3.185426901146016e-07, + "loss": 0.2779, + "step": 47770 + }, + { + "epoch": 2.69, + "grad_norm": 4.751318911524001, + "learning_rate": 3.179672812934398e-07, + "loss": 0.3207, + "step": 47775 + }, + { + "epoch": 2.69, + "grad_norm": 5.11648296728079, + "learning_rate": 3.1739237557735613e-07, + "loss": 0.3055, + "step": 47780 + }, + { + "epoch": 2.69, + "grad_norm": 4.574882169294049, + "learning_rate": 3.168179730281279e-07, + "loss": 0.3356, + "step": 47785 + }, + { + "epoch": 2.69, + "grad_norm": 5.027181941766636, + "learning_rate": 3.162440737074768e-07, + "loss": 0.3081, + "step": 47790 + }, + { + "epoch": 2.69, + "grad_norm": 5.112720326862771, + "learning_rate": 3.156706776770718e-07, + "loss": 0.3308, + "step": 47795 + }, + { + "epoch": 2.69, + "grad_norm": 4.473631153161822, + "learning_rate": 3.1509778499852597e-07, + "loss": 0.2996, + "step": 47800 + }, + { + "epoch": 2.69, + "grad_norm": 4.23761028541806, + "learning_rate": 3.1452539573339826e-07, + "loss": 0.3251, + "step": 47805 + }, + { + "epoch": 2.69, + "grad_norm": 4.861753624417564, + "learning_rate": 3.1395350994319617e-07, + "loss": 0.3309, + "step": 47810 + }, + { + "epoch": 2.69, + "grad_norm": 5.10625851829245, + "learning_rate": 3.1338212768936947e-07, + "loss": 0.3801, + "step": 47815 + }, + { + "epoch": 2.69, + "grad_norm": 4.485121295117052, + "learning_rate": 3.128112490333179e-07, + "loss": 0.315, + "step": 47820 + }, + { + "epoch": 2.69, + "grad_norm": 4.166181052258781, + "learning_rate": 3.12240874036383e-07, + "loss": 0.3275, + "step": 47825 + }, + { + "epoch": 2.69, + "grad_norm": 5.497452351302918, + "learning_rate": 3.1167100275985464e-07, + "loss": 0.3266, + "step": 47830 + }, + { + "epoch": 2.7, + "grad_norm": 4.630048357828678, + "learning_rate": 3.111016352649687e-07, + "loss": 0.3196, + "step": 47835 + }, + { + "epoch": 2.7, + "grad_norm": 4.718753036645414, + "learning_rate": 3.1053277161290585e-07, + "loss": 0.3036, + "step": 47840 + }, + { + "epoch": 2.7, + "grad_norm": 4.262779770629185, + "learning_rate": 3.0996441186479265e-07, + "loss": 0.3289, + "step": 47845 + }, + { + "epoch": 2.7, + "grad_norm": 5.573369072323231, + "learning_rate": 3.0939655608170306e-07, + "loss": 0.3034, + "step": 47850 + }, + { + "epoch": 2.7, + "grad_norm": 5.434752011193726, + "learning_rate": 3.0882920432465324e-07, + "loss": 0.3272, + "step": 47855 + }, + { + "epoch": 2.7, + "grad_norm": 5.4238194974092595, + "learning_rate": 3.0826235665461004e-07, + "loss": 0.3355, + "step": 47860 + }, + { + "epoch": 2.7, + "grad_norm": 5.945574492558196, + "learning_rate": 3.0769601313248244e-07, + "loss": 0.2991, + "step": 47865 + }, + { + "epoch": 2.7, + "grad_norm": 4.430777388168083, + "learning_rate": 3.0713017381912736e-07, + "loss": 0.3396, + "step": 47870 + }, + { + "epoch": 2.7, + "grad_norm": 4.468089592102152, + "learning_rate": 3.0656483877534557e-07, + "loss": 0.3102, + "step": 47875 + }, + { + "epoch": 2.7, + "grad_norm": 4.902235614015058, + "learning_rate": 3.0600000806188633e-07, + "loss": 0.3292, + "step": 47880 + }, + { + "epoch": 2.7, + "grad_norm": 4.368090791078648, + "learning_rate": 3.0543568173944273e-07, + "loss": 0.296, + "step": 47885 + }, + { + "epoch": 2.7, + "grad_norm": 5.083264614720332, + "learning_rate": 3.0487185986865285e-07, + "loss": 0.3502, + "step": 47890 + }, + { + "epoch": 2.7, + "grad_norm": 5.317452748816307, + "learning_rate": 3.043085425101033e-07, + "loss": 0.3234, + "step": 47895 + }, + { + "epoch": 2.7, + "grad_norm": 4.408397455549362, + "learning_rate": 3.0374572972432405e-07, + "loss": 0.314, + "step": 47900 + }, + { + "epoch": 2.7, + "grad_norm": 4.5821145538223345, + "learning_rate": 3.0318342157179326e-07, + "loss": 0.3383, + "step": 47905 + }, + { + "epoch": 2.7, + "grad_norm": 4.343002499016802, + "learning_rate": 3.0262161811293155e-07, + "loss": 0.3213, + "step": 47910 + }, + { + "epoch": 2.7, + "grad_norm": 4.287329186077056, + "learning_rate": 3.020603194081084e-07, + "loss": 0.2996, + "step": 47915 + }, + { + "epoch": 2.7, + "grad_norm": 4.71053580808724, + "learning_rate": 3.0149952551763726e-07, + "loss": 0.311, + "step": 47920 + }, + { + "epoch": 2.7, + "grad_norm": 5.076330986299103, + "learning_rate": 3.0093923650177827e-07, + "loss": 0.3253, + "step": 47925 + }, + { + "epoch": 2.7, + "grad_norm": 4.483208349833166, + "learning_rate": 3.0037945242073706e-07, + "loss": 0.3263, + "step": 47930 + }, + { + "epoch": 2.7, + "grad_norm": 4.786764538160286, + "learning_rate": 2.998201733346645e-07, + "loss": 0.3244, + "step": 47935 + }, + { + "epoch": 2.7, + "grad_norm": 4.341878249099872, + "learning_rate": 2.9926139930365807e-07, + "loss": 0.3228, + "step": 47940 + }, + { + "epoch": 2.7, + "grad_norm": 4.6848355866544065, + "learning_rate": 2.987031303877602e-07, + "loss": 0.3238, + "step": 47945 + }, + { + "epoch": 2.7, + "grad_norm": 4.875822688048872, + "learning_rate": 2.9814536664695914e-07, + "loss": 0.327, + "step": 47950 + }, + { + "epoch": 2.7, + "grad_norm": 4.128496690051945, + "learning_rate": 2.9758810814118967e-07, + "loss": 0.3279, + "step": 47955 + }, + { + "epoch": 2.7, + "grad_norm": 4.502126110869699, + "learning_rate": 2.9703135493033053e-07, + "loss": 0.3209, + "step": 47960 + }, + { + "epoch": 2.7, + "grad_norm": 4.372510229707003, + "learning_rate": 2.964751070742089e-07, + "loss": 0.3188, + "step": 47965 + }, + { + "epoch": 2.7, + "grad_norm": 4.99897515610518, + "learning_rate": 2.9591936463259585e-07, + "loss": 0.3095, + "step": 47970 + }, + { + "epoch": 2.7, + "grad_norm": 4.836799461654691, + "learning_rate": 2.953641276652064e-07, + "loss": 0.3462, + "step": 47975 + }, + { + "epoch": 2.7, + "grad_norm": 5.0562436096974, + "learning_rate": 2.94809396231705e-07, + "loss": 0.3292, + "step": 47980 + }, + { + "epoch": 2.7, + "grad_norm": 4.5157550259236485, + "learning_rate": 2.942551703916996e-07, + "loss": 0.3549, + "step": 47985 + }, + { + "epoch": 2.7, + "grad_norm": 5.155132591518107, + "learning_rate": 2.937014502047442e-07, + "loss": 0.353, + "step": 47990 + }, + { + "epoch": 2.7, + "grad_norm": 4.4462293514944955, + "learning_rate": 2.931482357303389e-07, + "loss": 0.3202, + "step": 47995 + }, + { + "epoch": 2.7, + "grad_norm": 5.008498180222657, + "learning_rate": 2.9259552702792793e-07, + "loss": 0.3432, + "step": 48000 + }, + { + "epoch": 2.7, + "grad_norm": 6.013341013228123, + "learning_rate": 2.9204332415690264e-07, + "loss": 0.3249, + "step": 48005 + }, + { + "epoch": 2.71, + "grad_norm": 4.294916659419839, + "learning_rate": 2.9149162717660107e-07, + "loss": 0.2977, + "step": 48010 + }, + { + "epoch": 2.71, + "grad_norm": 4.596923682020863, + "learning_rate": 2.9094043614630305e-07, + "loss": 0.3387, + "step": 48015 + }, + { + "epoch": 2.71, + "grad_norm": 4.340613792857137, + "learning_rate": 2.9038975112523847e-07, + "loss": 0.2886, + "step": 48020 + }, + { + "epoch": 2.71, + "grad_norm": 4.588770766517988, + "learning_rate": 2.898395721725805e-07, + "loss": 0.297, + "step": 48025 + }, + { + "epoch": 2.71, + "grad_norm": 4.636047363816947, + "learning_rate": 2.89289899347448e-07, + "loss": 0.3187, + "step": 48030 + }, + { + "epoch": 2.71, + "grad_norm": 4.456184039149601, + "learning_rate": 2.8874073270890536e-07, + "loss": 0.2915, + "step": 48035 + }, + { + "epoch": 2.71, + "grad_norm": 5.119085168184362, + "learning_rate": 2.8819207231596424e-07, + "loss": 0.34, + "step": 48040 + }, + { + "epoch": 2.71, + "grad_norm": 4.550098220068273, + "learning_rate": 2.8764391822757867e-07, + "loss": 0.2936, + "step": 48045 + }, + { + "epoch": 2.71, + "grad_norm": 4.840685867505534, + "learning_rate": 2.87096270502652e-07, + "loss": 0.335, + "step": 48050 + }, + { + "epoch": 2.71, + "grad_norm": 8.596024096044914, + "learning_rate": 2.8654912920003106e-07, + "loss": 0.3078, + "step": 48055 + }, + { + "epoch": 2.71, + "grad_norm": 5.39991037935613, + "learning_rate": 2.8600249437850725e-07, + "loss": 0.3083, + "step": 48060 + }, + { + "epoch": 2.71, + "grad_norm": 4.66445646412765, + "learning_rate": 2.8545636609682017e-07, + "loss": 0.3247, + "step": 48065 + }, + { + "epoch": 2.71, + "grad_norm": 4.939081279564444, + "learning_rate": 2.849107444136534e-07, + "loss": 0.3177, + "step": 48070 + }, + { + "epoch": 2.71, + "grad_norm": 4.371676845073246, + "learning_rate": 2.8436562938763676e-07, + "loss": 0.2955, + "step": 48075 + }, + { + "epoch": 2.71, + "grad_norm": 4.7915892877165485, + "learning_rate": 2.8382102107734513e-07, + "loss": 0.3347, + "step": 48080 + }, + { + "epoch": 2.71, + "grad_norm": 4.900237168206849, + "learning_rate": 2.8327691954129823e-07, + "loss": 0.3351, + "step": 48085 + }, + { + "epoch": 2.71, + "grad_norm": 4.6266733418582895, + "learning_rate": 2.827333248379632e-07, + "loss": 0.2983, + "step": 48090 + }, + { + "epoch": 2.71, + "grad_norm": 4.536261795730333, + "learning_rate": 2.821902370257507e-07, + "loss": 0.3433, + "step": 48095 + }, + { + "epoch": 2.71, + "grad_norm": 5.028518494385905, + "learning_rate": 2.8164765616301827e-07, + "loss": 0.3138, + "step": 48100 + }, + { + "epoch": 2.71, + "grad_norm": 4.13318173852176, + "learning_rate": 2.8110558230806993e-07, + "loss": 0.3143, + "step": 48105 + }, + { + "epoch": 2.71, + "grad_norm": 4.754033215845489, + "learning_rate": 2.805640155191519e-07, + "loss": 0.3252, + "step": 48110 + }, + { + "epoch": 2.71, + "grad_norm": 4.318566146912634, + "learning_rate": 2.800229558544598e-07, + "loss": 0.3267, + "step": 48115 + }, + { + "epoch": 2.71, + "grad_norm": 4.6740527822773155, + "learning_rate": 2.7948240337213164e-07, + "loss": 0.3072, + "step": 48120 + }, + { + "epoch": 2.71, + "grad_norm": 5.607471005702266, + "learning_rate": 2.789423581302525e-07, + "loss": 0.3046, + "step": 48125 + }, + { + "epoch": 2.71, + "grad_norm": 5.386526630340396, + "learning_rate": 2.7840282018685215e-07, + "loss": 0.3458, + "step": 48130 + }, + { + "epoch": 2.71, + "grad_norm": 4.719610208826684, + "learning_rate": 2.77863789599907e-07, + "loss": 0.3177, + "step": 48135 + }, + { + "epoch": 2.71, + "grad_norm": 4.751449920202733, + "learning_rate": 2.773252664273385e-07, + "loss": 0.2937, + "step": 48140 + }, + { + "epoch": 2.71, + "grad_norm": 5.935405482501159, + "learning_rate": 2.767872507270125e-07, + "loss": 0.3242, + "step": 48145 + }, + { + "epoch": 2.71, + "grad_norm": 4.553882914254541, + "learning_rate": 2.7624974255674175e-07, + "loss": 0.3157, + "step": 48150 + }, + { + "epoch": 2.71, + "grad_norm": 4.8498608330619675, + "learning_rate": 2.757127419742828e-07, + "loss": 0.316, + "step": 48155 + }, + { + "epoch": 2.71, + "grad_norm": 4.544397331845315, + "learning_rate": 2.751762490373405e-07, + "loss": 0.319, + "step": 48160 + }, + { + "epoch": 2.71, + "grad_norm": 5.354977104200328, + "learning_rate": 2.7464026380356277e-07, + "loss": 0.3192, + "step": 48165 + }, + { + "epoch": 2.71, + "grad_norm": 5.785778652903822, + "learning_rate": 2.741047863305424e-07, + "loss": 0.3041, + "step": 48170 + }, + { + "epoch": 2.71, + "grad_norm": 4.947356521333472, + "learning_rate": 2.735698166758205e-07, + "loss": 0.3401, + "step": 48175 + }, + { + "epoch": 2.71, + "grad_norm": 4.265983571552172, + "learning_rate": 2.7303535489688004e-07, + "loss": 0.3404, + "step": 48180 + }, + { + "epoch": 2.71, + "grad_norm": 4.923016936776703, + "learning_rate": 2.725014010511534e-07, + "loss": 0.35, + "step": 48185 + }, + { + "epoch": 2.72, + "grad_norm": 4.29443219370602, + "learning_rate": 2.719679551960147e-07, + "loss": 0.2947, + "step": 48190 + }, + { + "epoch": 2.72, + "grad_norm": 4.403594296346075, + "learning_rate": 2.714350173887853e-07, + "loss": 0.3307, + "step": 48195 + }, + { + "epoch": 2.72, + "grad_norm": 4.832278401792976, + "learning_rate": 2.7090258768673274e-07, + "loss": 0.3229, + "step": 48200 + }, + { + "epoch": 2.72, + "grad_norm": 4.291655696679214, + "learning_rate": 2.703706661470679e-07, + "loss": 0.2846, + "step": 48205 + }, + { + "epoch": 2.72, + "grad_norm": 4.725478258645117, + "learning_rate": 2.6983925282694887e-07, + "loss": 0.337, + "step": 48210 + }, + { + "epoch": 2.72, + "grad_norm": 4.636798919742124, + "learning_rate": 2.693083477834768e-07, + "loss": 0.3493, + "step": 48215 + }, + { + "epoch": 2.72, + "grad_norm": 4.384085649462769, + "learning_rate": 2.6877795107370206e-07, + "loss": 0.2948, + "step": 48220 + }, + { + "epoch": 2.72, + "grad_norm": 6.095443922903502, + "learning_rate": 2.682480627546169e-07, + "loss": 0.3046, + "step": 48225 + }, + { + "epoch": 2.72, + "grad_norm": 4.340265699931569, + "learning_rate": 2.677186828831596e-07, + "loss": 0.3098, + "step": 48230 + }, + { + "epoch": 2.72, + "grad_norm": 4.573530998144871, + "learning_rate": 2.671898115162158e-07, + "loss": 0.3356, + "step": 48235 + }, + { + "epoch": 2.72, + "grad_norm": 4.553303339441197, + "learning_rate": 2.666614487106134e-07, + "loss": 0.2995, + "step": 48240 + }, + { + "epoch": 2.72, + "grad_norm": 4.380895526287801, + "learning_rate": 2.661335945231286e-07, + "loss": 0.3202, + "step": 48245 + }, + { + "epoch": 2.72, + "grad_norm": 4.645901369789934, + "learning_rate": 2.6560624901048103e-07, + "loss": 0.3077, + "step": 48250 + }, + { + "epoch": 2.72, + "grad_norm": 4.9588769543741975, + "learning_rate": 2.650794122293365e-07, + "loss": 0.3044, + "step": 48255 + }, + { + "epoch": 2.72, + "grad_norm": 4.7119343066023, + "learning_rate": 2.645530842363059e-07, + "loss": 0.2998, + "step": 48260 + }, + { + "epoch": 2.72, + "grad_norm": 5.200997395871063, + "learning_rate": 2.640272650879455e-07, + "loss": 0.3053, + "step": 48265 + }, + { + "epoch": 2.72, + "grad_norm": 4.358109505683691, + "learning_rate": 2.635019548407575e-07, + "loss": 0.2984, + "step": 48270 + }, + { + "epoch": 2.72, + "grad_norm": 4.821216192556364, + "learning_rate": 2.6297715355118714e-07, + "loss": 0.3151, + "step": 48275 + }, + { + "epoch": 2.72, + "grad_norm": 4.443574438783689, + "learning_rate": 2.624528612756283e-07, + "loss": 0.3325, + "step": 48280 + }, + { + "epoch": 2.72, + "grad_norm": 4.514500606658843, + "learning_rate": 2.6192907807041756e-07, + "loss": 0.3345, + "step": 48285 + }, + { + "epoch": 2.72, + "grad_norm": 4.763407786139578, + "learning_rate": 2.6140580399183824e-07, + "loss": 0.308, + "step": 48290 + }, + { + "epoch": 2.72, + "grad_norm": 4.506694333749776, + "learning_rate": 2.6088303909611866e-07, + "loss": 0.2847, + "step": 48295 + }, + { + "epoch": 2.72, + "grad_norm": 4.439390995243577, + "learning_rate": 2.603607834394312e-07, + "loss": 0.2956, + "step": 48300 + }, + { + "epoch": 2.72, + "grad_norm": 5.328194427436979, + "learning_rate": 2.5983903707789583e-07, + "loss": 0.3656, + "step": 48305 + }, + { + "epoch": 2.72, + "grad_norm": 4.788156999472909, + "learning_rate": 2.593178000675761e-07, + "loss": 0.315, + "step": 48310 + }, + { + "epoch": 2.72, + "grad_norm": 4.622047510705419, + "learning_rate": 2.5879707246448003e-07, + "loss": 0.3451, + "step": 48315 + }, + { + "epoch": 2.72, + "grad_norm": 4.675275570473539, + "learning_rate": 2.5827685432456394e-07, + "loss": 0.3276, + "step": 48320 + }, + { + "epoch": 2.72, + "grad_norm": 4.65125097104725, + "learning_rate": 2.577571457037259e-07, + "loss": 0.3243, + "step": 48325 + }, + { + "epoch": 2.72, + "grad_norm": 4.543791125320193, + "learning_rate": 2.572379466578123e-07, + "loss": 0.3808, + "step": 48330 + }, + { + "epoch": 2.72, + "grad_norm": 4.341357498998987, + "learning_rate": 2.5671925724261193e-07, + "loss": 0.3199, + "step": 48335 + }, + { + "epoch": 2.72, + "grad_norm": 5.640376502483177, + "learning_rate": 2.562010775138618e-07, + "loss": 0.2978, + "step": 48340 + }, + { + "epoch": 2.72, + "grad_norm": 4.520983776948888, + "learning_rate": 2.5568340752724185e-07, + "loss": 0.3284, + "step": 48345 + }, + { + "epoch": 2.72, + "grad_norm": 4.565842464269002, + "learning_rate": 2.5516624733837756e-07, + "loss": 0.3174, + "step": 48350 + }, + { + "epoch": 2.72, + "grad_norm": 4.199199793015165, + "learning_rate": 2.546495970028412e-07, + "loss": 0.3123, + "step": 48355 + }, + { + "epoch": 2.72, + "grad_norm": 4.0215539304949495, + "learning_rate": 2.5413345657614774e-07, + "loss": 0.3042, + "step": 48360 + }, + { + "epoch": 2.73, + "grad_norm": 4.293481420370327, + "learning_rate": 2.536178261137601e-07, + "loss": 0.3513, + "step": 48365 + }, + { + "epoch": 2.73, + "grad_norm": 5.107031913559453, + "learning_rate": 2.5310270567108385e-07, + "loss": 0.3289, + "step": 48370 + }, + { + "epoch": 2.73, + "grad_norm": 4.21557193488729, + "learning_rate": 2.5258809530347204e-07, + "loss": 0.2998, + "step": 48375 + }, + { + "epoch": 2.73, + "grad_norm": 4.784881851311318, + "learning_rate": 2.52073995066221e-07, + "loss": 0.3874, + "step": 48380 + }, + { + "epoch": 2.73, + "grad_norm": 5.177736110934562, + "learning_rate": 2.5156040501457256e-07, + "loss": 0.3371, + "step": 48385 + }, + { + "epoch": 2.73, + "grad_norm": 4.341872033030984, + "learning_rate": 2.5104732520371656e-07, + "loss": 0.337, + "step": 48390 + }, + { + "epoch": 2.73, + "grad_norm": 4.691009906416703, + "learning_rate": 2.5053475568878384e-07, + "loss": 0.3476, + "step": 48395 + }, + { + "epoch": 2.73, + "grad_norm": 4.9428860993574215, + "learning_rate": 2.50022696524852e-07, + "loss": 0.3407, + "step": 48400 + }, + { + "epoch": 2.73, + "grad_norm": 4.584417431322471, + "learning_rate": 2.495111477669454e-07, + "loss": 0.3304, + "step": 48405 + }, + { + "epoch": 2.73, + "grad_norm": 4.607442312863722, + "learning_rate": 2.4900010947003117e-07, + "loss": 0.3314, + "step": 48410 + }, + { + "epoch": 2.73, + "grad_norm": 4.505455187839345, + "learning_rate": 2.4848958168902317e-07, + "loss": 0.3172, + "step": 48415 + }, + { + "epoch": 2.73, + "grad_norm": 5.054524398874461, + "learning_rate": 2.479795644787786e-07, + "loss": 0.3252, + "step": 48420 + }, + { + "epoch": 2.73, + "grad_norm": 4.75513974897599, + "learning_rate": 2.474700578941036e-07, + "loss": 0.297, + "step": 48425 + }, + { + "epoch": 2.73, + "grad_norm": 4.251761658206857, + "learning_rate": 2.46961061989745e-07, + "loss": 0.3399, + "step": 48430 + }, + { + "epoch": 2.73, + "grad_norm": 5.022015028529235, + "learning_rate": 2.4645257682039626e-07, + "loss": 0.3622, + "step": 48435 + }, + { + "epoch": 2.73, + "grad_norm": 4.2767440878294325, + "learning_rate": 2.459446024406975e-07, + "loss": 0.329, + "step": 48440 + }, + { + "epoch": 2.73, + "grad_norm": 4.817468601166096, + "learning_rate": 2.4543713890523237e-07, + "loss": 0.3575, + "step": 48445 + }, + { + "epoch": 2.73, + "grad_norm": 4.741023448168125, + "learning_rate": 2.449301862685305e-07, + "loss": 0.3118, + "step": 48450 + }, + { + "epoch": 2.73, + "grad_norm": 4.611835086952191, + "learning_rate": 2.4442374458506613e-07, + "loss": 0.3131, + "step": 48455 + }, + { + "epoch": 2.73, + "grad_norm": 4.326487496709411, + "learning_rate": 2.439178139092574e-07, + "loss": 0.2935, + "step": 48460 + }, + { + "epoch": 2.73, + "grad_norm": 4.878973820550783, + "learning_rate": 2.434123942954703e-07, + "loss": 0.3207, + "step": 48465 + }, + { + "epoch": 2.73, + "grad_norm": 4.719651152630841, + "learning_rate": 2.42907485798014e-07, + "loss": 0.319, + "step": 48470 + }, + { + "epoch": 2.73, + "grad_norm": 5.083885644675477, + "learning_rate": 2.424030884711426e-07, + "loss": 0.3317, + "step": 48475 + }, + { + "epoch": 2.73, + "grad_norm": 6.191228762060517, + "learning_rate": 2.418992023690564e-07, + "loss": 0.3175, + "step": 48480 + }, + { + "epoch": 2.73, + "grad_norm": 4.888713359558324, + "learning_rate": 2.4139582754589996e-07, + "loss": 0.3227, + "step": 48485 + }, + { + "epoch": 2.73, + "grad_norm": 3.9695097719879, + "learning_rate": 2.4089296405576334e-07, + "loss": 0.3097, + "step": 48490 + }, + { + "epoch": 2.73, + "grad_norm": 4.831692727842725, + "learning_rate": 2.403906119526811e-07, + "loss": 0.324, + "step": 48495 + }, + { + "epoch": 2.73, + "grad_norm": 4.769783229227645, + "learning_rate": 2.3988877129063336e-07, + "loss": 0.3347, + "step": 48500 + }, + { + "epoch": 2.73, + "grad_norm": 4.395280734211998, + "learning_rate": 2.393874421235448e-07, + "loss": 0.3117, + "step": 48505 + }, + { + "epoch": 2.73, + "grad_norm": 5.613298053689998, + "learning_rate": 2.388866245052868e-07, + "loss": 0.3316, + "step": 48510 + }, + { + "epoch": 2.73, + "grad_norm": 5.291076970437595, + "learning_rate": 2.3838631848967286e-07, + "loss": 0.3143, + "step": 48515 + }, + { + "epoch": 2.73, + "grad_norm": 5.117887596077893, + "learning_rate": 2.3788652413046341e-07, + "loss": 0.3072, + "step": 48520 + }, + { + "epoch": 2.73, + "grad_norm": 4.382365807835714, + "learning_rate": 2.3738724148136383e-07, + "loss": 0.3198, + "step": 48525 + }, + { + "epoch": 2.73, + "grad_norm": 4.866468843543766, + "learning_rate": 2.3688847059602393e-07, + "loss": 0.3539, + "step": 48530 + }, + { + "epoch": 2.73, + "grad_norm": 4.515002010174925, + "learning_rate": 2.3639021152803977e-07, + "loss": 0.3472, + "step": 48535 + }, + { + "epoch": 2.73, + "grad_norm": 5.201456980862455, + "learning_rate": 2.358924643309507e-07, + "loss": 0.3177, + "step": 48540 + }, + { + "epoch": 2.74, + "grad_norm": 4.648208394085241, + "learning_rate": 2.353952290582412e-07, + "loss": 0.3321, + "step": 48545 + }, + { + "epoch": 2.74, + "grad_norm": 4.358863393218217, + "learning_rate": 2.3489850576334294e-07, + "loss": 0.2982, + "step": 48550 + }, + { + "epoch": 2.74, + "grad_norm": 5.173404261867256, + "learning_rate": 2.3440229449962936e-07, + "loss": 0.3139, + "step": 48555 + }, + { + "epoch": 2.74, + "grad_norm": 4.362149370807334, + "learning_rate": 2.339065953204217e-07, + "loss": 0.3312, + "step": 48560 + }, + { + "epoch": 2.74, + "grad_norm": 4.400072583425098, + "learning_rate": 2.3341140827898512e-07, + "loss": 0.348, + "step": 48565 + }, + { + "epoch": 2.74, + "grad_norm": 4.604628847225456, + "learning_rate": 2.3291673342852872e-07, + "loss": 0.3405, + "step": 48570 + }, + { + "epoch": 2.74, + "grad_norm": 4.34353342388737, + "learning_rate": 2.3242257082220886e-07, + "loss": 0.3149, + "step": 48575 + }, + { + "epoch": 2.74, + "grad_norm": 4.279829571817658, + "learning_rate": 2.3192892051312365e-07, + "loss": 0.3226, + "step": 48580 + }, + { + "epoch": 2.74, + "grad_norm": 4.48094737447629, + "learning_rate": 2.3143578255432008e-07, + "loss": 0.3033, + "step": 48585 + }, + { + "epoch": 2.74, + "grad_norm": 4.294683799143649, + "learning_rate": 2.3094315699878633e-07, + "loss": 0.3142, + "step": 48590 + }, + { + "epoch": 2.74, + "grad_norm": 4.722491596409391, + "learning_rate": 2.3045104389945838e-07, + "loss": 0.2974, + "step": 48595 + }, + { + "epoch": 2.74, + "grad_norm": 4.256774612443694, + "learning_rate": 2.2995944330921504e-07, + "loss": 0.3026, + "step": 48600 + }, + { + "epoch": 2.74, + "grad_norm": 4.231845486959496, + "learning_rate": 2.294683552808813e-07, + "loss": 0.3165, + "step": 48605 + }, + { + "epoch": 2.74, + "grad_norm": 5.10781775538336, + "learning_rate": 2.2897777986722713e-07, + "loss": 0.3125, + "step": 48610 + }, + { + "epoch": 2.74, + "grad_norm": 4.513689272308508, + "learning_rate": 2.284877171209654e-07, + "loss": 0.3132, + "step": 48615 + }, + { + "epoch": 2.74, + "grad_norm": 4.935012379172134, + "learning_rate": 2.2799816709475842e-07, + "loss": 0.3079, + "step": 48620 + }, + { + "epoch": 2.74, + "grad_norm": 5.069317755187218, + "learning_rate": 2.2750912984120798e-07, + "loss": 0.3369, + "step": 48625 + }, + { + "epoch": 2.74, + "grad_norm": 4.479800405668751, + "learning_rate": 2.2702060541286374e-07, + "loss": 0.3139, + "step": 48630 + }, + { + "epoch": 2.74, + "grad_norm": 4.91373517562012, + "learning_rate": 2.2653259386222092e-07, + "loss": 0.317, + "step": 48635 + }, + { + "epoch": 2.74, + "grad_norm": 5.191209904819061, + "learning_rate": 2.260450952417176e-07, + "loss": 0.3472, + "step": 48640 + }, + { + "epoch": 2.74, + "grad_norm": 4.738003488603212, + "learning_rate": 2.255581096037379e-07, + "loss": 0.3352, + "step": 48645 + }, + { + "epoch": 2.74, + "grad_norm": 4.462488744125328, + "learning_rate": 2.2507163700061007e-07, + "loss": 0.3288, + "step": 48650 + }, + { + "epoch": 2.74, + "grad_norm": 4.460285896466044, + "learning_rate": 2.2458567748460834e-07, + "loss": 0.3027, + "step": 48655 + }, + { + "epoch": 2.74, + "grad_norm": 4.835187356687008, + "learning_rate": 2.2410023110795154e-07, + "loss": 0.3217, + "step": 48660 + }, + { + "epoch": 2.74, + "grad_norm": 4.4936032920551465, + "learning_rate": 2.2361529792280235e-07, + "loss": 0.3171, + "step": 48665 + }, + { + "epoch": 2.74, + "grad_norm": 4.279379016726062, + "learning_rate": 2.2313087798126908e-07, + "loss": 0.2975, + "step": 48670 + }, + { + "epoch": 2.74, + "grad_norm": 5.173206576804991, + "learning_rate": 2.2264697133540514e-07, + "loss": 0.3115, + "step": 48675 + }, + { + "epoch": 2.74, + "grad_norm": 5.264923126794077, + "learning_rate": 2.2216357803720834e-07, + "loss": 0.3162, + "step": 48680 + }, + { + "epoch": 2.74, + "grad_norm": 4.154763208221577, + "learning_rate": 2.2168069813862102e-07, + "loss": 0.3072, + "step": 48685 + }, + { + "epoch": 2.74, + "grad_norm": 7.52416814448695, + "learning_rate": 2.211983316915306e-07, + "loss": 0.3169, + "step": 48690 + }, + { + "epoch": 2.74, + "grad_norm": 4.8239161198660465, + "learning_rate": 2.2071647874777058e-07, + "loss": 0.3462, + "step": 48695 + }, + { + "epoch": 2.74, + "grad_norm": 4.743538371289594, + "learning_rate": 2.2023513935911734e-07, + "loss": 0.3607, + "step": 48700 + }, + { + "epoch": 2.74, + "grad_norm": 4.349279132536488, + "learning_rate": 2.197543135772928e-07, + "loss": 0.3328, + "step": 48705 + }, + { + "epoch": 2.74, + "grad_norm": 4.75414314900954, + "learning_rate": 2.192740014539646e-07, + "loss": 0.3145, + "step": 48710 + }, + { + "epoch": 2.74, + "grad_norm": 5.228703320654876, + "learning_rate": 2.1879420304074362e-07, + "loss": 0.3201, + "step": 48715 + }, + { + "epoch": 2.75, + "grad_norm": 4.063793745033728, + "learning_rate": 2.1831491838918694e-07, + "loss": 0.2697, + "step": 48720 + }, + { + "epoch": 2.75, + "grad_norm": 4.536008369701413, + "learning_rate": 2.1783614755079453e-07, + "loss": 0.3163, + "step": 48725 + }, + { + "epoch": 2.75, + "grad_norm": 4.980949776874142, + "learning_rate": 2.1735789057701407e-07, + "loss": 0.3169, + "step": 48730 + }, + { + "epoch": 2.75, + "grad_norm": 4.498869917530692, + "learning_rate": 2.168801475192356e-07, + "loss": 0.3127, + "step": 48735 + }, + { + "epoch": 2.75, + "grad_norm": 4.935501703167351, + "learning_rate": 2.1640291842879468e-07, + "loss": 0.328, + "step": 48740 + }, + { + "epoch": 2.75, + "grad_norm": 5.179056348410177, + "learning_rate": 2.1592620335697144e-07, + "loss": 0.3088, + "step": 48745 + }, + { + "epoch": 2.75, + "grad_norm": 4.170036214389213, + "learning_rate": 2.1545000235499158e-07, + "loss": 0.3049, + "step": 48750 + }, + { + "epoch": 2.75, + "grad_norm": 4.957812299908879, + "learning_rate": 2.1497431547402524e-07, + "loss": 0.2965, + "step": 48755 + }, + { + "epoch": 2.75, + "grad_norm": 5.052972801970208, + "learning_rate": 2.14499142765186e-07, + "loss": 0.2887, + "step": 48760 + }, + { + "epoch": 2.75, + "grad_norm": 5.790161194528763, + "learning_rate": 2.140244842795347e-07, + "loss": 0.3333, + "step": 48765 + }, + { + "epoch": 2.75, + "grad_norm": 4.821468909260666, + "learning_rate": 2.135503400680744e-07, + "loss": 0.2937, + "step": 48770 + }, + { + "epoch": 2.75, + "grad_norm": 4.477710390220459, + "learning_rate": 2.130767101817538e-07, + "loss": 0.304, + "step": 48775 + }, + { + "epoch": 2.75, + "grad_norm": 3.8026257383268662, + "learning_rate": 2.1260359467146774e-07, + "loss": 0.2992, + "step": 48780 + }, + { + "epoch": 2.75, + "grad_norm": 5.476622510171095, + "learning_rate": 2.121309935880528e-07, + "loss": 0.3191, + "step": 48785 + }, + { + "epoch": 2.75, + "grad_norm": 4.753207872192619, + "learning_rate": 2.116589069822944e-07, + "loss": 0.3132, + "step": 48790 + }, + { + "epoch": 2.75, + "grad_norm": 4.326528265772871, + "learning_rate": 2.1118733490491816e-07, + "loss": 0.3121, + "step": 48795 + }, + { + "epoch": 2.75, + "grad_norm": 4.452515476611273, + "learning_rate": 2.1071627740659738e-07, + "loss": 0.2989, + "step": 48800 + }, + { + "epoch": 2.75, + "grad_norm": 5.090399182091981, + "learning_rate": 2.1024573453794994e-07, + "loss": 0.3416, + "step": 48805 + }, + { + "epoch": 2.75, + "grad_norm": 4.486045533116705, + "learning_rate": 2.097757063495365e-07, + "loss": 0.3273, + "step": 48810 + }, + { + "epoch": 2.75, + "grad_norm": 4.786701399162015, + "learning_rate": 2.0930619289186438e-07, + "loss": 0.3166, + "step": 48815 + }, + { + "epoch": 2.75, + "grad_norm": 4.578007534203883, + "learning_rate": 2.0883719421538496e-07, + "loss": 0.3507, + "step": 48820 + }, + { + "epoch": 2.75, + "grad_norm": 4.6778620259830825, + "learning_rate": 2.08368710370494e-07, + "loss": 0.3054, + "step": 48825 + }, + { + "epoch": 2.75, + "grad_norm": 4.672715960745678, + "learning_rate": 2.0790074140753181e-07, + "loss": 0.3208, + "step": 48830 + }, + { + "epoch": 2.75, + "grad_norm": 4.316246102513153, + "learning_rate": 2.074332873767848e-07, + "loss": 0.2947, + "step": 48835 + }, + { + "epoch": 2.75, + "grad_norm": 4.057774998230994, + "learning_rate": 2.0696634832848117e-07, + "loss": 0.3299, + "step": 48840 + }, + { + "epoch": 2.75, + "grad_norm": 4.789371789010931, + "learning_rate": 2.0649992431279743e-07, + "loss": 0.3352, + "step": 48845 + }, + { + "epoch": 2.75, + "grad_norm": 5.0122057332275025, + "learning_rate": 2.0603401537985235e-07, + "loss": 0.3315, + "step": 48850 + }, + { + "epoch": 2.75, + "grad_norm": 5.324040746124454, + "learning_rate": 2.055686215797098e-07, + "loss": 0.3209, + "step": 48855 + }, + { + "epoch": 2.75, + "grad_norm": 4.923613594591983, + "learning_rate": 2.0510374296237755e-07, + "loss": 0.3089, + "step": 48860 + }, + { + "epoch": 2.75, + "grad_norm": 4.495983971185289, + "learning_rate": 2.0463937957781065e-07, + "loss": 0.3005, + "step": 48865 + }, + { + "epoch": 2.75, + "grad_norm": 4.800563641183687, + "learning_rate": 2.0417553147590475e-07, + "loss": 0.3153, + "step": 48870 + }, + { + "epoch": 2.75, + "grad_norm": 4.527806384209996, + "learning_rate": 2.0371219870650438e-07, + "loss": 0.3016, + "step": 48875 + }, + { + "epoch": 2.75, + "grad_norm": 4.4945441170043114, + "learning_rate": 2.0324938131939588e-07, + "loss": 0.3009, + "step": 48880 + }, + { + "epoch": 2.75, + "grad_norm": 4.329729661208378, + "learning_rate": 2.0278707936431053e-07, + "loss": 0.3187, + "step": 48885 + }, + { + "epoch": 2.75, + "grad_norm": 4.839627579434529, + "learning_rate": 2.0232529289092583e-07, + "loss": 0.3242, + "step": 48890 + }, + { + "epoch": 2.75, + "grad_norm": 4.548891180838907, + "learning_rate": 2.0186402194886156e-07, + "loss": 0.3208, + "step": 48895 + }, + { + "epoch": 2.76, + "grad_norm": 4.540142339527456, + "learning_rate": 2.0140326658768473e-07, + "loss": 0.3337, + "step": 48900 + }, + { + "epoch": 2.76, + "grad_norm": 4.807465281063605, + "learning_rate": 2.0094302685690402e-07, + "loss": 0.3054, + "step": 48905 + }, + { + "epoch": 2.76, + "grad_norm": 5.38012020671858, + "learning_rate": 2.0048330280597495e-07, + "loss": 0.3184, + "step": 48910 + }, + { + "epoch": 2.76, + "grad_norm": 4.4830538938930244, + "learning_rate": 2.0002409448429738e-07, + "loss": 0.3248, + "step": 48915 + }, + { + "epoch": 2.76, + "grad_norm": 4.261476325984831, + "learning_rate": 1.995654019412141e-07, + "loss": 0.3565, + "step": 48920 + }, + { + "epoch": 2.76, + "grad_norm": 4.579978223742383, + "learning_rate": 1.9910722522601455e-07, + "loss": 0.3226, + "step": 48925 + }, + { + "epoch": 2.76, + "grad_norm": 4.892672366414126, + "learning_rate": 1.9864956438793215e-07, + "loss": 0.3424, + "step": 48930 + }, + { + "epoch": 2.76, + "grad_norm": 4.572617588825795, + "learning_rate": 1.981924194761431e-07, + "loss": 0.3241, + "step": 48935 + }, + { + "epoch": 2.76, + "grad_norm": 4.561838938066252, + "learning_rate": 1.9773579053977143e-07, + "loss": 0.3191, + "step": 48940 + }, + { + "epoch": 2.76, + "grad_norm": 4.667196042380077, + "learning_rate": 1.9727967762788236e-07, + "loss": 0.2989, + "step": 48945 + }, + { + "epoch": 2.76, + "grad_norm": 4.180949793823962, + "learning_rate": 1.9682408078948835e-07, + "loss": 0.3102, + "step": 48950 + }, + { + "epoch": 2.76, + "grad_norm": 4.6479624764642145, + "learning_rate": 1.9636900007354464e-07, + "loss": 0.3413, + "step": 48955 + }, + { + "epoch": 2.76, + "grad_norm": 4.571997812211011, + "learning_rate": 1.9591443552895273e-07, + "loss": 0.3337, + "step": 48960 + }, + { + "epoch": 2.76, + "grad_norm": 4.376117436048435, + "learning_rate": 1.954603872045563e-07, + "loss": 0.2789, + "step": 48965 + }, + { + "epoch": 2.76, + "grad_norm": 4.540570814512873, + "learning_rate": 1.9500685514914463e-07, + "loss": 0.3183, + "step": 48970 + }, + { + "epoch": 2.76, + "grad_norm": 4.778227282715019, + "learning_rate": 1.9455383941145322e-07, + "loss": 0.3109, + "step": 48975 + }, + { + "epoch": 2.76, + "grad_norm": 5.472267768248948, + "learning_rate": 1.9410134004015925e-07, + "loss": 0.3048, + "step": 48980 + }, + { + "epoch": 2.76, + "grad_norm": 5.253835751740025, + "learning_rate": 1.936493570838871e-07, + "loss": 0.3057, + "step": 48985 + }, + { + "epoch": 2.76, + "grad_norm": 5.312471739582353, + "learning_rate": 1.9319789059120298e-07, + "loss": 0.3459, + "step": 48990 + }, + { + "epoch": 2.76, + "grad_norm": 4.629782722527147, + "learning_rate": 1.9274694061061972e-07, + "loss": 0.3007, + "step": 48995 + }, + { + "epoch": 2.76, + "grad_norm": 4.44227520736774, + "learning_rate": 1.9229650719059412e-07, + "loss": 0.2945, + "step": 49000 + }, + { + "epoch": 2.76, + "grad_norm": 4.620035652646511, + "learning_rate": 1.918465903795258e-07, + "loss": 0.331, + "step": 49005 + }, + { + "epoch": 2.76, + "grad_norm": 4.5860921186999954, + "learning_rate": 1.913971902257622e-07, + "loss": 0.3342, + "step": 49010 + }, + { + "epoch": 2.76, + "grad_norm": 4.6381365769760965, + "learning_rate": 1.909483067775919e-07, + "loss": 0.2948, + "step": 49015 + }, + { + "epoch": 2.76, + "grad_norm": 4.669053813092479, + "learning_rate": 1.9049994008325022e-07, + "loss": 0.3256, + "step": 49020 + }, + { + "epoch": 2.76, + "grad_norm": 5.011530251180718, + "learning_rate": 1.900520901909164e-07, + "loss": 0.2778, + "step": 49025 + }, + { + "epoch": 2.76, + "grad_norm": 4.766175203791341, + "learning_rate": 1.8960475714871252e-07, + "loss": 0.3287, + "step": 49030 + }, + { + "epoch": 2.76, + "grad_norm": 4.432442576871143, + "learning_rate": 1.8915794100470842e-07, + "loss": 0.3115, + "step": 49035 + }, + { + "epoch": 2.76, + "grad_norm": 4.799932467394133, + "learning_rate": 1.8871164180691458e-07, + "loss": 0.3475, + "step": 49040 + }, + { + "epoch": 2.76, + "grad_norm": 4.0604929545214405, + "learning_rate": 1.8826585960328936e-07, + "loss": 0.3452, + "step": 49045 + }, + { + "epoch": 2.76, + "grad_norm": 4.67532811778708, + "learning_rate": 1.8782059444173327e-07, + "loss": 0.3077, + "step": 49050 + }, + { + "epoch": 2.76, + "grad_norm": 4.018812750614052, + "learning_rate": 1.8737584637009143e-07, + "loss": 0.3176, + "step": 49055 + }, + { + "epoch": 2.76, + "grad_norm": 4.718907937700562, + "learning_rate": 1.86931615436155e-07, + "loss": 0.3308, + "step": 49060 + }, + { + "epoch": 2.76, + "grad_norm": 4.215585333702029, + "learning_rate": 1.8648790168765807e-07, + "loss": 0.3195, + "step": 49065 + }, + { + "epoch": 2.76, + "grad_norm": 4.6316512024975225, + "learning_rate": 1.8604470517228024e-07, + "loss": 0.3455, + "step": 49070 + }, + { + "epoch": 2.77, + "grad_norm": 41.47359500203935, + "learning_rate": 1.8560202593764343e-07, + "loss": 0.3519, + "step": 49075 + }, + { + "epoch": 2.77, + "grad_norm": 5.400187130489294, + "learning_rate": 1.8515986403131736e-07, + "loss": 0.3255, + "step": 49080 + }, + { + "epoch": 2.77, + "grad_norm": 4.302854837180305, + "learning_rate": 1.8471821950081347e-07, + "loss": 0.2863, + "step": 49085 + }, + { + "epoch": 2.77, + "grad_norm": 4.912319645572431, + "learning_rate": 1.8427709239358772e-07, + "loss": 0.3328, + "step": 49090 + }, + { + "epoch": 2.77, + "grad_norm": 4.7087066202501315, + "learning_rate": 1.8383648275704214e-07, + "loss": 0.3128, + "step": 49095 + }, + { + "epoch": 2.77, + "grad_norm": 5.029776857488416, + "learning_rate": 1.8339639063852165e-07, + "loss": 0.2887, + "step": 49100 + }, + { + "epoch": 2.77, + "grad_norm": 4.40376162963985, + "learning_rate": 1.829568160853168e-07, + "loss": 0.3137, + "step": 49105 + }, + { + "epoch": 2.77, + "grad_norm": 4.992769460417173, + "learning_rate": 1.8251775914466086e-07, + "loss": 0.3242, + "step": 49110 + }, + { + "epoch": 2.77, + "grad_norm": 4.448704637868842, + "learning_rate": 1.8207921986373223e-07, + "loss": 0.3034, + "step": 49115 + }, + { + "epoch": 2.77, + "grad_norm": 5.127651118463407, + "learning_rate": 1.8164119828965542e-07, + "loss": 0.3707, + "step": 49120 + }, + { + "epoch": 2.77, + "grad_norm": 4.571870026758502, + "learning_rate": 1.812036944694967e-07, + "loss": 0.3391, + "step": 49125 + }, + { + "epoch": 2.77, + "grad_norm": 4.023194414765683, + "learning_rate": 1.8076670845026844e-07, + "loss": 0.2649, + "step": 49130 + }, + { + "epoch": 2.77, + "grad_norm": 4.794355404408927, + "learning_rate": 1.8033024027892588e-07, + "loss": 0.3236, + "step": 49135 + }, + { + "epoch": 2.77, + "grad_norm": 4.447515597431007, + "learning_rate": 1.7989429000236923e-07, + "loss": 0.3395, + "step": 49140 + }, + { + "epoch": 2.77, + "grad_norm": 5.053976987911622, + "learning_rate": 1.7945885766744497e-07, + "loss": 0.3379, + "step": 49145 + }, + { + "epoch": 2.77, + "grad_norm": 5.2742981381202245, + "learning_rate": 1.7902394332094008e-07, + "loss": 0.3287, + "step": 49150 + }, + { + "epoch": 2.77, + "grad_norm": 4.835210739452109, + "learning_rate": 1.7858954700958996e-07, + "loss": 0.3457, + "step": 49155 + }, + { + "epoch": 2.77, + "grad_norm": 4.215503775241933, + "learning_rate": 1.7815566878007064e-07, + "loss": 0.3217, + "step": 49160 + }, + { + "epoch": 2.77, + "grad_norm": 4.584841483240674, + "learning_rate": 1.7772230867900587e-07, + "loss": 0.3237, + "step": 49165 + }, + { + "epoch": 2.77, + "grad_norm": 4.654602953852498, + "learning_rate": 1.7728946675296122e-07, + "loss": 0.3368, + "step": 49170 + }, + { + "epoch": 2.77, + "grad_norm": 4.831925840461089, + "learning_rate": 1.7685714304844726e-07, + "loss": 0.2665, + "step": 49175 + }, + { + "epoch": 2.77, + "grad_norm": 4.466258043059292, + "learning_rate": 1.7642533761192016e-07, + "loss": 0.3556, + "step": 49180 + }, + { + "epoch": 2.77, + "grad_norm": 4.825888510969101, + "learning_rate": 1.759940504897778e-07, + "loss": 0.3107, + "step": 49185 + }, + { + "epoch": 2.77, + "grad_norm": 4.050732475009098, + "learning_rate": 1.7556328172836533e-07, + "loss": 0.2989, + "step": 49190 + }, + { + "epoch": 2.77, + "grad_norm": 4.385419000452627, + "learning_rate": 1.7513303137396909e-07, + "loss": 0.3139, + "step": 49195 + }, + { + "epoch": 2.77, + "grad_norm": 4.219793906782917, + "learning_rate": 1.7470329947282372e-07, + "loss": 0.3046, + "step": 49200 + }, + { + "epoch": 2.77, + "grad_norm": 4.721609037698076, + "learning_rate": 1.742740860711034e-07, + "loss": 0.2973, + "step": 49205 + }, + { + "epoch": 2.77, + "grad_norm": 4.771189694919438, + "learning_rate": 1.7384539121493016e-07, + "loss": 0.3297, + "step": 49210 + }, + { + "epoch": 2.77, + "grad_norm": 4.635735068267216, + "learning_rate": 1.7341721495036933e-07, + "loss": 0.33, + "step": 49215 + }, + { + "epoch": 2.77, + "grad_norm": 4.438209290682146, + "learning_rate": 1.7298955732343082e-07, + "loss": 0.2988, + "step": 49220 + }, + { + "epoch": 2.77, + "grad_norm": 4.483691036687797, + "learning_rate": 1.7256241838006672e-07, + "loss": 0.334, + "step": 49225 + }, + { + "epoch": 2.77, + "grad_norm": 5.242663895481422, + "learning_rate": 1.7213579816617588e-07, + "loss": 0.3316, + "step": 49230 + }, + { + "epoch": 2.77, + "grad_norm": 4.365577246093222, + "learning_rate": 1.717096967276005e-07, + "loss": 0.3115, + "step": 49235 + }, + { + "epoch": 2.77, + "grad_norm": 4.4489874026179255, + "learning_rate": 1.7128411411012792e-07, + "loss": 0.3319, + "step": 49240 + }, + { + "epoch": 2.77, + "grad_norm": 4.4687936706397196, + "learning_rate": 1.7085905035948703e-07, + "loss": 0.324, + "step": 49245 + }, + { + "epoch": 2.77, + "grad_norm": 3.92941646054833, + "learning_rate": 1.704345055213541e-07, + "loss": 0.2879, + "step": 49250 + }, + { + "epoch": 2.78, + "grad_norm": 4.5929394896555715, + "learning_rate": 1.7001047964134875e-07, + "loss": 0.3413, + "step": 49255 + }, + { + "epoch": 2.78, + "grad_norm": 4.680853375632767, + "learning_rate": 1.6958697276503288e-07, + "loss": 0.2759, + "step": 49260 + }, + { + "epoch": 2.78, + "grad_norm": 5.380558261041858, + "learning_rate": 1.6916398493791507e-07, + "loss": 0.308, + "step": 49265 + }, + { + "epoch": 2.78, + "grad_norm": 4.27707596732731, + "learning_rate": 1.6874151620544732e-07, + "loss": 0.3338, + "step": 49270 + }, + { + "epoch": 2.78, + "grad_norm": 4.861723308646287, + "learning_rate": 1.6831956661302607e-07, + "loss": 0.2765, + "step": 49275 + }, + { + "epoch": 2.78, + "grad_norm": 4.851889775601613, + "learning_rate": 1.6789813620599116e-07, + "loss": 0.3043, + "step": 49280 + }, + { + "epoch": 2.78, + "grad_norm": 4.758076018914188, + "learning_rate": 1.674772250296264e-07, + "loss": 0.3296, + "step": 49285 + }, + { + "epoch": 2.78, + "grad_norm": 4.495918493541164, + "learning_rate": 1.6705683312916166e-07, + "loss": 0.309, + "step": 49290 + }, + { + "epoch": 2.78, + "grad_norm": 4.611089598335909, + "learning_rate": 1.6663696054976975e-07, + "loss": 0.3084, + "step": 49295 + }, + { + "epoch": 2.78, + "grad_norm": 5.339235409887899, + "learning_rate": 1.6621760733656734e-07, + "loss": 0.3135, + "step": 49300 + }, + { + "epoch": 2.78, + "grad_norm": 4.342331312679037, + "learning_rate": 1.6579877353461616e-07, + "loss": 0.3142, + "step": 49305 + }, + { + "epoch": 2.78, + "grad_norm": 4.24058743942037, + "learning_rate": 1.6538045918892243e-07, + "loss": 0.3111, + "step": 49310 + }, + { + "epoch": 2.78, + "grad_norm": 4.938797914048927, + "learning_rate": 1.649626643444352e-07, + "loss": 0.3406, + "step": 49315 + }, + { + "epoch": 2.78, + "grad_norm": 4.456487517168583, + "learning_rate": 1.6454538904604744e-07, + "loss": 0.3171, + "step": 49320 + }, + { + "epoch": 2.78, + "grad_norm": 4.485593420063372, + "learning_rate": 1.6412863333859885e-07, + "loss": 0.3154, + "step": 49325 + }, + { + "epoch": 2.78, + "grad_norm": 4.279469998994342, + "learning_rate": 1.6371239726687082e-07, + "loss": 0.3019, + "step": 49330 + }, + { + "epoch": 2.78, + "grad_norm": 4.258712483394506, + "learning_rate": 1.632966808755898e-07, + "loss": 0.2986, + "step": 49335 + }, + { + "epoch": 2.78, + "grad_norm": 4.3646259616186995, + "learning_rate": 1.6288148420942672e-07, + "loss": 0.313, + "step": 49340 + }, + { + "epoch": 2.78, + "grad_norm": 4.689260127423614, + "learning_rate": 1.6246680731299536e-07, + "loss": 0.3125, + "step": 49345 + }, + { + "epoch": 2.78, + "grad_norm": 4.383301993976913, + "learning_rate": 1.620526502308556e-07, + "loss": 0.3004, + "step": 49350 + }, + { + "epoch": 2.78, + "grad_norm": 5.503178423014603, + "learning_rate": 1.616390130075096e-07, + "loss": 0.3835, + "step": 49355 + }, + { + "epoch": 2.78, + "grad_norm": 4.374325781219972, + "learning_rate": 1.6122589568740576e-07, + "loss": 0.3118, + "step": 49360 + }, + { + "epoch": 2.78, + "grad_norm": 4.4613099149958195, + "learning_rate": 1.608132983149341e-07, + "loss": 0.3093, + "step": 49365 + }, + { + "epoch": 2.78, + "grad_norm": 5.004622784835653, + "learning_rate": 1.6040122093443022e-07, + "loss": 0.3362, + "step": 49370 + }, + { + "epoch": 2.78, + "grad_norm": 4.956152039375732, + "learning_rate": 1.5998966359017488e-07, + "loss": 0.338, + "step": 49375 + }, + { + "epoch": 2.78, + "grad_norm": 4.690217694208695, + "learning_rate": 1.5957862632638987e-07, + "loss": 0.3198, + "step": 49380 + }, + { + "epoch": 2.78, + "grad_norm": 4.690957375560183, + "learning_rate": 1.591681091872438e-07, + "loss": 0.2817, + "step": 49385 + }, + { + "epoch": 2.78, + "grad_norm": 5.003496220778562, + "learning_rate": 1.5875811221684912e-07, + "loss": 0.3261, + "step": 49390 + }, + { + "epoch": 2.78, + "grad_norm": 4.545500669800517, + "learning_rate": 1.583486354592606e-07, + "loss": 0.2923, + "step": 49395 + }, + { + "epoch": 2.78, + "grad_norm": 4.203199731145849, + "learning_rate": 1.5793967895848027e-07, + "loss": 0.3127, + "step": 49400 + }, + { + "epoch": 2.78, + "grad_norm": 4.576779035060099, + "learning_rate": 1.5753124275844967e-07, + "loss": 0.2906, + "step": 49405 + }, + { + "epoch": 2.78, + "grad_norm": 5.00853311597256, + "learning_rate": 1.5712332690305977e-07, + "loss": 0.3344, + "step": 49410 + }, + { + "epoch": 2.78, + "grad_norm": 4.09984843274896, + "learning_rate": 1.567159314361405e-07, + "loss": 0.3177, + "step": 49415 + }, + { + "epoch": 2.78, + "grad_norm": 4.731019089443841, + "learning_rate": 1.563090564014702e-07, + "loss": 0.33, + "step": 49420 + }, + { + "epoch": 2.78, + "grad_norm": 4.699799571044952, + "learning_rate": 1.559027018427689e-07, + "loss": 0.3077, + "step": 49425 + }, + { + "epoch": 2.79, + "grad_norm": 4.915617021912783, + "learning_rate": 1.5549686780369999e-07, + "loss": 0.3241, + "step": 49430 + }, + { + "epoch": 2.79, + "grad_norm": 4.445358027809136, + "learning_rate": 1.5509155432787414e-07, + "loss": 0.3111, + "step": 49435 + }, + { + "epoch": 2.79, + "grad_norm": 4.8622196114555045, + "learning_rate": 1.5468676145884208e-07, + "loss": 0.3391, + "step": 49440 + }, + { + "epoch": 2.79, + "grad_norm": 4.794898668313322, + "learning_rate": 1.5428248924010182e-07, + "loss": 0.3349, + "step": 49445 + }, + { + "epoch": 2.79, + "grad_norm": 5.148367879965321, + "learning_rate": 1.538787377150941e-07, + "loss": 0.3473, + "step": 49450 + }, + { + "epoch": 2.79, + "grad_norm": 5.018720295060291, + "learning_rate": 1.5347550692720313e-07, + "loss": 0.3127, + "step": 49455 + }, + { + "epoch": 2.79, + "grad_norm": 5.126032955709537, + "learning_rate": 1.5307279691975873e-07, + "loss": 0.3122, + "step": 49460 + }, + { + "epoch": 2.79, + "grad_norm": 4.645274597456493, + "learning_rate": 1.5267060773603294e-07, + "loss": 0.3082, + "step": 49465 + }, + { + "epoch": 2.79, + "grad_norm": 4.539644192415214, + "learning_rate": 1.5226893941924392e-07, + "loss": 0.3386, + "step": 49470 + }, + { + "epoch": 2.79, + "grad_norm": 4.8818796766097154, + "learning_rate": 1.5186779201255108e-07, + "loss": 0.2992, + "step": 49475 + }, + { + "epoch": 2.79, + "grad_norm": 4.664261169459152, + "learning_rate": 1.514671655590605e-07, + "loss": 0.3001, + "step": 49480 + }, + { + "epoch": 2.79, + "grad_norm": 4.481285845733247, + "learning_rate": 1.5106706010182214e-07, + "loss": 0.3058, + "step": 49485 + }, + { + "epoch": 2.79, + "grad_norm": 4.316575255719252, + "learning_rate": 1.5066747568382723e-07, + "loss": 0.3043, + "step": 49490 + }, + { + "epoch": 2.79, + "grad_norm": 5.342556509589545, + "learning_rate": 1.502684123480147e-07, + "loss": 0.3385, + "step": 49495 + }, + { + "epoch": 2.79, + "grad_norm": 4.764862801595888, + "learning_rate": 1.4986987013726363e-07, + "loss": 0.319, + "step": 49500 + }, + { + "epoch": 2.79, + "grad_norm": 5.182945761708489, + "learning_rate": 1.4947184909440138e-07, + "loss": 0.341, + "step": 49505 + }, + { + "epoch": 2.79, + "grad_norm": 5.218815957627943, + "learning_rate": 1.4907434926219545e-07, + "loss": 0.3083, + "step": 49510 + }, + { + "epoch": 2.79, + "grad_norm": 4.512042894996303, + "learning_rate": 1.4867737068335941e-07, + "loss": 0.3451, + "step": 49515 + }, + { + "epoch": 2.79, + "grad_norm": 7.04880001576011, + "learning_rate": 1.4828091340055085e-07, + "loss": 0.3417, + "step": 49520 + }, + { + "epoch": 2.79, + "grad_norm": 3.874911424061834, + "learning_rate": 1.4788497745636955e-07, + "loss": 0.3032, + "step": 49525 + }, + { + "epoch": 2.79, + "grad_norm": 4.53295590430745, + "learning_rate": 1.4748956289336203e-07, + "loss": 0.2939, + "step": 49530 + }, + { + "epoch": 2.79, + "grad_norm": 4.210848335554472, + "learning_rate": 1.4709466975401708e-07, + "loss": 0.2776, + "step": 49535 + }, + { + "epoch": 2.79, + "grad_norm": 4.4851897229701, + "learning_rate": 1.4670029808076634e-07, + "loss": 0.3267, + "step": 49540 + }, + { + "epoch": 2.79, + "grad_norm": 4.5599165578640894, + "learning_rate": 1.463064479159887e-07, + "loss": 0.3307, + "step": 49545 + }, + { + "epoch": 2.79, + "grad_norm": 4.367003965898049, + "learning_rate": 1.459131193020036e-07, + "loss": 0.3101, + "step": 49550 + }, + { + "epoch": 2.79, + "grad_norm": 3.9377164708803516, + "learning_rate": 1.455203122810772e-07, + "loss": 0.2938, + "step": 49555 + }, + { + "epoch": 2.79, + "grad_norm": 4.61850378796758, + "learning_rate": 1.451280268954175e-07, + "loss": 0.3013, + "step": 49560 + }, + { + "epoch": 2.79, + "grad_norm": 4.324312135203147, + "learning_rate": 1.4473626318717737e-07, + "loss": 0.3012, + "step": 49565 + }, + { + "epoch": 2.79, + "grad_norm": 4.807004375756143, + "learning_rate": 1.4434502119845319e-07, + "loss": 0.2695, + "step": 49570 + }, + { + "epoch": 2.79, + "grad_norm": 4.920628767107733, + "learning_rate": 1.4395430097128627e-07, + "loss": 0.2959, + "step": 49575 + }, + { + "epoch": 2.79, + "grad_norm": 4.364254147624005, + "learning_rate": 1.4356410254766196e-07, + "loss": 0.3229, + "step": 49580 + }, + { + "epoch": 2.79, + "grad_norm": 5.088700187800069, + "learning_rate": 1.4317442596950726e-07, + "loss": 0.3166, + "step": 49585 + }, + { + "epoch": 2.79, + "grad_norm": 4.216262749120216, + "learning_rate": 1.4278527127869535e-07, + "loss": 0.3044, + "step": 49590 + }, + { + "epoch": 2.79, + "grad_norm": 5.014721378114283, + "learning_rate": 1.4239663851704277e-07, + "loss": 0.2915, + "step": 49595 + }, + { + "epoch": 2.79, + "grad_norm": 5.690237564984018, + "learning_rate": 1.4200852772630946e-07, + "loss": 0.3207, + "step": 49600 + }, + { + "epoch": 2.79, + "grad_norm": 4.391227977336331, + "learning_rate": 1.4162093894819984e-07, + "loss": 0.289, + "step": 49605 + }, + { + "epoch": 2.8, + "grad_norm": 4.051753165624551, + "learning_rate": 1.4123387222436168e-07, + "loss": 0.2867, + "step": 49610 + }, + { + "epoch": 2.8, + "grad_norm": 4.724103774928652, + "learning_rate": 1.4084732759638786e-07, + "loss": 0.2793, + "step": 49615 + }, + { + "epoch": 2.8, + "grad_norm": 4.668499657819339, + "learning_rate": 1.4046130510581403e-07, + "loss": 0.3097, + "step": 49620 + }, + { + "epoch": 2.8, + "grad_norm": 4.669216058746535, + "learning_rate": 1.4007580479411863e-07, + "loss": 0.3034, + "step": 49625 + }, + { + "epoch": 2.8, + "grad_norm": 5.937021637540202, + "learning_rate": 1.396908267027275e-07, + "loss": 0.3261, + "step": 49630 + }, + { + "epoch": 2.8, + "grad_norm": 4.467460570511647, + "learning_rate": 1.3930637087300635e-07, + "loss": 0.3193, + "step": 49635 + }, + { + "epoch": 2.8, + "grad_norm": 4.883384265978267, + "learning_rate": 1.3892243734626831e-07, + "loss": 0.3153, + "step": 49640 + }, + { + "epoch": 2.8, + "grad_norm": 4.8550942496126055, + "learning_rate": 1.3853902616376703e-07, + "loss": 0.3356, + "step": 49645 + }, + { + "epoch": 2.8, + "grad_norm": 4.3256939303010205, + "learning_rate": 1.3815613736670397e-07, + "loss": 0.3203, + "step": 49650 + }, + { + "epoch": 2.8, + "grad_norm": 5.881945932100836, + "learning_rate": 1.3777377099621958e-07, + "loss": 0.3214, + "step": 49655 + }, + { + "epoch": 2.8, + "grad_norm": 4.33756910057617, + "learning_rate": 1.3739192709340321e-07, + "loss": 0.3182, + "step": 49660 + }, + { + "epoch": 2.8, + "grad_norm": 4.679228348160198, + "learning_rate": 1.370106056992837e-07, + "loss": 0.3017, + "step": 49665 + }, + { + "epoch": 2.8, + "grad_norm": 5.820551716831073, + "learning_rate": 1.3662980685483662e-07, + "loss": 0.2874, + "step": 49670 + }, + { + "epoch": 2.8, + "grad_norm": 4.277810315998395, + "learning_rate": 1.3624953060098145e-07, + "loss": 0.3168, + "step": 49675 + }, + { + "epoch": 2.8, + "grad_norm": 5.091580304351212, + "learning_rate": 1.3586977697857882e-07, + "loss": 0.3188, + "step": 49680 + }, + { + "epoch": 2.8, + "grad_norm": 4.907577781622324, + "learning_rate": 1.3549054602843558e-07, + "loss": 0.3495, + "step": 49685 + }, + { + "epoch": 2.8, + "grad_norm": 4.410263128612622, + "learning_rate": 1.3511183779130243e-07, + "loss": 0.3316, + "step": 49690 + }, + { + "epoch": 2.8, + "grad_norm": 4.517350551085498, + "learning_rate": 1.347336523078724e-07, + "loss": 0.2913, + "step": 49695 + }, + { + "epoch": 2.8, + "grad_norm": 4.555476506318306, + "learning_rate": 1.3435598961878404e-07, + "loss": 0.3228, + "step": 49700 + }, + { + "epoch": 2.8, + "grad_norm": 4.866765603990417, + "learning_rate": 1.3397884976461716e-07, + "loss": 0.3171, + "step": 49705 + }, + { + "epoch": 2.8, + "grad_norm": 4.8702380224929875, + "learning_rate": 1.3360223278589935e-07, + "loss": 0.3126, + "step": 49710 + }, + { + "epoch": 2.8, + "grad_norm": 4.811413132990187, + "learning_rate": 1.3322613872309874e-07, + "loss": 0.2999, + "step": 49715 + }, + { + "epoch": 2.8, + "grad_norm": 5.271145464293548, + "learning_rate": 1.328505676166275e-07, + "loss": 0.3083, + "step": 49720 + }, + { + "epoch": 2.8, + "grad_norm": 4.5236074869459, + "learning_rate": 1.3247551950684334e-07, + "loss": 0.334, + "step": 49725 + }, + { + "epoch": 2.8, + "grad_norm": 4.34227982709005, + "learning_rate": 1.321009944340468e-07, + "loss": 0.3082, + "step": 49730 + }, + { + "epoch": 2.8, + "grad_norm": 4.538327848443762, + "learning_rate": 1.3172699243848174e-07, + "loss": 0.2933, + "step": 49735 + }, + { + "epoch": 2.8, + "grad_norm": 4.385324363870202, + "learning_rate": 1.313535135603372e-07, + "loss": 0.3179, + "step": 49740 + }, + { + "epoch": 2.8, + "grad_norm": 5.608675996383441, + "learning_rate": 1.309805578397444e-07, + "loss": 0.2889, + "step": 49745 + }, + { + "epoch": 2.8, + "grad_norm": 4.448251701218472, + "learning_rate": 1.3060812531677902e-07, + "loss": 0.3121, + "step": 49750 + }, + { + "epoch": 2.8, + "grad_norm": 4.425920239944105, + "learning_rate": 1.302362160314613e-07, + "loss": 0.3083, + "step": 49755 + }, + { + "epoch": 2.8, + "grad_norm": 4.530434310691104, + "learning_rate": 1.2986483002375316e-07, + "loss": 0.2997, + "step": 49760 + }, + { + "epoch": 2.8, + "grad_norm": 4.525187526324167, + "learning_rate": 1.294939673335638e-07, + "loss": 0.301, + "step": 49765 + }, + { + "epoch": 2.8, + "grad_norm": 4.210430002668408, + "learning_rate": 1.2912362800074185e-07, + "loss": 0.32, + "step": 49770 + }, + { + "epoch": 2.8, + "grad_norm": 4.765892537710484, + "learning_rate": 1.287538120650833e-07, + "loss": 0.3037, + "step": 49775 + }, + { + "epoch": 2.8, + "grad_norm": 4.444989167541033, + "learning_rate": 1.283845195663258e-07, + "loss": 0.2965, + "step": 49780 + }, + { + "epoch": 2.81, + "grad_norm": 5.08504032953018, + "learning_rate": 1.2801575054415262e-07, + "loss": 0.2916, + "step": 49785 + }, + { + "epoch": 2.81, + "grad_norm": 4.452497176596786, + "learning_rate": 1.2764750503818756e-07, + "loss": 0.3068, + "step": 49790 + }, + { + "epoch": 2.81, + "grad_norm": 4.493273299309027, + "learning_rate": 1.2727978308800237e-07, + "loss": 0.3258, + "step": 49795 + }, + { + "epoch": 2.81, + "grad_norm": 4.738191199702944, + "learning_rate": 1.269125847331093e-07, + "loss": 0.3204, + "step": 49800 + }, + { + "epoch": 2.81, + "grad_norm": 4.32332726726489, + "learning_rate": 1.2654591001296513e-07, + "loss": 0.3149, + "step": 49805 + }, + { + "epoch": 2.81, + "grad_norm": 5.2361901713688175, + "learning_rate": 1.2617975896697165e-07, + "loss": 0.302, + "step": 49810 + }, + { + "epoch": 2.81, + "grad_norm": 4.346275468583683, + "learning_rate": 1.2581413163447243e-07, + "loss": 0.3074, + "step": 49815 + }, + { + "epoch": 2.81, + "grad_norm": 4.000187465023883, + "learning_rate": 1.2544902805475657e-07, + "loss": 0.307, + "step": 49820 + }, + { + "epoch": 2.81, + "grad_norm": 4.387719405463538, + "learning_rate": 1.2508444826705545e-07, + "loss": 0.3053, + "step": 49825 + }, + { + "epoch": 2.81, + "grad_norm": 3.6913978646148995, + "learning_rate": 1.24720392310545e-07, + "loss": 0.308, + "step": 49830 + }, + { + "epoch": 2.81, + "grad_norm": 4.840259371819478, + "learning_rate": 1.2435686022434446e-07, + "loss": 0.358, + "step": 49835 + }, + { + "epoch": 2.81, + "grad_norm": 4.471284889555268, + "learning_rate": 1.2399385204751702e-07, + "loss": 0.3018, + "step": 49840 + }, + { + "epoch": 2.81, + "grad_norm": 5.426073064427697, + "learning_rate": 1.236313678190698e-07, + "loss": 0.3217, + "step": 49845 + }, + { + "epoch": 2.81, + "grad_norm": 4.578297894204973, + "learning_rate": 1.232694075779539e-07, + "loss": 0.3089, + "step": 49850 + }, + { + "epoch": 2.81, + "grad_norm": 4.269928237123571, + "learning_rate": 1.2290797136306154e-07, + "loss": 0.3162, + "step": 49855 + }, + { + "epoch": 2.81, + "grad_norm": 4.69279024555451, + "learning_rate": 1.2254705921323329e-07, + "loss": 0.3412, + "step": 49860 + }, + { + "epoch": 2.81, + "grad_norm": 5.293566881388998, + "learning_rate": 1.2218667116724814e-07, + "loss": 0.323, + "step": 49865 + }, + { + "epoch": 2.81, + "grad_norm": 4.383699298284824, + "learning_rate": 1.2182680726383344e-07, + "loss": 0.2978, + "step": 49870 + }, + { + "epoch": 2.81, + "grad_norm": 4.379892479437391, + "learning_rate": 1.2146746754165662e-07, + "loss": 0.3105, + "step": 49875 + }, + { + "epoch": 2.81, + "grad_norm": 4.370298698302884, + "learning_rate": 1.2110865203933175e-07, + "loss": 0.3196, + "step": 49880 + }, + { + "epoch": 2.81, + "grad_norm": 4.849092078290703, + "learning_rate": 1.2075036079541413e-07, + "loss": 0.3048, + "step": 49885 + }, + { + "epoch": 2.81, + "grad_norm": 4.735955032794627, + "learning_rate": 1.2039259384840407e-07, + "loss": 0.3441, + "step": 49890 + }, + { + "epoch": 2.81, + "grad_norm": 4.994709702676883, + "learning_rate": 1.2003535123674526e-07, + "loss": 0.346, + "step": 49895 + }, + { + "epoch": 2.81, + "grad_norm": 5.038490161528967, + "learning_rate": 1.196786329988242e-07, + "loss": 0.3123, + "step": 49900 + }, + { + "epoch": 2.81, + "grad_norm": 7.386741008813954, + "learning_rate": 1.1932243917297304e-07, + "loss": 0.311, + "step": 49905 + }, + { + "epoch": 2.81, + "grad_norm": 4.751250148862671, + "learning_rate": 1.1896676979746613e-07, + "loss": 0.3255, + "step": 49910 + }, + { + "epoch": 2.81, + "grad_norm": 6.343921331576727, + "learning_rate": 1.1861162491052069e-07, + "loss": 0.3684, + "step": 49915 + }, + { + "epoch": 2.81, + "grad_norm": 4.897884985248466, + "learning_rate": 1.1825700455030009e-07, + "loss": 0.3058, + "step": 49920 + }, + { + "epoch": 2.81, + "grad_norm": 4.617782399359281, + "learning_rate": 1.1790290875490884e-07, + "loss": 0.3025, + "step": 49925 + }, + { + "epoch": 2.81, + "grad_norm": 6.331148135171444, + "learning_rate": 1.175493375623965e-07, + "loss": 0.3246, + "step": 49930 + }, + { + "epoch": 2.81, + "grad_norm": 4.879795047588261, + "learning_rate": 1.1719629101075547e-07, + "loss": 0.3164, + "step": 49935 + }, + { + "epoch": 2.81, + "grad_norm": 4.736021943573125, + "learning_rate": 1.1684376913792206e-07, + "loss": 0.3051, + "step": 49940 + }, + { + "epoch": 2.81, + "grad_norm": 4.547630193154529, + "learning_rate": 1.1649177198177764e-07, + "loss": 0.2954, + "step": 49945 + }, + { + "epoch": 2.81, + "grad_norm": 4.544386783902927, + "learning_rate": 1.1614029958014416e-07, + "loss": 0.351, + "step": 49950 + }, + { + "epoch": 2.81, + "grad_norm": 4.8959323739397345, + "learning_rate": 1.1578935197078978e-07, + "loss": 0.3315, + "step": 49955 + }, + { + "epoch": 2.81, + "grad_norm": 4.45870327068497, + "learning_rate": 1.1543892919142541e-07, + "loss": 0.2912, + "step": 49960 + }, + { + "epoch": 2.82, + "grad_norm": 4.886514263568746, + "learning_rate": 1.1508903127970539e-07, + "loss": 0.3026, + "step": 49965 + }, + { + "epoch": 2.82, + "grad_norm": 4.763516862907151, + "learning_rate": 1.1473965827322742e-07, + "loss": 0.313, + "step": 49970 + }, + { + "epoch": 2.82, + "grad_norm": 4.5561197614542195, + "learning_rate": 1.1439081020953368e-07, + "loss": 0.3288, + "step": 49975 + }, + { + "epoch": 2.82, + "grad_norm": 4.456281693326938, + "learning_rate": 1.1404248712610922e-07, + "loss": 0.2934, + "step": 49980 + }, + { + "epoch": 2.82, + "grad_norm": 4.52520700641327, + "learning_rate": 1.136946890603824e-07, + "loss": 0.3026, + "step": 49985 + }, + { + "epoch": 2.82, + "grad_norm": 4.241907598125358, + "learning_rate": 1.1334741604972666e-07, + "loss": 0.3145, + "step": 49990 + }, + { + "epoch": 2.82, + "grad_norm": 4.390998513873911, + "learning_rate": 1.1300066813145771e-07, + "loss": 0.3377, + "step": 49995 + }, + { + "epoch": 2.82, + "grad_norm": 4.276889926816193, + "learning_rate": 1.1265444534283465e-07, + "loss": 0.36, + "step": 50000 + }, + { + "epoch": 2.82, + "grad_norm": 3.7710719196745943, + "learning_rate": 1.1230874772106104e-07, + "loss": 0.3169, + "step": 50005 + }, + { + "epoch": 2.82, + "grad_norm": 4.728281178786274, + "learning_rate": 1.1196357530328272e-07, + "loss": 0.3403, + "step": 50010 + }, + { + "epoch": 2.82, + "grad_norm": 4.400948003221979, + "learning_rate": 1.116189281265917e-07, + "loss": 0.3041, + "step": 50015 + }, + { + "epoch": 2.82, + "grad_norm": 3.822692472062315, + "learning_rate": 1.1127480622802056e-07, + "loss": 0.2699, + "step": 50020 + }, + { + "epoch": 2.82, + "grad_norm": 4.452412220408982, + "learning_rate": 1.109312096445475e-07, + "loss": 0.2988, + "step": 50025 + }, + { + "epoch": 2.82, + "grad_norm": 4.304613634028073, + "learning_rate": 1.1058813841309246e-07, + "loss": 0.3306, + "step": 50030 + }, + { + "epoch": 2.82, + "grad_norm": 4.549176845585562, + "learning_rate": 1.1024559257052036e-07, + "loss": 0.3103, + "step": 50035 + }, + { + "epoch": 2.82, + "grad_norm": 4.5313067796554245, + "learning_rate": 1.0990357215364067e-07, + "loss": 0.3472, + "step": 50040 + }, + { + "epoch": 2.82, + "grad_norm": 4.262994160246877, + "learning_rate": 1.0956207719920231e-07, + "loss": 0.3213, + "step": 50045 + }, + { + "epoch": 2.82, + "grad_norm": 4.934739579450185, + "learning_rate": 1.0922110774390315e-07, + "loss": 0.3274, + "step": 50050 + }, + { + "epoch": 2.82, + "grad_norm": 4.058559541882657, + "learning_rate": 1.0888066382438056e-07, + "loss": 0.2915, + "step": 50055 + }, + { + "epoch": 2.82, + "grad_norm": 4.914887366797448, + "learning_rate": 1.0854074547721583e-07, + "loss": 0.3336, + "step": 50060 + }, + { + "epoch": 2.82, + "grad_norm": 4.565490339689186, + "learning_rate": 1.082013527389364e-07, + "loss": 0.3224, + "step": 50065 + }, + { + "epoch": 2.82, + "grad_norm": 4.869785079152946, + "learning_rate": 1.0786248564601031e-07, + "loss": 0.3075, + "step": 50070 + }, + { + "epoch": 2.82, + "grad_norm": 4.418472574847041, + "learning_rate": 1.0752414423485125e-07, + "loss": 0.2944, + "step": 50075 + }, + { + "epoch": 2.82, + "grad_norm": 4.437129635372558, + "learning_rate": 1.071863285418151e-07, + "loss": 0.3191, + "step": 50080 + }, + { + "epoch": 2.82, + "grad_norm": 5.648109498374428, + "learning_rate": 1.0684903860320117e-07, + "loss": 0.3547, + "step": 50085 + }, + { + "epoch": 2.82, + "grad_norm": 4.490040844108931, + "learning_rate": 1.065122744552538e-07, + "loss": 0.3147, + "step": 50090 + }, + { + "epoch": 2.82, + "grad_norm": 5.036602894296083, + "learning_rate": 1.061760361341585e-07, + "loss": 0.3318, + "step": 50095 + }, + { + "epoch": 2.82, + "grad_norm": 4.197471938487059, + "learning_rate": 1.0584032367604636e-07, + "loss": 0.3113, + "step": 50100 + }, + { + "epoch": 2.82, + "grad_norm": 4.6151931997693305, + "learning_rate": 1.0550513711699129e-07, + "loss": 0.2871, + "step": 50105 + }, + { + "epoch": 2.82, + "grad_norm": 4.347851844019679, + "learning_rate": 1.0517047649300949e-07, + "loss": 0.327, + "step": 50110 + }, + { + "epoch": 2.82, + "grad_norm": 4.412524011331123, + "learning_rate": 1.048363418400633e-07, + "loss": 0.3219, + "step": 50115 + }, + { + "epoch": 2.82, + "grad_norm": 4.6523083585095, + "learning_rate": 1.0450273319405569e-07, + "loss": 0.3497, + "step": 50120 + }, + { + "epoch": 2.82, + "grad_norm": 5.194009118699771, + "learning_rate": 1.0416965059083517e-07, + "loss": 0.3483, + "step": 50125 + }, + { + "epoch": 2.82, + "grad_norm": 4.825721169687486, + "learning_rate": 1.0383709406619258e-07, + "loss": 0.3421, + "step": 50130 + }, + { + "epoch": 2.82, + "grad_norm": 5.34509778362892, + "learning_rate": 1.0350506365586266e-07, + "loss": 0.317, + "step": 50135 + }, + { + "epoch": 2.83, + "grad_norm": 4.429398368110269, + "learning_rate": 1.0317355939552354e-07, + "loss": 0.31, + "step": 50140 + }, + { + "epoch": 2.83, + "grad_norm": 4.374883202619751, + "learning_rate": 1.0284258132079671e-07, + "loss": 0.3399, + "step": 50145 + }, + { + "epoch": 2.83, + "grad_norm": 4.480022612470613, + "learning_rate": 1.0251212946724765e-07, + "loss": 0.3126, + "step": 50150 + }, + { + "epoch": 2.83, + "grad_norm": 4.525072716123713, + "learning_rate": 1.0218220387038402e-07, + "loss": 0.3126, + "step": 50155 + }, + { + "epoch": 2.83, + "grad_norm": 4.86095362396342, + "learning_rate": 1.0185280456565916e-07, + "loss": 0.3288, + "step": 50160 + }, + { + "epoch": 2.83, + "grad_norm": 4.324954894412038, + "learning_rate": 1.0152393158846696e-07, + "loss": 0.314, + "step": 50165 + }, + { + "epoch": 2.83, + "grad_norm": 5.124842077445845, + "learning_rate": 1.0119558497414694e-07, + "loss": 0.3308, + "step": 50170 + }, + { + "epoch": 2.83, + "grad_norm": 5.028774692554803, + "learning_rate": 1.0086776475798144e-07, + "loss": 0.3266, + "step": 50175 + }, + { + "epoch": 2.83, + "grad_norm": 4.359988857098868, + "learning_rate": 1.0054047097519615e-07, + "loss": 0.335, + "step": 50180 + }, + { + "epoch": 2.83, + "grad_norm": 4.753341481215494, + "learning_rate": 1.0021370366096017e-07, + "loss": 0.3254, + "step": 50185 + }, + { + "epoch": 2.83, + "grad_norm": 5.104290093826002, + "learning_rate": 9.988746285038597e-08, + "loss": 0.3108, + "step": 50190 + }, + { + "epoch": 2.83, + "grad_norm": 5.243322106856568, + "learning_rate": 9.956174857853052e-08, + "loss": 0.3075, + "step": 50195 + }, + { + "epoch": 2.83, + "grad_norm": 4.263327233984905, + "learning_rate": 9.92365608803919e-08, + "loss": 0.2867, + "step": 50200 + }, + { + "epoch": 2.83, + "grad_norm": 4.490589622086641, + "learning_rate": 9.891189979091332e-08, + "loss": 0.2977, + "step": 50205 + }, + { + "epoch": 2.83, + "grad_norm": 5.216487071346811, + "learning_rate": 9.858776534498126e-08, + "loss": 0.2882, + "step": 50210 + }, + { + "epoch": 2.83, + "grad_norm": 4.462840569198014, + "learning_rate": 9.82641575774257e-08, + "loss": 0.3089, + "step": 50215 + }, + { + "epoch": 2.83, + "grad_norm": 5.284583532159085, + "learning_rate": 9.794107652301931e-08, + "loss": 0.3371, + "step": 50220 + }, + { + "epoch": 2.83, + "grad_norm": 4.6185822898545315, + "learning_rate": 9.761852221647883e-08, + "loss": 0.2963, + "step": 50225 + }, + { + "epoch": 2.83, + "grad_norm": 4.596465076660738, + "learning_rate": 9.729649469246372e-08, + "loss": 0.3246, + "step": 50230 + }, + { + "epoch": 2.83, + "grad_norm": 4.963895889346747, + "learning_rate": 9.697499398557796e-08, + "loss": 0.3311, + "step": 50235 + }, + { + "epoch": 2.83, + "grad_norm": 4.850941408885426, + "learning_rate": 9.665402013036728e-08, + "loss": 0.3294, + "step": 50240 + }, + { + "epoch": 2.83, + "grad_norm": 4.692459297777503, + "learning_rate": 9.633357316132296e-08, + "loss": 0.2874, + "step": 50245 + }, + { + "epoch": 2.83, + "grad_norm": 4.072748928491446, + "learning_rate": 9.601365311287746e-08, + "loss": 0.3128, + "step": 50250 + }, + { + "epoch": 2.83, + "grad_norm": 4.471451359124489, + "learning_rate": 9.569426001940718e-08, + "loss": 0.3171, + "step": 50255 + }, + { + "epoch": 2.83, + "grad_norm": 4.685336420093143, + "learning_rate": 9.53753939152341e-08, + "loss": 0.2928, + "step": 50260 + }, + { + "epoch": 2.83, + "grad_norm": 4.6468380348736265, + "learning_rate": 9.505705483461969e-08, + "loss": 0.3119, + "step": 50265 + }, + { + "epoch": 2.83, + "grad_norm": 4.98904304011332, + "learning_rate": 9.473924281177272e-08, + "loss": 0.3268, + "step": 50270 + }, + { + "epoch": 2.83, + "grad_norm": 4.858420597418929, + "learning_rate": 9.442195788084196e-08, + "loss": 0.3224, + "step": 50275 + }, + { + "epoch": 2.83, + "grad_norm": 5.872932590235281, + "learning_rate": 9.410520007592239e-08, + "loss": 0.2924, + "step": 50280 + }, + { + "epoch": 2.83, + "grad_norm": 4.801675970462846, + "learning_rate": 9.378896943105009e-08, + "loss": 0.3019, + "step": 50285 + }, + { + "epoch": 2.83, + "grad_norm": 3.978054949396103, + "learning_rate": 9.347326598020568e-08, + "loss": 0.305, + "step": 50290 + }, + { + "epoch": 2.83, + "grad_norm": 4.741435531719537, + "learning_rate": 9.315808975731367e-08, + "loss": 0.3038, + "step": 50295 + }, + { + "epoch": 2.83, + "grad_norm": 4.233291559055745, + "learning_rate": 9.284344079623974e-08, + "loss": 0.3248, + "step": 50300 + }, + { + "epoch": 2.83, + "grad_norm": 4.910759586828501, + "learning_rate": 9.252931913079521e-08, + "loss": 0.3128, + "step": 50305 + }, + { + "epoch": 2.83, + "grad_norm": 5.431837773424744, + "learning_rate": 9.221572479473361e-08, + "loss": 0.3459, + "step": 50310 + }, + { + "epoch": 2.83, + "grad_norm": 4.644901691822145, + "learning_rate": 9.19026578217519e-08, + "loss": 0.351, + "step": 50315 + }, + { + "epoch": 2.84, + "grad_norm": 4.842504398392006, + "learning_rate": 9.159011824549146e-08, + "loss": 0.2872, + "step": 50320 + }, + { + "epoch": 2.84, + "grad_norm": 4.165281179086053, + "learning_rate": 9.127810609953436e-08, + "loss": 0.3334, + "step": 50325 + }, + { + "epoch": 2.84, + "grad_norm": 4.7562824696128265, + "learning_rate": 9.096662141740931e-08, + "loss": 0.2934, + "step": 50330 + }, + { + "epoch": 2.84, + "grad_norm": 4.744936752511307, + "learning_rate": 9.065566423258621e-08, + "loss": 0.3094, + "step": 50335 + }, + { + "epoch": 2.84, + "grad_norm": 4.186146136092161, + "learning_rate": 9.034523457847777e-08, + "loss": 0.2915, + "step": 50340 + }, + { + "epoch": 2.84, + "grad_norm": 4.611058917440578, + "learning_rate": 9.003533248844232e-08, + "loss": 0.3197, + "step": 50345 + }, + { + "epoch": 2.84, + "grad_norm": 4.868501460717823, + "learning_rate": 8.972595799577933e-08, + "loss": 0.2739, + "step": 50350 + }, + { + "epoch": 2.84, + "grad_norm": 4.467519703279405, + "learning_rate": 8.94171111337333e-08, + "loss": 0.3662, + "step": 50355 + }, + { + "epoch": 2.84, + "grad_norm": 4.486339241583467, + "learning_rate": 8.91087919354905e-08, + "loss": 0.3105, + "step": 50360 + }, + { + "epoch": 2.84, + "grad_norm": 4.3476448630490845, + "learning_rate": 8.88010004341816e-08, + "loss": 0.3027, + "step": 50365 + }, + { + "epoch": 2.84, + "grad_norm": 4.754309533708539, + "learning_rate": 8.849373666287964e-08, + "loss": 0.295, + "step": 50370 + }, + { + "epoch": 2.84, + "grad_norm": 4.482721899119557, + "learning_rate": 8.818700065460206e-08, + "loss": 0.3103, + "step": 50375 + }, + { + "epoch": 2.84, + "grad_norm": 5.038167578700791, + "learning_rate": 8.788079244230863e-08, + "loss": 0.2929, + "step": 50380 + }, + { + "epoch": 2.84, + "grad_norm": 4.824291950693941, + "learning_rate": 8.757511205890301e-08, + "loss": 0.32, + "step": 50385 + }, + { + "epoch": 2.84, + "grad_norm": 4.742804541170898, + "learning_rate": 8.726995953723172e-08, + "loss": 0.3246, + "step": 50390 + }, + { + "epoch": 2.84, + "grad_norm": 5.057409040654424, + "learning_rate": 8.696533491008463e-08, + "loss": 0.3376, + "step": 50395 + }, + { + "epoch": 2.84, + "grad_norm": 5.186778939491333, + "learning_rate": 8.666123821019501e-08, + "loss": 0.2972, + "step": 50400 + }, + { + "epoch": 2.84, + "grad_norm": 5.212756169312982, + "learning_rate": 8.635766947024004e-08, + "loss": 0.3137, + "step": 50405 + }, + { + "epoch": 2.84, + "grad_norm": 5.09780841698599, + "learning_rate": 8.605462872283865e-08, + "loss": 0.3228, + "step": 50410 + }, + { + "epoch": 2.84, + "grad_norm": 4.363497119152206, + "learning_rate": 8.575211600055478e-08, + "loss": 0.318, + "step": 50415 + }, + { + "epoch": 2.84, + "grad_norm": 4.611170072335453, + "learning_rate": 8.545013133589463e-08, + "loss": 0.2828, + "step": 50420 + }, + { + "epoch": 2.84, + "grad_norm": 4.655326337914719, + "learning_rate": 8.514867476130673e-08, + "loss": 0.3046, + "step": 50425 + }, + { + "epoch": 2.84, + "grad_norm": 4.815812570756807, + "learning_rate": 8.484774630918457e-08, + "loss": 0.2977, + "step": 50430 + }, + { + "epoch": 2.84, + "grad_norm": 5.265079692222116, + "learning_rate": 8.454734601186454e-08, + "loss": 0.3096, + "step": 50435 + }, + { + "epoch": 2.84, + "grad_norm": 4.265372329352981, + "learning_rate": 8.42474739016258e-08, + "loss": 0.327, + "step": 50440 + }, + { + "epoch": 2.84, + "grad_norm": 4.675661363396215, + "learning_rate": 8.394813001069035e-08, + "loss": 0.2908, + "step": 50445 + }, + { + "epoch": 2.84, + "grad_norm": 4.689339208331142, + "learning_rate": 8.364931437122525e-08, + "loss": 0.2852, + "step": 50450 + }, + { + "epoch": 2.84, + "grad_norm": 4.095372852553672, + "learning_rate": 8.33510270153387e-08, + "loss": 0.2904, + "step": 50455 + }, + { + "epoch": 2.84, + "grad_norm": 4.838247684494885, + "learning_rate": 8.305326797508229e-08, + "loss": 0.315, + "step": 50460 + }, + { + "epoch": 2.84, + "grad_norm": 4.446557729521616, + "learning_rate": 8.275603728245319e-08, + "loss": 0.3177, + "step": 50465 + }, + { + "epoch": 2.84, + "grad_norm": 4.594023950904019, + "learning_rate": 8.245933496938918e-08, + "loss": 0.3099, + "step": 50470 + }, + { + "epoch": 2.84, + "grad_norm": 4.988865002310838, + "learning_rate": 8.216316106777256e-08, + "loss": 0.3001, + "step": 50475 + }, + { + "epoch": 2.84, + "grad_norm": 4.488323348782298, + "learning_rate": 8.186751560942785e-08, + "loss": 0.2989, + "step": 50480 + }, + { + "epoch": 2.84, + "grad_norm": 4.099294440369799, + "learning_rate": 8.157239862612409e-08, + "loss": 0.282, + "step": 50485 + }, + { + "epoch": 2.84, + "grad_norm": 4.45459536026761, + "learning_rate": 8.127781014957259e-08, + "loss": 0.2877, + "step": 50490 + }, + { + "epoch": 2.85, + "grad_norm": 4.452539301615366, + "learning_rate": 8.098375021142857e-08, + "loss": 0.2955, + "step": 50495 + }, + { + "epoch": 2.85, + "grad_norm": 4.680812573590136, + "learning_rate": 8.06902188432901e-08, + "loss": 0.3018, + "step": 50500 + }, + { + "epoch": 2.85, + "grad_norm": 4.443811170868664, + "learning_rate": 8.039721607669859e-08, + "loss": 0.322, + "step": 50505 + }, + { + "epoch": 2.85, + "grad_norm": 4.783655534184915, + "learning_rate": 8.010474194313721e-08, + "loss": 0.3034, + "step": 50510 + }, + { + "epoch": 2.85, + "grad_norm": 4.367756587249246, + "learning_rate": 7.981279647403528e-08, + "loss": 0.3133, + "step": 50515 + }, + { + "epoch": 2.85, + "grad_norm": 4.537199180987554, + "learning_rate": 7.952137970076212e-08, + "loss": 0.2931, + "step": 50520 + }, + { + "epoch": 2.85, + "grad_norm": 3.9418821906300536, + "learning_rate": 7.92304916546327e-08, + "loss": 0.3302, + "step": 50525 + }, + { + "epoch": 2.85, + "grad_norm": 4.4318586025931825, + "learning_rate": 7.894013236690368e-08, + "loss": 0.3137, + "step": 50530 + }, + { + "epoch": 2.85, + "grad_norm": 4.775949796296001, + "learning_rate": 7.86503018687762e-08, + "loss": 0.3038, + "step": 50535 + }, + { + "epoch": 2.85, + "grad_norm": 4.858496311439641, + "learning_rate": 7.836100019139314e-08, + "loss": 0.356, + "step": 50540 + }, + { + "epoch": 2.85, + "grad_norm": 4.593863582966466, + "learning_rate": 7.80722273658413e-08, + "loss": 0.2841, + "step": 50545 + }, + { + "epoch": 2.85, + "grad_norm": 4.52734634243024, + "learning_rate": 7.778398342315085e-08, + "loss": 0.3098, + "step": 50550 + }, + { + "epoch": 2.85, + "grad_norm": 4.32822498058099, + "learning_rate": 7.74962683942948e-08, + "loss": 0.3201, + "step": 50555 + }, + { + "epoch": 2.85, + "grad_norm": 4.69765347920918, + "learning_rate": 7.720908231018953e-08, + "loss": 0.3095, + "step": 50560 + }, + { + "epoch": 2.85, + "grad_norm": 4.737113373927384, + "learning_rate": 7.69224252016948e-08, + "loss": 0.2923, + "step": 50565 + }, + { + "epoch": 2.85, + "grad_norm": 4.888793443075394, + "learning_rate": 7.663629709961152e-08, + "loss": 0.3189, + "step": 50570 + }, + { + "epoch": 2.85, + "grad_norm": 4.555395636448465, + "learning_rate": 7.635069803468675e-08, + "loss": 0.3505, + "step": 50575 + }, + { + "epoch": 2.85, + "grad_norm": 4.568835597456061, + "learning_rate": 7.606562803760987e-08, + "loss": 0.304, + "step": 50580 + }, + { + "epoch": 2.85, + "grad_norm": 4.858743827885877, + "learning_rate": 7.578108713901189e-08, + "loss": 0.3302, + "step": 50585 + }, + { + "epoch": 2.85, + "grad_norm": 4.78588864979234, + "learning_rate": 7.549707536946837e-08, + "loss": 0.3167, + "step": 50590 + }, + { + "epoch": 2.85, + "grad_norm": 4.6799114837666345, + "learning_rate": 7.52135927594977e-08, + "loss": 0.3198, + "step": 50595 + }, + { + "epoch": 2.85, + "grad_norm": 4.362470865277408, + "learning_rate": 7.493063933956101e-08, + "loss": 0.3333, + "step": 50600 + }, + { + "epoch": 2.85, + "grad_norm": 4.20192899568831, + "learning_rate": 7.464821514006293e-08, + "loss": 0.3239, + "step": 50605 + }, + { + "epoch": 2.85, + "grad_norm": 4.395174539192422, + "learning_rate": 7.436632019135193e-08, + "loss": 0.2918, + "step": 50610 + }, + { + "epoch": 2.85, + "grad_norm": 4.5539554577142445, + "learning_rate": 7.408495452371766e-08, + "loss": 0.3208, + "step": 50615 + }, + { + "epoch": 2.85, + "grad_norm": 5.737968736331961, + "learning_rate": 7.380411816739541e-08, + "loss": 0.3223, + "step": 50620 + }, + { + "epoch": 2.85, + "grad_norm": 4.413379561627084, + "learning_rate": 7.352381115256157e-08, + "loss": 0.3122, + "step": 50625 + }, + { + "epoch": 2.85, + "grad_norm": 4.892905667302753, + "learning_rate": 7.324403350933595e-08, + "loss": 0.3419, + "step": 50630 + }, + { + "epoch": 2.85, + "grad_norm": 4.62858968304428, + "learning_rate": 7.296478526778283e-08, + "loss": 0.3023, + "step": 50635 + }, + { + "epoch": 2.85, + "grad_norm": 4.427630774315769, + "learning_rate": 7.268606645790766e-08, + "loss": 0.3073, + "step": 50640 + }, + { + "epoch": 2.85, + "grad_norm": 4.596735394376822, + "learning_rate": 7.240787710966146e-08, + "loss": 0.3014, + "step": 50645 + }, + { + "epoch": 2.85, + "grad_norm": 4.994393004728246, + "learning_rate": 7.213021725293534e-08, + "loss": 0.2905, + "step": 50650 + }, + { + "epoch": 2.85, + "grad_norm": 4.852710346439697, + "learning_rate": 7.185308691756543e-08, + "loss": 0.3172, + "step": 50655 + }, + { + "epoch": 2.85, + "grad_norm": 4.806108788659589, + "learning_rate": 7.157648613333124e-08, + "loss": 0.3141, + "step": 50660 + }, + { + "epoch": 2.85, + "grad_norm": 4.6204702214533775, + "learning_rate": 7.130041492995454e-08, + "loss": 0.3417, + "step": 50665 + }, + { + "epoch": 2.85, + "grad_norm": 4.475126761571318, + "learning_rate": 7.102487333709995e-08, + "loss": 0.3054, + "step": 50670 + }, + { + "epoch": 2.86, + "grad_norm": 5.006984284747317, + "learning_rate": 7.0749861384376e-08, + "loss": 0.2868, + "step": 50675 + }, + { + "epoch": 2.86, + "grad_norm": 5.614477656095139, + "learning_rate": 7.047537910133406e-08, + "loss": 0.3288, + "step": 50680 + }, + { + "epoch": 2.86, + "grad_norm": 4.424057852884947, + "learning_rate": 7.020142651746831e-08, + "loss": 0.2923, + "step": 50685 + }, + { + "epoch": 2.86, + "grad_norm": 4.626664531497191, + "learning_rate": 6.992800366221574e-08, + "loss": 0.3219, + "step": 50690 + }, + { + "epoch": 2.86, + "grad_norm": 4.436549046950527, + "learning_rate": 6.965511056495789e-08, + "loss": 0.2995, + "step": 50695 + }, + { + "epoch": 2.86, + "grad_norm": 4.327832563040486, + "learning_rate": 6.938274725501736e-08, + "loss": 0.3295, + "step": 50700 + }, + { + "epoch": 2.86, + "grad_norm": 4.868305097715036, + "learning_rate": 6.911091376166135e-08, + "loss": 0.3089, + "step": 50705 + }, + { + "epoch": 2.86, + "grad_norm": 4.752385747264929, + "learning_rate": 6.883961011409923e-08, + "loss": 0.3074, + "step": 50710 + }, + { + "epoch": 2.86, + "grad_norm": 4.443208251519891, + "learning_rate": 6.856883634148326e-08, + "loss": 0.3151, + "step": 50715 + }, + { + "epoch": 2.86, + "grad_norm": 4.88315770304472, + "learning_rate": 6.829859247291071e-08, + "loss": 0.3095, + "step": 50720 + }, + { + "epoch": 2.86, + "grad_norm": 4.459678871667474, + "learning_rate": 6.802887853741891e-08, + "loss": 0.2983, + "step": 50725 + }, + { + "epoch": 2.86, + "grad_norm": 4.499026110495813, + "learning_rate": 6.775969456399134e-08, + "loss": 0.2603, + "step": 50730 + }, + { + "epoch": 2.86, + "grad_norm": 4.395285627269967, + "learning_rate": 6.749104058155209e-08, + "loss": 0.319, + "step": 50735 + }, + { + "epoch": 2.86, + "grad_norm": 5.158844704155739, + "learning_rate": 6.722291661896863e-08, + "loss": 0.3155, + "step": 50740 + }, + { + "epoch": 2.86, + "grad_norm": 5.065675143292178, + "learning_rate": 6.695532270505345e-08, + "loss": 0.3409, + "step": 50745 + }, + { + "epoch": 2.86, + "grad_norm": 4.556871803932034, + "learning_rate": 6.668825886855967e-08, + "loss": 0.2996, + "step": 50750 + }, + { + "epoch": 2.86, + "grad_norm": 4.228147917553167, + "learning_rate": 6.642172513818545e-08, + "loss": 0.301, + "step": 50755 + }, + { + "epoch": 2.86, + "grad_norm": 4.318879529234859, + "learning_rate": 6.615572154256956e-08, + "loss": 0.3336, + "step": 50760 + }, + { + "epoch": 2.86, + "grad_norm": 4.846962166350977, + "learning_rate": 6.589024811029632e-08, + "loss": 0.3293, + "step": 50765 + }, + { + "epoch": 2.86, + "grad_norm": 4.43983729156032, + "learning_rate": 6.562530486989238e-08, + "loss": 0.3256, + "step": 50770 + }, + { + "epoch": 2.86, + "grad_norm": 4.480583732531065, + "learning_rate": 6.536089184982552e-08, + "loss": 0.2767, + "step": 50775 + }, + { + "epoch": 2.86, + "grad_norm": 5.536696216981906, + "learning_rate": 6.509700907851024e-08, + "loss": 0.3306, + "step": 50780 + }, + { + "epoch": 2.86, + "grad_norm": 4.449547707099758, + "learning_rate": 6.483365658429997e-08, + "loss": 0.3008, + "step": 50785 + }, + { + "epoch": 2.86, + "grad_norm": 4.851405238208118, + "learning_rate": 6.457083439549372e-08, + "loss": 0.3008, + "step": 50790 + }, + { + "epoch": 2.86, + "grad_norm": 4.771014055575448, + "learning_rate": 6.430854254033391e-08, + "loss": 0.3318, + "step": 50795 + }, + { + "epoch": 2.86, + "grad_norm": 4.547439424919254, + "learning_rate": 6.404678104700301e-08, + "loss": 0.3117, + "step": 50800 + }, + { + "epoch": 2.86, + "grad_norm": 4.995651136815408, + "learning_rate": 6.378554994363018e-08, + "loss": 0.314, + "step": 50805 + }, + { + "epoch": 2.86, + "grad_norm": 4.35402770455956, + "learning_rate": 6.352484925828462e-08, + "loss": 0.299, + "step": 50810 + }, + { + "epoch": 2.86, + "grad_norm": 4.387502089185089, + "learning_rate": 6.326467901898114e-08, + "loss": 0.2782, + "step": 50815 + }, + { + "epoch": 2.86, + "grad_norm": 4.901541909443361, + "learning_rate": 6.300503925367518e-08, + "loss": 0.2988, + "step": 50820 + }, + { + "epoch": 2.86, + "grad_norm": 4.331618446550664, + "learning_rate": 6.274592999026552e-08, + "loss": 0.2851, + "step": 50825 + }, + { + "epoch": 2.86, + "grad_norm": 4.4672706474430415, + "learning_rate": 6.248735125659655e-08, + "loss": 0.3123, + "step": 50830 + }, + { + "epoch": 2.86, + "grad_norm": 5.282887454706712, + "learning_rate": 6.222930308045161e-08, + "loss": 0.3238, + "step": 50835 + }, + { + "epoch": 2.86, + "grad_norm": 4.571514056594294, + "learning_rate": 6.197178548956073e-08, + "loss": 0.3184, + "step": 50840 + }, + { + "epoch": 2.86, + "grad_norm": 5.2451860988571575, + "learning_rate": 6.171479851159456e-08, + "loss": 0.3283, + "step": 50845 + }, + { + "epoch": 2.87, + "grad_norm": 4.180400392962699, + "learning_rate": 6.145834217416769e-08, + "loss": 0.3083, + "step": 50850 + }, + { + "epoch": 2.87, + "grad_norm": 5.105132746372893, + "learning_rate": 6.120241650483694e-08, + "loss": 0.3176, + "step": 50855 + }, + { + "epoch": 2.87, + "grad_norm": 6.134096959045546, + "learning_rate": 6.094702153110365e-08, + "loss": 0.3211, + "step": 50860 + }, + { + "epoch": 2.87, + "grad_norm": 4.592566732063401, + "learning_rate": 6.069215728041034e-08, + "loss": 0.3463, + "step": 50865 + }, + { + "epoch": 2.87, + "grad_norm": 4.702514719183494, + "learning_rate": 6.04378237801434e-08, + "loss": 0.3038, + "step": 50870 + }, + { + "epoch": 2.87, + "grad_norm": 4.787882802699646, + "learning_rate": 6.018402105763265e-08, + "loss": 0.3235, + "step": 50875 + }, + { + "epoch": 2.87, + "grad_norm": 5.672450653568845, + "learning_rate": 5.993074914014962e-08, + "loss": 0.2991, + "step": 50880 + }, + { + "epoch": 2.87, + "grad_norm": 4.100656691620931, + "learning_rate": 5.967800805490975e-08, + "loss": 0.3047, + "step": 50885 + }, + { + "epoch": 2.87, + "grad_norm": 5.012854771157292, + "learning_rate": 5.9425797829071875e-08, + "loss": 0.33, + "step": 50890 + }, + { + "epoch": 2.87, + "grad_norm": 5.461547457372237, + "learning_rate": 5.917411848973542e-08, + "loss": 0.3391, + "step": 50895 + }, + { + "epoch": 2.87, + "grad_norm": 4.726259847332957, + "learning_rate": 5.8922970063946535e-08, + "loss": 0.3128, + "step": 50900 + }, + { + "epoch": 2.87, + "grad_norm": 5.952165097565031, + "learning_rate": 5.8672352578690306e-08, + "loss": 0.3241, + "step": 50905 + }, + { + "epoch": 2.87, + "grad_norm": 4.04859829766221, + "learning_rate": 5.842226606089796e-08, + "loss": 0.2879, + "step": 50910 + }, + { + "epoch": 2.87, + "grad_norm": 4.279618153517121, + "learning_rate": 5.817271053744189e-08, + "loss": 0.3278, + "step": 50915 + }, + { + "epoch": 2.87, + "grad_norm": 4.714338243746879, + "learning_rate": 5.7923686035137874e-08, + "loss": 0.3472, + "step": 50920 + }, + { + "epoch": 2.87, + "grad_norm": 5.0741075367910735, + "learning_rate": 5.767519258074505e-08, + "loss": 0.2991, + "step": 50925 + }, + { + "epoch": 2.87, + "grad_norm": 4.503275509095795, + "learning_rate": 5.74272302009643e-08, + "loss": 0.3175, + "step": 50930 + }, + { + "epoch": 2.87, + "grad_norm": 4.277906641815037, + "learning_rate": 5.717979892244152e-08, + "loss": 0.2854, + "step": 50935 + }, + { + "epoch": 2.87, + "grad_norm": 4.622521615117127, + "learning_rate": 5.693289877176378e-08, + "loss": 0.286, + "step": 50940 + }, + { + "epoch": 2.87, + "grad_norm": 4.644644371729939, + "learning_rate": 5.668652977546096e-08, + "loss": 0.3159, + "step": 50945 + }, + { + "epoch": 2.87, + "grad_norm": 4.451748444550624, + "learning_rate": 5.6440691960007455e-08, + "loss": 0.3119, + "step": 50950 + }, + { + "epoch": 2.87, + "grad_norm": 4.506514099024686, + "learning_rate": 5.619538535181879e-08, + "loss": 0.3424, + "step": 50955 + }, + { + "epoch": 2.87, + "grad_norm": 4.58903115694264, + "learning_rate": 5.5950609977254986e-08, + "loss": 0.3177, + "step": 50960 + }, + { + "epoch": 2.87, + "grad_norm": 4.2189616600085635, + "learning_rate": 5.5706365862617794e-08, + "loss": 0.3221, + "step": 50965 + }, + { + "epoch": 2.87, + "grad_norm": 4.6120194439308335, + "learning_rate": 5.5462653034152326e-08, + "loss": 0.2973, + "step": 50970 + }, + { + "epoch": 2.87, + "grad_norm": 4.596783433291919, + "learning_rate": 5.521947151804707e-08, + "loss": 0.3219, + "step": 50975 + }, + { + "epoch": 2.87, + "grad_norm": 4.899675789218501, + "learning_rate": 5.497682134043225e-08, + "loss": 0.3215, + "step": 50980 + }, + { + "epoch": 2.87, + "grad_norm": 3.9943283942817644, + "learning_rate": 5.4734702527382e-08, + "loss": 0.2995, + "step": 50985 + }, + { + "epoch": 2.87, + "grad_norm": 4.9511983233665795, + "learning_rate": 5.4493115104913286e-08, + "loss": 0.3044, + "step": 50990 + }, + { + "epoch": 2.87, + "grad_norm": 4.271563576894699, + "learning_rate": 5.4252059098985346e-08, + "loss": 0.3179, + "step": 50995 + }, + { + "epoch": 2.87, + "grad_norm": 4.651601483403973, + "learning_rate": 5.401153453550134e-08, + "loss": 0.3342, + "step": 51000 + }, + { + "epoch": 2.87, + "grad_norm": 4.765661565149231, + "learning_rate": 5.377154144030561e-08, + "loss": 0.3276, + "step": 51005 + }, + { + "epoch": 2.87, + "grad_norm": 4.2832444858579, + "learning_rate": 5.353207983918751e-08, + "loss": 0.3297, + "step": 51010 + }, + { + "epoch": 2.87, + "grad_norm": 4.706003775371774, + "learning_rate": 5.3293149757877585e-08, + "loss": 0.3668, + "step": 51015 + }, + { + "epoch": 2.87, + "grad_norm": 4.440220912483668, + "learning_rate": 5.3054751222050836e-08, + "loss": 0.3351, + "step": 51020 + }, + { + "epoch": 2.87, + "grad_norm": 4.076771025973336, + "learning_rate": 5.281688425732345e-08, + "loss": 0.2955, + "step": 51025 + }, + { + "epoch": 2.88, + "grad_norm": 4.538951980008243, + "learning_rate": 5.2579548889254985e-08, + "loss": 0.3271, + "step": 51030 + }, + { + "epoch": 2.88, + "grad_norm": 4.41404410727894, + "learning_rate": 5.2342745143348364e-08, + "loss": 0.3055, + "step": 51035 + }, + { + "epoch": 2.88, + "grad_norm": 4.5271931424653165, + "learning_rate": 5.2106473045049896e-08, + "loss": 0.2741, + "step": 51040 + }, + { + "epoch": 2.88, + "grad_norm": 4.326926162576558, + "learning_rate": 5.187073261974762e-08, + "loss": 0.2969, + "step": 51045 + }, + { + "epoch": 2.88, + "grad_norm": 4.427165943700475, + "learning_rate": 5.1635523892772935e-08, + "loss": 0.2884, + "step": 51050 + }, + { + "epoch": 2.88, + "grad_norm": 4.8610150988511895, + "learning_rate": 5.1400846889400055e-08, + "loss": 0.3083, + "step": 51055 + }, + { + "epoch": 2.88, + "grad_norm": 4.53356862857095, + "learning_rate": 5.116670163484605e-08, + "loss": 0.315, + "step": 51060 + }, + { + "epoch": 2.88, + "grad_norm": 5.142706146080753, + "learning_rate": 5.093308815427023e-08, + "loss": 0.2879, + "step": 51065 + }, + { + "epoch": 2.88, + "grad_norm": 4.666059296165824, + "learning_rate": 5.0700006472776395e-08, + "loss": 0.3335, + "step": 51070 + }, + { + "epoch": 2.88, + "grad_norm": 4.73198153320675, + "learning_rate": 5.0467456615410085e-08, + "loss": 0.3152, + "step": 51075 + }, + { + "epoch": 2.88, + "grad_norm": 4.183226817065994, + "learning_rate": 5.023543860715907e-08, + "loss": 0.3174, + "step": 51080 + }, + { + "epoch": 2.88, + "grad_norm": 4.299660855333153, + "learning_rate": 5.000395247295564e-08, + "loss": 0.3039, + "step": 51085 + }, + { + "epoch": 2.88, + "grad_norm": 4.366329296066329, + "learning_rate": 4.977299823767268e-08, + "loss": 0.2989, + "step": 51090 + }, + { + "epoch": 2.88, + "grad_norm": 4.3837064005194355, + "learning_rate": 4.954257592612921e-08, + "loss": 0.3149, + "step": 51095 + }, + { + "epoch": 2.88, + "grad_norm": 4.894362997635067, + "learning_rate": 4.931268556308322e-08, + "loss": 0.3248, + "step": 51100 + }, + { + "epoch": 2.88, + "grad_norm": 4.174686637715214, + "learning_rate": 4.9083327173238274e-08, + "loss": 0.3341, + "step": 51105 + }, + { + "epoch": 2.88, + "grad_norm": 4.73115555649793, + "learning_rate": 4.885450078124021e-08, + "loss": 0.2888, + "step": 51110 + }, + { + "epoch": 2.88, + "grad_norm": 4.2739387759155, + "learning_rate": 4.862620641167659e-08, + "loss": 0.2974, + "step": 51115 + }, + { + "epoch": 2.88, + "grad_norm": 4.497355303918984, + "learning_rate": 4.8398444089079454e-08, + "loss": 0.3225, + "step": 51120 + }, + { + "epoch": 2.88, + "grad_norm": 6.301918952129178, + "learning_rate": 4.817121383792256e-08, + "loss": 0.2912, + "step": 51125 + }, + { + "epoch": 2.88, + "grad_norm": 4.889121262789907, + "learning_rate": 4.7944515682623036e-08, + "loss": 0.3586, + "step": 51130 + }, + { + "epoch": 2.88, + "grad_norm": 4.9330507710958935, + "learning_rate": 4.7718349647540296e-08, + "loss": 0.2921, + "step": 51135 + }, + { + "epoch": 2.88, + "grad_norm": 4.044985545361549, + "learning_rate": 4.749271575697656e-08, + "loss": 0.2951, + "step": 51140 + }, + { + "epoch": 2.88, + "grad_norm": 4.502781456033189, + "learning_rate": 4.7267614035177986e-08, + "loss": 0.3031, + "step": 51145 + }, + { + "epoch": 2.88, + "grad_norm": 4.126933613816304, + "learning_rate": 4.704304450633246e-08, + "loss": 0.2781, + "step": 51150 + }, + { + "epoch": 2.88, + "grad_norm": 5.140035963321687, + "learning_rate": 4.6819007194570664e-08, + "loss": 0.3282, + "step": 51155 + }, + { + "epoch": 2.88, + "grad_norm": 4.728470333110667, + "learning_rate": 4.659550212396668e-08, + "loss": 0.3068, + "step": 51160 + }, + { + "epoch": 2.88, + "grad_norm": 4.469656500938408, + "learning_rate": 4.6372529318536864e-08, + "loss": 0.2826, + "step": 51165 + }, + { + "epoch": 2.88, + "grad_norm": 4.610129460771441, + "learning_rate": 4.615008880224092e-08, + "loss": 0.2893, + "step": 51170 + }, + { + "epoch": 2.88, + "grad_norm": 4.800904336974467, + "learning_rate": 4.5928180598980855e-08, + "loss": 0.2943, + "step": 51175 + }, + { + "epoch": 2.88, + "grad_norm": 4.472736543334713, + "learning_rate": 4.570680473260203e-08, + "loss": 0.2851, + "step": 51180 + }, + { + "epoch": 2.88, + "grad_norm": 4.478488408638247, + "learning_rate": 4.548596122689153e-08, + "loss": 0.3035, + "step": 51185 + }, + { + "epoch": 2.88, + "grad_norm": 4.712890660944273, + "learning_rate": 4.526565010558093e-08, + "loss": 0.3037, + "step": 51190 + }, + { + "epoch": 2.88, + "grad_norm": 4.858576691153519, + "learning_rate": 4.504587139234351e-08, + "loss": 0.3491, + "step": 51195 + }, + { + "epoch": 2.88, + "grad_norm": 4.931933719338714, + "learning_rate": 4.4826625110794276e-08, + "loss": 0.2804, + "step": 51200 + }, + { + "epoch": 2.89, + "grad_norm": 4.455080269802044, + "learning_rate": 4.4607911284493267e-08, + "loss": 0.3195, + "step": 51205 + }, + { + "epoch": 2.89, + "grad_norm": 4.832757994390502, + "learning_rate": 4.438972993694168e-08, + "loss": 0.3347, + "step": 51210 + }, + { + "epoch": 2.89, + "grad_norm": 4.385494312504423, + "learning_rate": 4.4172081091585215e-08, + "loss": 0.2808, + "step": 51215 + }, + { + "epoch": 2.89, + "grad_norm": 4.961706642024568, + "learning_rate": 4.3954964771809606e-08, + "loss": 0.312, + "step": 51220 + }, + { + "epoch": 2.89, + "grad_norm": 4.303491552788849, + "learning_rate": 4.373838100094563e-08, + "loss": 0.3034, + "step": 51225 + }, + { + "epoch": 2.89, + "grad_norm": 4.684195748815649, + "learning_rate": 4.352232980226689e-08, + "loss": 0.2999, + "step": 51230 + }, + { + "epoch": 2.89, + "grad_norm": 4.311559898568494, + "learning_rate": 4.3306811198988164e-08, + "loss": 0.3033, + "step": 51235 + }, + { + "epoch": 2.89, + "grad_norm": 4.405922005429559, + "learning_rate": 4.3091825214268135e-08, + "loss": 0.2905, + "step": 51240 + }, + { + "epoch": 2.89, + "grad_norm": 4.652308238280765, + "learning_rate": 4.2877371871207774e-08, + "loss": 0.3346, + "step": 51245 + }, + { + "epoch": 2.89, + "grad_norm": 4.63095466104666, + "learning_rate": 4.266345119285143e-08, + "loss": 0.2865, + "step": 51250 + }, + { + "epoch": 2.89, + "grad_norm": 5.0170678122206604, + "learning_rate": 4.245006320218625e-08, + "loss": 0.3189, + "step": 51255 + }, + { + "epoch": 2.89, + "grad_norm": 4.813475266790062, + "learning_rate": 4.223720792214003e-08, + "loss": 0.3066, + "step": 51260 + }, + { + "epoch": 2.89, + "grad_norm": 4.700602080491322, + "learning_rate": 4.202488537558669e-08, + "loss": 0.2889, + "step": 51265 + }, + { + "epoch": 2.89, + "grad_norm": 5.156138280176386, + "learning_rate": 4.181309558534075e-08, + "loss": 0.2992, + "step": 51270 + }, + { + "epoch": 2.89, + "grad_norm": 4.059320303157331, + "learning_rate": 4.160183857415956e-08, + "loss": 0.3108, + "step": 51275 + }, + { + "epoch": 2.89, + "grad_norm": 4.848833840171049, + "learning_rate": 4.139111436474441e-08, + "loss": 0.3032, + "step": 51280 + }, + { + "epoch": 2.89, + "grad_norm": 4.945736089338037, + "learning_rate": 4.1180922979737194e-08, + "loss": 0.2917, + "step": 51285 + }, + { + "epoch": 2.89, + "grad_norm": 4.4686169764965005, + "learning_rate": 4.097126444172539e-08, + "loss": 0.2792, + "step": 51290 + }, + { + "epoch": 2.89, + "grad_norm": 4.608989767600169, + "learning_rate": 4.0762138773236534e-08, + "loss": 0.3302, + "step": 51295 + }, + { + "epoch": 2.89, + "grad_norm": 5.151291916298146, + "learning_rate": 4.055354599674321e-08, + "loss": 0.2901, + "step": 51300 + }, + { + "epoch": 2.89, + "grad_norm": 4.319251022104954, + "learning_rate": 4.034548613465861e-08, + "loss": 0.3149, + "step": 51305 + }, + { + "epoch": 2.89, + "grad_norm": 4.911591303924704, + "learning_rate": 4.013795920934038e-08, + "loss": 0.3434, + "step": 51310 + }, + { + "epoch": 2.89, + "grad_norm": 4.348200707148874, + "learning_rate": 3.993096524308793e-08, + "loss": 0.2969, + "step": 51315 + }, + { + "epoch": 2.89, + "grad_norm": 4.4209763762854575, + "learning_rate": 3.972450425814345e-08, + "loss": 0.3114, + "step": 51320 + }, + { + "epoch": 2.89, + "grad_norm": 4.306674523177662, + "learning_rate": 3.951857627669309e-08, + "loss": 0.2987, + "step": 51325 + }, + { + "epoch": 2.89, + "grad_norm": 4.723021871881002, + "learning_rate": 3.931318132086359e-08, + "loss": 0.3104, + "step": 51330 + }, + { + "epoch": 2.89, + "grad_norm": 4.567668525573129, + "learning_rate": 3.910831941272564e-08, + "loss": 0.2921, + "step": 51335 + }, + { + "epoch": 2.89, + "grad_norm": 4.229991115510662, + "learning_rate": 3.890399057429328e-08, + "loss": 0.3124, + "step": 51340 + }, + { + "epoch": 2.89, + "grad_norm": 4.938609083720758, + "learning_rate": 3.870019482752174e-08, + "loss": 0.3118, + "step": 51345 + }, + { + "epoch": 2.89, + "grad_norm": 5.000319005002347, + "learning_rate": 3.8496932194310723e-08, + "loss": 0.3314, + "step": 51350 + }, + { + "epoch": 2.89, + "grad_norm": 4.225371342694177, + "learning_rate": 3.8294202696500525e-08, + "loss": 0.309, + "step": 51355 + }, + { + "epoch": 2.89, + "grad_norm": 5.265058831026209, + "learning_rate": 3.809200635587651e-08, + "loss": 0.3353, + "step": 51360 + }, + { + "epoch": 2.89, + "grad_norm": 5.097003599934477, + "learning_rate": 3.7890343194164625e-08, + "loss": 0.302, + "step": 51365 + }, + { + "epoch": 2.89, + "grad_norm": 5.3101364174846015, + "learning_rate": 3.768921323303476e-08, + "loss": 0.3466, + "step": 51370 + }, + { + "epoch": 2.89, + "grad_norm": 4.4002258948487425, + "learning_rate": 3.748861649409907e-08, + "loss": 0.3138, + "step": 51375 + }, + { + "epoch": 2.89, + "grad_norm": 4.635858072390705, + "learning_rate": 3.7288552998913095e-08, + "loss": 0.3192, + "step": 51380 + }, + { + "epoch": 2.9, + "grad_norm": 4.601753315150774, + "learning_rate": 3.708902276897408e-08, + "loss": 0.2795, + "step": 51385 + }, + { + "epoch": 2.9, + "grad_norm": 5.001443794317098, + "learning_rate": 3.6890025825723206e-08, + "loss": 0.3158, + "step": 51390 + }, + { + "epoch": 2.9, + "grad_norm": 5.085064681723496, + "learning_rate": 3.669156219054226e-08, + "loss": 0.3092, + "step": 51395 + }, + { + "epoch": 2.9, + "grad_norm": 4.575427050517259, + "learning_rate": 3.649363188475752e-08, + "loss": 0.2978, + "step": 51400 + }, + { + "epoch": 2.9, + "grad_norm": 4.644921343058895, + "learning_rate": 3.629623492963807e-08, + "loss": 0.3113, + "step": 51405 + }, + { + "epoch": 2.9, + "grad_norm": 4.142196906322257, + "learning_rate": 3.6099371346394184e-08, + "loss": 0.2854, + "step": 51410 + }, + { + "epoch": 2.9, + "grad_norm": 4.597506810627093, + "learning_rate": 3.59030411561806e-08, + "loss": 0.3206, + "step": 51415 + }, + { + "epoch": 2.9, + "grad_norm": 4.846672191601049, + "learning_rate": 3.570724438009376e-08, + "loss": 0.3142, + "step": 51420 + }, + { + "epoch": 2.9, + "grad_norm": 4.158642645229595, + "learning_rate": 3.551198103917241e-08, + "loss": 0.2925, + "step": 51425 + }, + { + "epoch": 2.9, + "grad_norm": 4.710377301342723, + "learning_rate": 3.5317251154398656e-08, + "loss": 0.3104, + "step": 51430 + }, + { + "epoch": 2.9, + "grad_norm": 4.510829582646343, + "learning_rate": 3.512305474669686e-08, + "loss": 0.2849, + "step": 51435 + }, + { + "epoch": 2.9, + "grad_norm": 5.73073715515635, + "learning_rate": 3.4929391836934776e-08, + "loss": 0.3341, + "step": 51440 + }, + { + "epoch": 2.9, + "grad_norm": 4.3911957216853, + "learning_rate": 3.4736262445921876e-08, + "loss": 0.2867, + "step": 51445 + }, + { + "epoch": 2.9, + "grad_norm": 4.404336851311999, + "learning_rate": 3.4543666594410996e-08, + "loss": 0.2891, + "step": 51450 + }, + { + "epoch": 2.9, + "grad_norm": 4.560208321176111, + "learning_rate": 3.4351604303097255e-08, + "loss": 0.3255, + "step": 51455 + }, + { + "epoch": 2.9, + "grad_norm": 4.625273373884491, + "learning_rate": 3.4160075592619134e-08, + "loss": 0.3257, + "step": 51460 + }, + { + "epoch": 2.9, + "grad_norm": 5.270953574225311, + "learning_rate": 3.39690804835563e-08, + "loss": 0.3018, + "step": 51465 + }, + { + "epoch": 2.9, + "grad_norm": 4.729310342874606, + "learning_rate": 3.377861899643287e-08, + "loss": 0.3353, + "step": 51470 + }, + { + "epoch": 2.9, + "grad_norm": 4.256171996866667, + "learning_rate": 3.358869115171415e-08, + "loss": 0.3147, + "step": 51475 + }, + { + "epoch": 2.9, + "grad_norm": 4.7195182519162735, + "learning_rate": 3.339929696980937e-08, + "loss": 0.3106, + "step": 51480 + }, + { + "epoch": 2.9, + "grad_norm": 4.75503189027886, + "learning_rate": 3.3210436471069477e-08, + "loss": 0.2842, + "step": 51485 + }, + { + "epoch": 2.9, + "grad_norm": 4.901900190672175, + "learning_rate": 3.3022109675787675e-08, + "loss": 0.3004, + "step": 51490 + }, + { + "epoch": 2.9, + "grad_norm": 4.243141756074635, + "learning_rate": 3.283431660420111e-08, + "loss": 0.3036, + "step": 51495 + }, + { + "epoch": 2.9, + "grad_norm": 4.267345204612427, + "learning_rate": 3.2647057276489755e-08, + "loss": 0.3327, + "step": 51500 + }, + { + "epoch": 2.9, + "grad_norm": 4.208475444683537, + "learning_rate": 3.2460331712774186e-08, + "loss": 0.3096, + "step": 51505 + }, + { + "epoch": 2.9, + "grad_norm": 4.9232891298562445, + "learning_rate": 3.227413993311945e-08, + "loss": 0.2926, + "step": 51510 + }, + { + "epoch": 2.9, + "grad_norm": 4.590923787583403, + "learning_rate": 3.208848195753289e-08, + "loss": 0.335, + "step": 51515 + }, + { + "epoch": 2.9, + "grad_norm": 4.423562550792244, + "learning_rate": 3.1903357805963545e-08, + "loss": 0.2954, + "step": 51520 + }, + { + "epoch": 2.9, + "grad_norm": 4.804914673741037, + "learning_rate": 3.171876749830494e-08, + "loss": 0.3487, + "step": 51525 + }, + { + "epoch": 2.9, + "grad_norm": 5.087525771742065, + "learning_rate": 3.153471105439121e-08, + "loss": 0.3366, + "step": 51530 + }, + { + "epoch": 2.9, + "grad_norm": 5.711356948745704, + "learning_rate": 3.135118849399987e-08, + "loss": 0.3195, + "step": 51535 + }, + { + "epoch": 2.9, + "grad_norm": 4.575049828526733, + "learning_rate": 3.116819983685182e-08, + "loss": 0.2956, + "step": 51540 + }, + { + "epoch": 2.9, + "grad_norm": 4.173594648449249, + "learning_rate": 3.098574510261021e-08, + "loss": 0.3174, + "step": 51545 + }, + { + "epoch": 2.9, + "grad_norm": 4.5551981854017525, + "learning_rate": 3.080382431087992e-08, + "loss": 0.3095, + "step": 51550 + }, + { + "epoch": 2.9, + "grad_norm": 4.905985507843454, + "learning_rate": 3.0622437481209745e-08, + "loss": 0.3246, + "step": 51555 + }, + { + "epoch": 2.91, + "grad_norm": 4.348849932831417, + "learning_rate": 3.044158463308966e-08, + "loss": 0.2852, + "step": 51560 + }, + { + "epoch": 2.91, + "grad_norm": 4.68815883620634, + "learning_rate": 3.026126578595412e-08, + "loss": 0.3406, + "step": 51565 + }, + { + "epoch": 2.91, + "grad_norm": 5.206814663854788, + "learning_rate": 3.008148095917873e-08, + "loss": 0.3073, + "step": 51570 + }, + { + "epoch": 2.91, + "grad_norm": 4.6143632469739515, + "learning_rate": 2.990223017208138e-08, + "loss": 0.285, + "step": 51575 + }, + { + "epoch": 2.91, + "grad_norm": 4.68741877363965, + "learning_rate": 2.972351344392499e-08, + "loss": 0.324, + "step": 51580 + }, + { + "epoch": 2.91, + "grad_norm": 4.234203589268513, + "learning_rate": 2.9545330793911974e-08, + "loss": 0.3339, + "step": 51585 + }, + { + "epoch": 2.91, + "grad_norm": 4.545525205202262, + "learning_rate": 2.93676822411898e-08, + "loss": 0.3037, + "step": 51590 + }, + { + "epoch": 2.91, + "grad_norm": 4.615354537959156, + "learning_rate": 2.919056780484708e-08, + "loss": 0.3033, + "step": 51595 + }, + { + "epoch": 2.91, + "grad_norm": 5.030765295019107, + "learning_rate": 2.901398750391582e-08, + "loss": 0.2997, + "step": 51600 + }, + { + "epoch": 2.91, + "grad_norm": 4.8715659585737985, + "learning_rate": 2.8837941357370282e-08, + "loss": 0.3097, + "step": 51605 + }, + { + "epoch": 2.91, + "grad_norm": 5.201650727094648, + "learning_rate": 2.8662429384127e-08, + "loss": 0.3409, + "step": 51610 + }, + { + "epoch": 2.91, + "grad_norm": 5.282882441299974, + "learning_rate": 2.8487451603046444e-08, + "loss": 0.305, + "step": 51615 + }, + { + "epoch": 2.91, + "grad_norm": 4.689086307926574, + "learning_rate": 2.831300803292969e-08, + "loss": 0.3469, + "step": 51620 + }, + { + "epoch": 2.91, + "grad_norm": 4.862361634197883, + "learning_rate": 2.8139098692522292e-08, + "loss": 0.3079, + "step": 51625 + }, + { + "epoch": 2.91, + "grad_norm": 3.8813773822524857, + "learning_rate": 2.7965723600510974e-08, + "loss": 0.2906, + "step": 51630 + }, + { + "epoch": 2.91, + "grad_norm": 4.575902476121729, + "learning_rate": 2.7792882775525832e-08, + "loss": 0.3047, + "step": 51635 + }, + { + "epoch": 2.91, + "grad_norm": 4.260125105414601, + "learning_rate": 2.7620576236139786e-08, + "loss": 0.2872, + "step": 51640 + }, + { + "epoch": 2.91, + "grad_norm": 6.193285116127877, + "learning_rate": 2.7448804000867468e-08, + "loss": 0.3405, + "step": 51645 + }, + { + "epoch": 2.91, + "grad_norm": 4.352027310051853, + "learning_rate": 2.7277566088166897e-08, + "loss": 0.2881, + "step": 51650 + }, + { + "epoch": 2.91, + "grad_norm": 4.235383909350402, + "learning_rate": 2.7106862516437793e-08, + "loss": 0.3034, + "step": 51655 + }, + { + "epoch": 2.91, + "grad_norm": 4.5599531517103316, + "learning_rate": 2.6936693304023266e-08, + "loss": 0.2789, + "step": 51660 + }, + { + "epoch": 2.91, + "grad_norm": 4.697232998562075, + "learning_rate": 2.676705846920924e-08, + "loss": 0.2952, + "step": 51665 + }, + { + "epoch": 2.91, + "grad_norm": 4.849987052839738, + "learning_rate": 2.6597958030223357e-08, + "loss": 0.3316, + "step": 51670 + }, + { + "epoch": 2.91, + "grad_norm": 4.211524333137218, + "learning_rate": 2.6429392005236087e-08, + "loss": 0.3232, + "step": 51675 + }, + { + "epoch": 2.91, + "grad_norm": 5.765563415974672, + "learning_rate": 2.6261360412360714e-08, + "loss": 0.3159, + "step": 51680 + }, + { + "epoch": 2.91, + "grad_norm": 4.4935754631227915, + "learning_rate": 2.6093863269652242e-08, + "loss": 0.3423, + "step": 51685 + }, + { + "epoch": 2.91, + "grad_norm": 4.765543462548494, + "learning_rate": 2.5926900595110717e-08, + "loss": 0.3273, + "step": 51690 + }, + { + "epoch": 2.91, + "grad_norm": 4.701805706145789, + "learning_rate": 2.5760472406675118e-08, + "loss": 0.3113, + "step": 51695 + }, + { + "epoch": 2.91, + "grad_norm": 4.246181524495169, + "learning_rate": 2.559457872223059e-08, + "loss": 0.3164, + "step": 51700 + }, + { + "epoch": 2.91, + "grad_norm": 4.368401265857004, + "learning_rate": 2.542921955960176e-08, + "loss": 0.2878, + "step": 51705 + }, + { + "epoch": 2.91, + "grad_norm": 4.916691159377702, + "learning_rate": 2.5264394936557746e-08, + "loss": 0.3101, + "step": 51710 + }, + { + "epoch": 2.91, + "grad_norm": 4.364876281440367, + "learning_rate": 2.510010487080994e-08, + "loss": 0.3126, + "step": 51715 + }, + { + "epoch": 2.91, + "grad_norm": 4.790348168262585, + "learning_rate": 2.4936349380011437e-08, + "loss": 0.2963, + "step": 51720 + }, + { + "epoch": 2.91, + "grad_norm": 5.620303678991867, + "learning_rate": 2.477312848175928e-08, + "loss": 0.3283, + "step": 51725 + }, + { + "epoch": 2.91, + "grad_norm": 5.8825015085588825, + "learning_rate": 2.4610442193591655e-08, + "loss": 0.3052, + "step": 51730 + }, + { + "epoch": 2.91, + "grad_norm": 4.2828952826924755, + "learning_rate": 2.4448290532990137e-08, + "loss": 0.3046, + "step": 51735 + }, + { + "epoch": 2.92, + "grad_norm": 5.332009873377474, + "learning_rate": 2.4286673517378566e-08, + "loss": 0.346, + "step": 51740 + }, + { + "epoch": 2.92, + "grad_norm": 4.223505913109157, + "learning_rate": 2.4125591164123052e-08, + "loss": 0.3069, + "step": 51745 + }, + { + "epoch": 2.92, + "grad_norm": 4.502265405918352, + "learning_rate": 2.3965043490533635e-08, + "loss": 0.3479, + "step": 51750 + }, + { + "epoch": 2.92, + "grad_norm": 5.206542323519605, + "learning_rate": 2.3805030513860406e-08, + "loss": 0.3287, + "step": 51755 + }, + { + "epoch": 2.92, + "grad_norm": 5.136697508095126, + "learning_rate": 2.3645552251299053e-08, + "loss": 0.3265, + "step": 51760 + }, + { + "epoch": 2.92, + "grad_norm": 4.465478249630526, + "learning_rate": 2.348660871998476e-08, + "loss": 0.3125, + "step": 51765 + }, + { + "epoch": 2.92, + "grad_norm": 4.413908591851821, + "learning_rate": 2.3328199936997752e-08, + "loss": 0.2979, + "step": 51770 + }, + { + "epoch": 2.92, + "grad_norm": 4.691844200408493, + "learning_rate": 2.3170325919359416e-08, + "loss": 0.3216, + "step": 51775 + }, + { + "epoch": 2.92, + "grad_norm": 4.83099009078426, + "learning_rate": 2.30129866840334e-08, + "loss": 0.3186, + "step": 51780 + }, + { + "epoch": 2.92, + "grad_norm": 4.525809301467718, + "learning_rate": 2.285618224792785e-08, + "loss": 0.3263, + "step": 51785 + }, + { + "epoch": 2.92, + "grad_norm": 4.6011477729389485, + "learning_rate": 2.2699912627890952e-08, + "loss": 0.313, + "step": 51790 + }, + { + "epoch": 2.92, + "grad_norm": 4.483392038556261, + "learning_rate": 2.2544177840715385e-08, + "loss": 0.3074, + "step": 51795 + }, + { + "epoch": 2.92, + "grad_norm": 4.992611406745169, + "learning_rate": 2.2388977903134433e-08, + "loss": 0.3218, + "step": 51800 + }, + { + "epoch": 2.92, + "grad_norm": 5.399920938997252, + "learning_rate": 2.2234312831825866e-08, + "loss": 0.3174, + "step": 51805 + }, + { + "epoch": 2.92, + "grad_norm": 4.5807378365562785, + "learning_rate": 2.2080182643409163e-08, + "loss": 0.326, + "step": 51810 + }, + { + "epoch": 2.92, + "grad_norm": 5.0474034509006716, + "learning_rate": 2.1926587354446084e-08, + "loss": 0.2959, + "step": 51815 + }, + { + "epoch": 2.92, + "grad_norm": 4.931134150175982, + "learning_rate": 2.1773526981441196e-08, + "loss": 0.3297, + "step": 51820 + }, + { + "epoch": 2.92, + "grad_norm": 4.214547228961579, + "learning_rate": 2.1621001540841346e-08, + "loss": 0.3123, + "step": 51825 + }, + { + "epoch": 2.92, + "grad_norm": 4.372645717361881, + "learning_rate": 2.146901104903565e-08, + "loss": 0.3323, + "step": 51830 + }, + { + "epoch": 2.92, + "grad_norm": 4.552758867387564, + "learning_rate": 2.131755552235715e-08, + "loss": 0.3156, + "step": 51835 + }, + { + "epoch": 2.92, + "grad_norm": 5.718847379394675, + "learning_rate": 2.1166634977080046e-08, + "loss": 0.3071, + "step": 51840 + }, + { + "epoch": 2.92, + "grad_norm": 4.53781423202524, + "learning_rate": 2.101624942942082e-08, + "loss": 0.3076, + "step": 51845 + }, + { + "epoch": 2.92, + "grad_norm": 4.819087998607202, + "learning_rate": 2.0866398895539875e-08, + "loss": 0.3088, + "step": 51850 + }, + { + "epoch": 2.92, + "grad_norm": 5.143133408222382, + "learning_rate": 2.0717083391538772e-08, + "loss": 0.3133, + "step": 51855 + }, + { + "epoch": 2.92, + "grad_norm": 4.6909854400209765, + "learning_rate": 2.0568302933462458e-08, + "loss": 0.2913, + "step": 51860 + }, + { + "epoch": 2.92, + "grad_norm": 4.73264316310088, + "learning_rate": 2.0420057537297587e-08, + "loss": 0.3129, + "step": 51865 + }, + { + "epoch": 2.92, + "grad_norm": 5.33950616368151, + "learning_rate": 2.02723472189742e-08, + "loss": 0.3649, + "step": 51870 + }, + { + "epoch": 2.92, + "grad_norm": 4.3320655853980625, + "learning_rate": 2.012517199436459e-08, + "loss": 0.2903, + "step": 51875 + }, + { + "epoch": 2.92, + "grad_norm": 4.419131660144969, + "learning_rate": 1.997853187928278e-08, + "loss": 0.2841, + "step": 51880 + }, + { + "epoch": 2.92, + "grad_norm": 4.511392161015789, + "learning_rate": 1.9832426889486724e-08, + "loss": 0.3031, + "step": 51885 + }, + { + "epoch": 2.92, + "grad_norm": 4.1565660223673895, + "learning_rate": 1.9686857040674966e-08, + "loss": 0.2797, + "step": 51890 + }, + { + "epoch": 2.92, + "grad_norm": 4.173101251395043, + "learning_rate": 1.9541822348490557e-08, + "loss": 0.2998, + "step": 51895 + }, + { + "epoch": 2.92, + "grad_norm": 4.485837962699446, + "learning_rate": 1.9397322828517695e-08, + "loss": 0.2903, + "step": 51900 + }, + { + "epoch": 2.92, + "grad_norm": 4.787076324755868, + "learning_rate": 1.9253358496283402e-08, + "loss": 0.3215, + "step": 51905 + }, + { + "epoch": 2.92, + "grad_norm": 4.871186103633835, + "learning_rate": 1.9109929367258085e-08, + "loss": 0.3316, + "step": 51910 + }, + { + "epoch": 2.93, + "grad_norm": 4.991265910841598, + "learning_rate": 1.8967035456852744e-08, + "loss": 0.3208, + "step": 51915 + }, + { + "epoch": 2.93, + "grad_norm": 5.979550366337759, + "learning_rate": 1.8824676780422323e-08, + "loss": 0.3278, + "step": 51920 + }, + { + "epoch": 2.93, + "grad_norm": 4.730352625739576, + "learning_rate": 1.868285335326403e-08, + "loss": 0.2971, + "step": 51925 + }, + { + "epoch": 2.93, + "grad_norm": 4.907837546497949, + "learning_rate": 1.854156519061734e-08, + "loss": 0.3384, + "step": 51930 + }, + { + "epoch": 2.93, + "grad_norm": 4.905632553223436, + "learning_rate": 1.840081230766455e-08, + "loss": 0.3027, + "step": 51935 + }, + { + "epoch": 2.93, + "grad_norm": 5.252186013716569, + "learning_rate": 1.826059471952968e-08, + "loss": 0.3252, + "step": 51940 + }, + { + "epoch": 2.93, + "grad_norm": 4.328358359509609, + "learning_rate": 1.8120912441280113e-08, + "loss": 0.3294, + "step": 51945 + }, + { + "epoch": 2.93, + "grad_norm": 4.526319560985917, + "learning_rate": 1.7981765487925517e-08, + "loss": 0.3154, + "step": 51950 + }, + { + "epoch": 2.93, + "grad_norm": 4.495100580443618, + "learning_rate": 1.7843153874416707e-08, + "loss": 0.315, + "step": 51955 + }, + { + "epoch": 2.93, + "grad_norm": 4.394620423427488, + "learning_rate": 1.7705077615649546e-08, + "loss": 0.3268, + "step": 51960 + }, + { + "epoch": 2.93, + "grad_norm": 4.330576473488113, + "learning_rate": 1.75675367264605e-08, + "loss": 0.3138, + "step": 51965 + }, + { + "epoch": 2.93, + "grad_norm": 5.411634681174137, + "learning_rate": 1.7430531221628856e-08, + "loss": 0.2898, + "step": 51970 + }, + { + "epoch": 2.93, + "grad_norm": 4.6029906575583235, + "learning_rate": 1.7294061115875615e-08, + "loss": 0.3093, + "step": 51975 + }, + { + "epoch": 2.93, + "grad_norm": 3.941728359034263, + "learning_rate": 1.715812642386683e-08, + "loss": 0.3075, + "step": 51980 + }, + { + "epoch": 2.93, + "grad_norm": 4.54498108787844, + "learning_rate": 1.7022727160207476e-08, + "loss": 0.3423, + "step": 51985 + }, + { + "epoch": 2.93, + "grad_norm": 4.263460908902632, + "learning_rate": 1.688786333944814e-08, + "loss": 0.2997, + "step": 51990 + }, + { + "epoch": 2.93, + "grad_norm": 4.327911088294999, + "learning_rate": 1.675353497608001e-08, + "loss": 0.2946, + "step": 51995 + }, + { + "epoch": 2.93, + "grad_norm": 5.103429089545876, + "learning_rate": 1.6619742084537094e-08, + "loss": 0.3457, + "step": 52000 + }, + { + "epoch": 2.93, + "grad_norm": 4.716292394950386, + "learning_rate": 1.6486484679196222e-08, + "loss": 0.314, + "step": 52005 + }, + { + "epoch": 2.93, + "grad_norm": 4.6793692727390495, + "learning_rate": 1.63537627743765e-08, + "loss": 0.3264, + "step": 52010 + }, + { + "epoch": 2.93, + "grad_norm": 4.481553135249161, + "learning_rate": 1.6221576384339855e-08, + "loss": 0.308, + "step": 52015 + }, + { + "epoch": 2.93, + "grad_norm": 4.387374326861135, + "learning_rate": 1.6089925523289917e-08, + "loss": 0.3005, + "step": 52020 + }, + { + "epoch": 2.93, + "grad_norm": 4.71940319027515, + "learning_rate": 1.5958810205372598e-08, + "loss": 0.3348, + "step": 52025 + }, + { + "epoch": 2.93, + "grad_norm": 4.928486382181252, + "learning_rate": 1.582823044467774e-08, + "loss": 0.3337, + "step": 52030 + }, + { + "epoch": 2.93, + "grad_norm": 4.455368405457012, + "learning_rate": 1.5698186255236338e-08, + "loss": 0.3363, + "step": 52035 + }, + { + "epoch": 2.93, + "grad_norm": 4.576769667525351, + "learning_rate": 1.5568677651022213e-08, + "loss": 0.2972, + "step": 52040 + }, + { + "epoch": 2.93, + "grad_norm": 4.150292120113141, + "learning_rate": 1.5439704645951458e-08, + "loss": 0.293, + "step": 52045 + }, + { + "epoch": 2.93, + "grad_norm": 4.48692076773266, + "learning_rate": 1.5311267253882988e-08, + "loss": 0.3439, + "step": 52050 + }, + { + "epoch": 2.93, + "grad_norm": 4.847504493376699, + "learning_rate": 1.5183365488617985e-08, + "loss": 0.3417, + "step": 52055 + }, + { + "epoch": 2.93, + "grad_norm": 5.880121552097477, + "learning_rate": 1.5055999363899898e-08, + "loss": 0.3102, + "step": 52060 + }, + { + "epoch": 2.93, + "grad_norm": 4.195617831175037, + "learning_rate": 1.4929168893415556e-08, + "loss": 0.2767, + "step": 52065 + }, + { + "epoch": 2.93, + "grad_norm": 4.925578652706051, + "learning_rate": 1.4802874090792396e-08, + "loss": 0.3218, + "step": 52070 + }, + { + "epoch": 2.93, + "grad_norm": 5.762875768608234, + "learning_rate": 1.467711496960178e-08, + "loss": 0.3129, + "step": 52075 + }, + { + "epoch": 2.93, + "grad_norm": 4.669080721503346, + "learning_rate": 1.4551891543356788e-08, + "loss": 0.3249, + "step": 52080 + }, + { + "epoch": 2.93, + "grad_norm": 4.627150354235175, + "learning_rate": 1.4427203825513881e-08, + "loss": 0.3025, + "step": 52085 + }, + { + "epoch": 2.93, + "grad_norm": 4.725344870799214, + "learning_rate": 1.430305182947067e-08, + "loss": 0.311, + "step": 52090 + }, + { + "epoch": 2.94, + "grad_norm": 4.652074377843964, + "learning_rate": 1.4179435568568156e-08, + "loss": 0.3137, + "step": 52095 + }, + { + "epoch": 2.94, + "grad_norm": 4.484482365196683, + "learning_rate": 1.40563550560896e-08, + "loss": 0.3242, + "step": 52100 + }, + { + "epoch": 2.94, + "grad_norm": 4.425388229050742, + "learning_rate": 1.3933810305260531e-08, + "loss": 0.3167, + "step": 52105 + }, + { + "epoch": 2.94, + "grad_norm": 4.787726268688387, + "learning_rate": 1.3811801329248198e-08, + "loss": 0.318, + "step": 52110 + }, + { + "epoch": 2.94, + "grad_norm": 4.592607809503968, + "learning_rate": 1.3690328141163777e-08, + "loss": 0.2852, + "step": 52115 + }, + { + "epoch": 2.94, + "grad_norm": 4.853999402913513, + "learning_rate": 1.3569390754060163e-08, + "loss": 0.3311, + "step": 52120 + }, + { + "epoch": 2.94, + "grad_norm": 5.188404494802519, + "learning_rate": 1.344898918093196e-08, + "loss": 0.2897, + "step": 52125 + }, + { + "epoch": 2.94, + "grad_norm": 5.148936810773478, + "learning_rate": 1.332912343471715e-08, + "loss": 0.3259, + "step": 52130 + }, + { + "epoch": 2.94, + "grad_norm": 4.511313299574433, + "learning_rate": 1.3209793528295988e-08, + "loss": 0.3132, + "step": 52135 + }, + { + "epoch": 2.94, + "grad_norm": 4.9012202971609575, + "learning_rate": 1.3090999474490995e-08, + "loss": 0.3054, + "step": 52140 + }, + { + "epoch": 2.94, + "grad_norm": 4.859070220167851, + "learning_rate": 1.2972741286066404e-08, + "loss": 0.2892, + "step": 52145 + }, + { + "epoch": 2.94, + "grad_norm": 4.455208602785613, + "learning_rate": 1.2855018975730937e-08, + "loss": 0.2877, + "step": 52150 + }, + { + "epoch": 2.94, + "grad_norm": 4.670421177321195, + "learning_rate": 1.2737832556133367e-08, + "loss": 0.313, + "step": 52155 + }, + { + "epoch": 2.94, + "grad_norm": 9.17953509234675, + "learning_rate": 1.2621182039866398e-08, + "loss": 0.3415, + "step": 52160 + }, + { + "epoch": 2.94, + "grad_norm": 6.275350171071507, + "learning_rate": 1.2505067439464446e-08, + "loss": 0.299, + "step": 52165 + }, + { + "epoch": 2.94, + "grad_norm": 7.868990414992129, + "learning_rate": 1.2389488767403646e-08, + "loss": 0.2755, + "step": 52170 + }, + { + "epoch": 2.94, + "grad_norm": 4.611307624850632, + "learning_rate": 1.227444603610517e-08, + "loss": 0.3087, + "step": 52175 + }, + { + "epoch": 2.94, + "grad_norm": 5.404459277641047, + "learning_rate": 1.2159939257929687e-08, + "loss": 0.3241, + "step": 52180 + }, + { + "epoch": 2.94, + "grad_norm": 4.927689272309279, + "learning_rate": 1.2045968445181799e-08, + "loss": 0.326, + "step": 52185 + }, + { + "epoch": 2.94, + "grad_norm": 4.219327943777228, + "learning_rate": 1.1932533610107821e-08, + "loss": 0.3204, + "step": 52190 + }, + { + "epoch": 2.94, + "grad_norm": 5.065673762240773, + "learning_rate": 1.1819634764897448e-08, + "loss": 0.308, + "step": 52195 + }, + { + "epoch": 2.94, + "grad_norm": 4.773472371014076, + "learning_rate": 1.1707271921682084e-08, + "loss": 0.2988, + "step": 52200 + }, + { + "epoch": 2.94, + "grad_norm": 4.840840010062998, + "learning_rate": 1.1595445092534852e-08, + "loss": 0.3004, + "step": 52205 + }, + { + "epoch": 2.94, + "grad_norm": 5.817942462967672, + "learning_rate": 1.1484154289472805e-08, + "loss": 0.3087, + "step": 52210 + }, + { + "epoch": 2.94, + "grad_norm": 4.820070402045558, + "learning_rate": 1.1373399524454154e-08, + "loss": 0.3346, + "step": 52215 + }, + { + "epoch": 2.94, + "grad_norm": 4.512201523229725, + "learning_rate": 1.1263180809380492e-08, + "loss": 0.3409, + "step": 52220 + }, + { + "epoch": 2.94, + "grad_norm": 4.606543231603245, + "learning_rate": 1.1153498156094567e-08, + "loss": 0.3329, + "step": 52225 + }, + { + "epoch": 2.94, + "grad_norm": 4.819383885136055, + "learning_rate": 1.1044351576383061e-08, + "loss": 0.3189, + "step": 52230 + }, + { + "epoch": 2.94, + "grad_norm": 4.7360271075100275, + "learning_rate": 1.0935741081974371e-08, + "loss": 0.317, + "step": 52235 + }, + { + "epoch": 2.94, + "grad_norm": 4.647507166704835, + "learning_rate": 1.0827666684538051e-08, + "loss": 0.3307, + "step": 52240 + }, + { + "epoch": 2.94, + "grad_norm": 4.793969473854846, + "learning_rate": 1.0720128395688145e-08, + "loss": 0.3193, + "step": 52245 + }, + { + "epoch": 2.94, + "grad_norm": 5.185595282548315, + "learning_rate": 1.0613126226979853e-08, + "loss": 0.3176, + "step": 52250 + }, + { + "epoch": 2.94, + "grad_norm": 4.888676946850519, + "learning_rate": 1.0506660189910645e-08, + "loss": 0.3143, + "step": 52255 + }, + { + "epoch": 2.94, + "grad_norm": 4.972644933108963, + "learning_rate": 1.040073029592137e-08, + "loss": 0.3138, + "step": 52260 + }, + { + "epoch": 2.94, + "grad_norm": 4.874024939395055, + "learning_rate": 1.0295336556394031e-08, + "loss": 0.2942, + "step": 52265 + }, + { + "epoch": 2.95, + "grad_norm": 5.098285409668428, + "learning_rate": 1.0190478982654573e-08, + "loss": 0.2995, + "step": 52270 + }, + { + "epoch": 2.95, + "grad_norm": 4.871984302664849, + "learning_rate": 1.0086157585970091e-08, + "loss": 0.2785, + "step": 52275 + }, + { + "epoch": 2.95, + "grad_norm": 4.91481131961047, + "learning_rate": 9.9823723775494e-09, + "loss": 0.3152, + "step": 52280 + }, + { + "epoch": 2.95, + "grad_norm": 4.44373163447765, + "learning_rate": 9.879123368545795e-09, + "loss": 0.2839, + "step": 52285 + }, + { + "epoch": 2.95, + "grad_norm": 4.412368546087253, + "learning_rate": 9.776410570053186e-09, + "loss": 0.3269, + "step": 52290 + }, + { + "epoch": 2.95, + "grad_norm": 4.655100645661409, + "learning_rate": 9.674233993109405e-09, + "loss": 0.3223, + "step": 52295 + }, + { + "epoch": 2.95, + "grad_norm": 5.410426394663395, + "learning_rate": 9.572593648692341e-09, + "loss": 0.3026, + "step": 52300 + }, + { + "epoch": 2.95, + "grad_norm": 4.617620243294774, + "learning_rate": 9.471489547724921e-09, + "loss": 0.3076, + "step": 52305 + }, + { + "epoch": 2.95, + "grad_norm": 4.415697482325816, + "learning_rate": 9.370921701071233e-09, + "loss": 0.2923, + "step": 52310 + }, + { + "epoch": 2.95, + "grad_norm": 4.7329934127292725, + "learning_rate": 9.27089011953708e-09, + "loss": 0.3323, + "step": 52315 + }, + { + "epoch": 2.95, + "grad_norm": 4.441815325896142, + "learning_rate": 9.171394813871081e-09, + "loss": 0.2931, + "step": 52320 + }, + { + "epoch": 2.95, + "grad_norm": 4.31469371107524, + "learning_rate": 9.072435794765244e-09, + "loss": 0.3393, + "step": 52325 + }, + { + "epoch": 2.95, + "grad_norm": 4.425477438367652, + "learning_rate": 8.974013072852727e-09, + "loss": 0.3225, + "step": 52330 + }, + { + "epoch": 2.95, + "grad_norm": 4.37568432517497, + "learning_rate": 8.87612665871007e-09, + "loss": 0.3176, + "step": 52335 + }, + { + "epoch": 2.95, + "grad_norm": 4.736481356056593, + "learning_rate": 8.77877656285442e-09, + "loss": 0.3103, + "step": 52340 + }, + { + "epoch": 2.95, + "grad_norm": 4.666221174640978, + "learning_rate": 8.681962795747955e-09, + "loss": 0.337, + "step": 52345 + }, + { + "epoch": 2.95, + "grad_norm": 4.437653898208052, + "learning_rate": 8.585685367792362e-09, + "loss": 0.3232, + "step": 52350 + }, + { + "epoch": 2.95, + "grad_norm": 4.629032618671705, + "learning_rate": 8.489944289333806e-09, + "loss": 0.2796, + "step": 52355 + }, + { + "epoch": 2.95, + "grad_norm": 4.488751992610083, + "learning_rate": 8.394739570660171e-09, + "loss": 0.2996, + "step": 52360 + }, + { + "epoch": 2.95, + "grad_norm": 4.709464236450522, + "learning_rate": 8.30007122200105e-09, + "loss": 0.3111, + "step": 52365 + }, + { + "epoch": 2.95, + "grad_norm": 4.706032455750089, + "learning_rate": 8.205939253529416e-09, + "loss": 0.3112, + "step": 52370 + }, + { + "epoch": 2.95, + "grad_norm": 4.891845461885052, + "learning_rate": 8.112343675359958e-09, + "loss": 0.2835, + "step": 52375 + }, + { + "epoch": 2.95, + "grad_norm": 4.664060810834802, + "learning_rate": 8.019284497550184e-09, + "loss": 0.3077, + "step": 52380 + }, + { + "epoch": 2.95, + "grad_norm": 4.362832373548632, + "learning_rate": 7.926761730099874e-09, + "loss": 0.3341, + "step": 52385 + }, + { + "epoch": 2.95, + "grad_norm": 5.453116542371152, + "learning_rate": 7.834775382950522e-09, + "loss": 0.3521, + "step": 52390 + }, + { + "epoch": 2.95, + "grad_norm": 4.435751272009442, + "learning_rate": 7.74332546598644e-09, + "loss": 0.2822, + "step": 52395 + }, + { + "epoch": 2.95, + "grad_norm": 5.41740543858173, + "learning_rate": 7.65241198903477e-09, + "loss": 0.3358, + "step": 52400 + }, + { + "epoch": 2.95, + "grad_norm": 4.461778169052866, + "learning_rate": 7.562034961864361e-09, + "loss": 0.3535, + "step": 52405 + }, + { + "epoch": 2.95, + "grad_norm": 4.081227209607926, + "learning_rate": 7.472194394186338e-09, + "loss": 0.3222, + "step": 52410 + }, + { + "epoch": 2.95, + "grad_norm": 4.629891181257991, + "learning_rate": 7.382890295654644e-09, + "loss": 0.3054, + "step": 52415 + }, + { + "epoch": 2.95, + "grad_norm": 4.315021931142176, + "learning_rate": 7.2941226758660445e-09, + "loss": 0.3311, + "step": 52420 + }, + { + "epoch": 2.95, + "grad_norm": 4.471789889424597, + "learning_rate": 7.205891544357912e-09, + "loss": 0.2736, + "step": 52425 + }, + { + "epoch": 2.95, + "grad_norm": 4.519041621841997, + "learning_rate": 7.118196910612107e-09, + "loss": 0.2956, + "step": 52430 + }, + { + "epoch": 2.95, + "grad_norm": 4.959552556071339, + "learning_rate": 7.031038784051092e-09, + "loss": 0.3395, + "step": 52435 + }, + { + "epoch": 2.95, + "grad_norm": 4.820161577926311, + "learning_rate": 6.944417174040707e-09, + "loss": 0.3033, + "step": 52440 + }, + { + "epoch": 2.95, + "grad_norm": 4.646316528705217, + "learning_rate": 6.858332089888509e-09, + "loss": 0.3119, + "step": 52445 + }, + { + "epoch": 2.96, + "grad_norm": 4.620750086621261, + "learning_rate": 6.7727835408448735e-09, + "loss": 0.3123, + "step": 52450 + }, + { + "epoch": 2.96, + "grad_norm": 6.262751220339147, + "learning_rate": 6.687771536103005e-09, + "loss": 0.3119, + "step": 52455 + }, + { + "epoch": 2.96, + "grad_norm": 4.820043019297438, + "learning_rate": 6.603296084796706e-09, + "loss": 0.3095, + "step": 52460 + }, + { + "epoch": 2.96, + "grad_norm": 4.497552218335965, + "learning_rate": 6.519357196003717e-09, + "loss": 0.3054, + "step": 52465 + }, + { + "epoch": 2.96, + "grad_norm": 4.920993053052893, + "learning_rate": 6.435954878744044e-09, + "loss": 0.3158, + "step": 52470 + }, + { + "epoch": 2.96, + "grad_norm": 4.3961457430899165, + "learning_rate": 6.353089141979407e-09, + "loss": 0.2819, + "step": 52475 + }, + { + "epoch": 2.96, + "grad_norm": 4.767300182498846, + "learning_rate": 6.270759994613795e-09, + "loss": 0.3227, + "step": 52480 + }, + { + "epoch": 2.96, + "grad_norm": 4.384253467656807, + "learning_rate": 6.188967445494021e-09, + "loss": 0.3101, + "step": 52485 + }, + { + "epoch": 2.96, + "grad_norm": 4.450868333634514, + "learning_rate": 6.1077115034091635e-09, + "loss": 0.291, + "step": 52490 + }, + { + "epoch": 2.96, + "grad_norm": 4.394315454485715, + "learning_rate": 6.026992177090019e-09, + "loss": 0.3246, + "step": 52495 + }, + { + "epoch": 2.96, + "grad_norm": 5.2815009120238345, + "learning_rate": 5.946809475210758e-09, + "loss": 0.3052, + "step": 52500 + }, + { + "epoch": 2.96, + "grad_norm": 5.843652894375007, + "learning_rate": 5.867163406387822e-09, + "loss": 0.303, + "step": 52505 + }, + { + "epoch": 2.96, + "grad_norm": 4.296658960721441, + "learning_rate": 5.788053979178254e-09, + "loss": 0.3044, + "step": 52510 + }, + { + "epoch": 2.96, + "grad_norm": 4.548167929631657, + "learning_rate": 5.709481202084144e-09, + "loss": 0.3288, + "step": 52515 + }, + { + "epoch": 2.96, + "grad_norm": 5.280104652256016, + "learning_rate": 5.631445083547071e-09, + "loss": 0.3156, + "step": 52520 + }, + { + "epoch": 2.96, + "grad_norm": 4.3821783390502755, + "learning_rate": 5.553945631953661e-09, + "loss": 0.3212, + "step": 52525 + }, + { + "epoch": 2.96, + "grad_norm": 4.791121977622194, + "learning_rate": 5.476982855630586e-09, + "loss": 0.3391, + "step": 52530 + }, + { + "epoch": 2.96, + "grad_norm": 4.748360894943294, + "learning_rate": 5.4005567628478975e-09, + "loss": 0.3202, + "step": 52535 + }, + { + "epoch": 2.96, + "grad_norm": 5.226065304793106, + "learning_rate": 5.324667361819025e-09, + "loss": 0.345, + "step": 52540 + }, + { + "epoch": 2.96, + "grad_norm": 4.316860487123996, + "learning_rate": 5.249314660697447e-09, + "loss": 0.3011, + "step": 52545 + }, + { + "epoch": 2.96, + "grad_norm": 5.358530785274255, + "learning_rate": 5.174498667580574e-09, + "loss": 0.3194, + "step": 52550 + }, + { + "epoch": 2.96, + "grad_norm": 4.555601613781942, + "learning_rate": 5.100219390507532e-09, + "loss": 0.3196, + "step": 52555 + }, + { + "epoch": 2.96, + "grad_norm": 4.747688110343156, + "learning_rate": 5.02647683746027e-09, + "loss": 0.3059, + "step": 52560 + }, + { + "epoch": 2.96, + "grad_norm": 4.743080291054537, + "learning_rate": 4.953271016362449e-09, + "loss": 0.3089, + "step": 52565 + }, + { + "epoch": 2.96, + "grad_norm": 5.2473504407222915, + "learning_rate": 4.880601935079998e-09, + "loss": 0.3375, + "step": 52570 + }, + { + "epoch": 2.96, + "grad_norm": 4.594031998831323, + "learning_rate": 4.808469601422783e-09, + "loss": 0.3303, + "step": 52575 + }, + { + "epoch": 2.96, + "grad_norm": 4.616023760796691, + "learning_rate": 4.736874023140714e-09, + "loss": 0.2979, + "step": 52580 + }, + { + "epoch": 2.96, + "grad_norm": 4.687626815992607, + "learning_rate": 4.665815207927083e-09, + "loss": 0.3303, + "step": 52585 + }, + { + "epoch": 2.96, + "grad_norm": 4.847525294640633, + "learning_rate": 4.595293163418002e-09, + "loss": 0.303, + "step": 52590 + }, + { + "epoch": 2.96, + "grad_norm": 4.721476917913135, + "learning_rate": 4.5253078971913e-09, + "loss": 0.3227, + "step": 52595 + }, + { + "epoch": 2.96, + "grad_norm": 5.0111436246551175, + "learning_rate": 4.455859416766517e-09, + "loss": 0.32, + "step": 52600 + }, + { + "epoch": 2.96, + "grad_norm": 4.345335916251948, + "learning_rate": 4.386947729606572e-09, + "loss": 0.3302, + "step": 52605 + }, + { + "epoch": 2.96, + "grad_norm": 4.778534890144306, + "learning_rate": 4.318572843116653e-09, + "loss": 0.3277, + "step": 52610 + }, + { + "epoch": 2.96, + "grad_norm": 4.9042433707296125, + "learning_rate": 4.25073476464366e-09, + "loss": 0.3049, + "step": 52615 + }, + { + "epoch": 2.96, + "grad_norm": 4.230490503426172, + "learning_rate": 4.183433501477318e-09, + "loss": 0.3013, + "step": 52620 + }, + { + "epoch": 2.97, + "grad_norm": 4.704792837084918, + "learning_rate": 4.116669060849065e-09, + "loss": 0.3188, + "step": 52625 + }, + { + "epoch": 2.97, + "grad_norm": 4.393275824440584, + "learning_rate": 4.050441449933162e-09, + "loss": 0.3141, + "step": 52630 + }, + { + "epoch": 2.97, + "grad_norm": 4.4533973250389804, + "learning_rate": 3.984750675846139e-09, + "loss": 0.2932, + "step": 52635 + }, + { + "epoch": 2.97, + "grad_norm": 4.40402148357871, + "learning_rate": 3.919596745646792e-09, + "loss": 0.3148, + "step": 52640 + }, + { + "epoch": 2.97, + "grad_norm": 5.35932916402804, + "learning_rate": 3.854979666336745e-09, + "loss": 0.3068, + "step": 52645 + }, + { + "epoch": 2.97, + "grad_norm": 4.059937107282575, + "learning_rate": 3.7908994448582205e-09, + "loss": 0.2972, + "step": 52650 + }, + { + "epoch": 2.97, + "grad_norm": 4.466068410612167, + "learning_rate": 3.727356088097378e-09, + "loss": 0.3167, + "step": 52655 + }, + { + "epoch": 2.97, + "grad_norm": 5.0576839610472435, + "learning_rate": 3.664349602882644e-09, + "loss": 0.3411, + "step": 52660 + }, + { + "epoch": 2.97, + "grad_norm": 4.451213919774457, + "learning_rate": 3.601879995984159e-09, + "loss": 0.3203, + "step": 52665 + }, + { + "epoch": 2.97, + "grad_norm": 5.314255888607907, + "learning_rate": 3.53994727411433e-09, + "loss": 0.3063, + "step": 52670 + }, + { + "epoch": 2.97, + "grad_norm": 4.765208803808081, + "learning_rate": 3.4785514439278357e-09, + "loss": 0.2994, + "step": 52675 + }, + { + "epoch": 2.97, + "grad_norm": 4.636140889536764, + "learning_rate": 3.41769251202273e-09, + "loss": 0.3257, + "step": 52680 + }, + { + "epoch": 2.97, + "grad_norm": 4.4474747672151915, + "learning_rate": 3.3573704849382273e-09, + "loss": 0.3077, + "step": 52685 + }, + { + "epoch": 2.97, + "grad_norm": 5.090214735496071, + "learning_rate": 3.297585369155809e-09, + "loss": 0.2882, + "step": 52690 + }, + { + "epoch": 2.97, + "grad_norm": 4.973183462293142, + "learning_rate": 3.2383371711003366e-09, + "loss": 0.3262, + "step": 52695 + }, + { + "epoch": 2.97, + "grad_norm": 4.475461560725476, + "learning_rate": 3.1796258971372727e-09, + "loss": 0.292, + "step": 52700 + }, + { + "epoch": 2.97, + "grad_norm": 4.6387364348574245, + "learning_rate": 3.121451553576571e-09, + "loss": 0.3205, + "step": 52705 + }, + { + "epoch": 2.97, + "grad_norm": 7.208933111887954, + "learning_rate": 3.0638141466687866e-09, + "loss": 0.3173, + "step": 52710 + }, + { + "epoch": 2.97, + "grad_norm": 4.362484976363597, + "learning_rate": 3.0067136826072986e-09, + "loss": 0.3412, + "step": 52715 + }, + { + "epoch": 2.97, + "grad_norm": 5.122931067733473, + "learning_rate": 2.9501501675277544e-09, + "loss": 0.3481, + "step": 52720 + }, + { + "epoch": 2.97, + "grad_norm": 4.30235071952696, + "learning_rate": 2.8941236075086256e-09, + "loss": 0.3356, + "step": 52725 + }, + { + "epoch": 2.97, + "grad_norm": 4.292343359567982, + "learning_rate": 2.838634008569541e-09, + "loss": 0.3063, + "step": 52730 + }, + { + "epoch": 2.97, + "grad_norm": 4.283475516396307, + "learning_rate": 2.783681376673508e-09, + "loss": 0.3132, + "step": 52735 + }, + { + "epoch": 2.97, + "grad_norm": 4.893696263082588, + "learning_rate": 2.729265717724694e-09, + "loss": 0.3132, + "step": 52740 + }, + { + "epoch": 2.97, + "grad_norm": 4.025191685581385, + "learning_rate": 2.675387037571753e-09, + "loss": 0.3127, + "step": 52745 + }, + { + "epoch": 2.97, + "grad_norm": 4.810919273149971, + "learning_rate": 2.6220453420028325e-09, + "loss": 0.305, + "step": 52750 + }, + { + "epoch": 2.97, + "grad_norm": 4.817797980037922, + "learning_rate": 2.5692406367505695e-09, + "loss": 0.318, + "step": 52755 + }, + { + "epoch": 2.97, + "grad_norm": 4.628643539625244, + "learning_rate": 2.516972927488759e-09, + "loss": 0.319, + "step": 52760 + }, + { + "epoch": 2.97, + "grad_norm": 4.781153106037272, + "learning_rate": 2.4652422198334638e-09, + "loss": 0.3039, + "step": 52765 + }, + { + "epoch": 2.97, + "grad_norm": 4.857922136843907, + "learning_rate": 2.41404851934357e-09, + "loss": 0.3052, + "step": 52770 + }, + { + "epoch": 2.97, + "grad_norm": 4.391891467239641, + "learning_rate": 2.3633918315207895e-09, + "loss": 0.3211, + "step": 52775 + }, + { + "epoch": 2.97, + "grad_norm": 4.229468119515766, + "learning_rate": 2.313272161807434e-09, + "loss": 0.3046, + "step": 52780 + }, + { + "epoch": 2.97, + "grad_norm": 4.756058296162218, + "learning_rate": 2.263689515589196e-09, + "loss": 0.3343, + "step": 52785 + }, + { + "epoch": 2.97, + "grad_norm": 4.644018764213784, + "learning_rate": 2.2146438981945908e-09, + "loss": 0.3093, + "step": 52790 + }, + { + "epoch": 2.97, + "grad_norm": 4.77380290534556, + "learning_rate": 2.1661353148932917e-09, + "loss": 0.3123, + "step": 52795 + }, + { + "epoch": 2.97, + "grad_norm": 5.019618943865824, + "learning_rate": 2.1181637708977963e-09, + "loss": 0.3366, + "step": 52800 + }, + { + "epoch": 2.98, + "grad_norm": 4.308653992855518, + "learning_rate": 2.0707292713634255e-09, + "loss": 0.3253, + "step": 52805 + }, + { + "epoch": 2.98, + "grad_norm": 4.49799096524608, + "learning_rate": 2.023831821386102e-09, + "loss": 0.3174, + "step": 52810 + }, + { + "epoch": 2.98, + "grad_norm": 4.347721066338038, + "learning_rate": 1.977471426006239e-09, + "loss": 0.2957, + "step": 52815 + }, + { + "epoch": 2.98, + "grad_norm": 4.643183311459069, + "learning_rate": 1.9316480902048517e-09, + "loss": 0.3246, + "step": 52820 + }, + { + "epoch": 2.98, + "grad_norm": 4.483571405517584, + "learning_rate": 1.8863618189057798e-09, + "loss": 0.3045, + "step": 52825 + }, + { + "epoch": 2.98, + "grad_norm": 5.039070869095936, + "learning_rate": 1.8416126169762405e-09, + "loss": 0.3266, + "step": 52830 + }, + { + "epoch": 2.98, + "grad_norm": 4.615760782256252, + "learning_rate": 1.7974004892234998e-09, + "loss": 0.3102, + "step": 52835 + }, + { + "epoch": 2.98, + "grad_norm": 4.414752377335334, + "learning_rate": 1.753725440398757e-09, + "loss": 0.288, + "step": 52840 + }, + { + "epoch": 2.98, + "grad_norm": 4.844827737767919, + "learning_rate": 1.7105874751949247e-09, + "loss": 0.2952, + "step": 52845 + }, + { + "epoch": 2.98, + "grad_norm": 4.5092265505590605, + "learning_rate": 1.6679865982482946e-09, + "loss": 0.3163, + "step": 52850 + }, + { + "epoch": 2.98, + "grad_norm": 4.936603015264989, + "learning_rate": 1.6259228141352058e-09, + "loss": 0.2973, + "step": 52855 + }, + { + "epoch": 2.98, + "grad_norm": 6.169628484505024, + "learning_rate": 1.5843961273764862e-09, + "loss": 0.3347, + "step": 52860 + }, + { + "epoch": 2.98, + "grad_norm": 4.294682213649803, + "learning_rate": 1.5434065424341227e-09, + "loss": 0.3081, + "step": 52865 + }, + { + "epoch": 2.98, + "grad_norm": 4.465428353067447, + "learning_rate": 1.5029540637123696e-09, + "loss": 0.2922, + "step": 52870 + }, + { + "epoch": 2.98, + "grad_norm": 4.152948023697802, + "learning_rate": 1.4630386955583054e-09, + "loss": 0.3113, + "step": 52875 + }, + { + "epoch": 2.98, + "grad_norm": 4.959260029969127, + "learning_rate": 1.4236604422612766e-09, + "loss": 0.3282, + "step": 52880 + }, + { + "epoch": 2.98, + "grad_norm": 4.470867675936348, + "learning_rate": 1.384819308051788e-09, + "loss": 0.3172, + "step": 52885 + }, + { + "epoch": 2.98, + "grad_norm": 4.351809370846714, + "learning_rate": 1.3465152971048335e-09, + "loss": 0.2982, + "step": 52890 + }, + { + "epoch": 2.98, + "grad_norm": 4.45804987173079, + "learning_rate": 1.3087484135348993e-09, + "loss": 0.2962, + "step": 52895 + }, + { + "epoch": 2.98, + "grad_norm": 4.646750164507272, + "learning_rate": 1.271518661400961e-09, + "loss": 0.3302, + "step": 52900 + }, + { + "epoch": 2.98, + "grad_norm": 4.431301545097962, + "learning_rate": 1.2348260447037074e-09, + "loss": 0.3226, + "step": 52905 + }, + { + "epoch": 2.98, + "grad_norm": 4.682177549735444, + "learning_rate": 1.1986705673849851e-09, + "loss": 0.3384, + "step": 52910 + }, + { + "epoch": 2.98, + "grad_norm": 4.311406088703032, + "learning_rate": 1.16305223333113e-09, + "loss": 0.2983, + "step": 52915 + }, + { + "epoch": 2.98, + "grad_norm": 4.276341251302369, + "learning_rate": 1.1279710463685256e-09, + "loss": 0.2895, + "step": 52920 + }, + { + "epoch": 2.98, + "grad_norm": 4.982763729718368, + "learning_rate": 1.0934270102674893e-09, + "loss": 0.3333, + "step": 52925 + }, + { + "epoch": 2.98, + "grad_norm": 4.569834812857314, + "learning_rate": 1.0594201287394967e-09, + "loss": 0.317, + "step": 52930 + }, + { + "epoch": 2.98, + "grad_norm": 4.413040368232051, + "learning_rate": 1.0259504054382918e-09, + "loss": 0.3102, + "step": 52935 + }, + { + "epoch": 2.98, + "grad_norm": 4.787770120033109, + "learning_rate": 9.930178439615523e-10, + "loss": 0.3153, + "step": 52940 + }, + { + "epoch": 2.98, + "grad_norm": 4.240075960406916, + "learning_rate": 9.606224478470039e-10, + "loss": 0.3179, + "step": 52945 + }, + { + "epoch": 2.98, + "grad_norm": 5.030678396334085, + "learning_rate": 9.287642205763059e-10, + "loss": 0.3061, + "step": 52950 + }, + { + "epoch": 2.98, + "grad_norm": 6.982543408386207, + "learning_rate": 8.97443165572276e-10, + "loss": 0.3448, + "step": 52955 + }, + { + "epoch": 2.98, + "grad_norm": 5.616099239800316, + "learning_rate": 8.666592862005552e-10, + "loss": 0.3785, + "step": 52960 + }, + { + "epoch": 2.98, + "grad_norm": 4.608389749842388, + "learning_rate": 8.36412585769053e-10, + "loss": 0.3058, + "step": 52965 + }, + { + "epoch": 2.98, + "grad_norm": 4.373584522790846, + "learning_rate": 8.067030675285026e-10, + "loss": 0.2898, + "step": 52970 + }, + { + "epoch": 2.98, + "grad_norm": 4.272426551394382, + "learning_rate": 7.775307346702399e-10, + "loss": 0.3309, + "step": 52975 + }, + { + "epoch": 2.99, + "grad_norm": 4.963509366439007, + "learning_rate": 7.488955903300898e-10, + "loss": 0.3251, + "step": 52980 + }, + { + "epoch": 2.99, + "grad_norm": 4.576542616274994, + "learning_rate": 7.207976375839254e-10, + "loss": 0.3009, + "step": 52985 + }, + { + "epoch": 2.99, + "grad_norm": 4.571072696034182, + "learning_rate": 6.932368794521083e-10, + "loss": 0.2837, + "step": 52990 + }, + { + "epoch": 2.99, + "grad_norm": 4.779269423741139, + "learning_rate": 6.662133188956033e-10, + "loss": 0.3154, + "step": 52995 + }, + { + "epoch": 2.99, + "grad_norm": 4.412823950048739, + "learning_rate": 6.397269588176436e-10, + "loss": 0.3195, + "step": 53000 + }, + { + "epoch": 2.99, + "grad_norm": 4.070608395115606, + "learning_rate": 6.137778020653962e-10, + "loss": 0.3061, + "step": 53005 + }, + { + "epoch": 2.99, + "grad_norm": 4.2741498365653285, + "learning_rate": 5.883658514266311e-10, + "loss": 0.3067, + "step": 53010 + }, + { + "epoch": 2.99, + "grad_norm": 6.115827711252612, + "learning_rate": 5.634911096319417e-10, + "loss": 0.3259, + "step": 53015 + }, + { + "epoch": 2.99, + "grad_norm": 4.9834486818431705, + "learning_rate": 5.391535793541902e-10, + "loss": 0.2915, + "step": 53020 + }, + { + "epoch": 2.99, + "grad_norm": 4.744970874173944, + "learning_rate": 5.153532632090619e-10, + "loss": 0.3187, + "step": 53025 + }, + { + "epoch": 2.99, + "grad_norm": 4.131062627866473, + "learning_rate": 4.920901637534004e-10, + "loss": 0.3105, + "step": 53030 + }, + { + "epoch": 2.99, + "grad_norm": 4.799085291103587, + "learning_rate": 4.693642834868728e-10, + "loss": 0.3261, + "step": 53035 + }, + { + "epoch": 2.99, + "grad_norm": 5.027244128098909, + "learning_rate": 4.471756248519699e-10, + "loss": 0.3194, + "step": 53040 + }, + { + "epoch": 2.99, + "grad_norm": 4.420099986937671, + "learning_rate": 4.255241902328955e-10, + "loss": 0.3221, + "step": 53045 + }, + { + "epoch": 2.99, + "grad_norm": 5.0019439651383815, + "learning_rate": 4.04409981956122e-10, + "loss": 0.3048, + "step": 53050 + }, + { + "epoch": 2.99, + "grad_norm": 4.583616091572451, + "learning_rate": 3.838330022903902e-10, + "loss": 0.3175, + "step": 53055 + }, + { + "epoch": 2.99, + "grad_norm": 4.124705438210518, + "learning_rate": 3.6379325344670925e-10, + "loss": 0.2786, + "step": 53060 + }, + { + "epoch": 2.99, + "grad_norm": 6.127207862512285, + "learning_rate": 3.4429073757835663e-10, + "loss": 0.3145, + "step": 53065 + }, + { + "epoch": 2.99, + "grad_norm": 5.165205522855713, + "learning_rate": 3.2532545678143346e-10, + "loss": 0.3198, + "step": 53070 + }, + { + "epoch": 2.99, + "grad_norm": 5.003536846773055, + "learning_rate": 3.06897413093199e-10, + "loss": 0.293, + "step": 53075 + }, + { + "epoch": 2.99, + "grad_norm": 5.247727744195595, + "learning_rate": 2.8900660849429105e-10, + "loss": 0.3262, + "step": 53080 + }, + { + "epoch": 2.99, + "grad_norm": 5.368846114283611, + "learning_rate": 2.716530449070609e-10, + "loss": 0.3289, + "step": 53085 + }, + { + "epoch": 2.99, + "grad_norm": 4.713329930175754, + "learning_rate": 2.5483672419612806e-10, + "loss": 0.3188, + "step": 53090 + }, + { + "epoch": 2.99, + "grad_norm": 4.432704527151101, + "learning_rate": 2.3855764816893555e-10, + "loss": 0.302, + "step": 53095 + }, + { + "epoch": 2.99, + "grad_norm": 4.291525387320038, + "learning_rate": 2.2281581857408474e-10, + "loss": 0.3071, + "step": 53100 + }, + { + "epoch": 2.99, + "grad_norm": 4.65358104887318, + "learning_rate": 2.0761123710300035e-10, + "loss": 0.3453, + "step": 53105 + }, + { + "epoch": 2.99, + "grad_norm": 4.556930459035139, + "learning_rate": 1.929439053904858e-10, + "loss": 0.3536, + "step": 53110 + }, + { + "epoch": 2.99, + "grad_norm": 4.5115042302969774, + "learning_rate": 1.7881382501139243e-10, + "loss": 0.3022, + "step": 53115 + }, + { + "epoch": 2.99, + "grad_norm": 4.52677531248776, + "learning_rate": 1.652209974850605e-10, + "loss": 0.313, + "step": 53120 + }, + { + "epoch": 2.99, + "grad_norm": 4.297324251384825, + "learning_rate": 1.5216542427143322e-10, + "loss": 0.3216, + "step": 53125 + }, + { + "epoch": 2.99, + "grad_norm": 4.749725198149976, + "learning_rate": 1.3964710677383253e-10, + "loss": 0.3707, + "step": 53130 + }, + { + "epoch": 2.99, + "grad_norm": 4.501658968668157, + "learning_rate": 1.2766604633729364e-10, + "loss": 0.3065, + "step": 53135 + }, + { + "epoch": 2.99, + "grad_norm": 5.143536651636263, + "learning_rate": 1.162222442491201e-10, + "loss": 0.3079, + "step": 53140 + }, + { + "epoch": 2.99, + "grad_norm": 4.347122517183747, + "learning_rate": 1.0531570173888395e-10, + "loss": 0.3091, + "step": 53145 + }, + { + "epoch": 2.99, + "grad_norm": 4.578423641347769, + "learning_rate": 9.494641997898069e-11, + "loss": 0.3018, + "step": 53150 + }, + { + "epoch": 2.99, + "grad_norm": 6.329463561818456, + "learning_rate": 8.511440008296402e-11, + "loss": 0.2982, + "step": 53155 + }, + { + "epoch": 3.0, + "grad_norm": 4.7254765739441025, + "learning_rate": 7.581964310776624e-11, + "loss": 0.3123, + "step": 53160 + }, + { + "epoch": 3.0, + "grad_norm": 4.745683841446329, + "learning_rate": 6.706215005203298e-11, + "loss": 0.3174, + "step": 53165 + }, + { + "epoch": 3.0, + "grad_norm": 4.64765029074995, + "learning_rate": 5.884192185723337e-11, + "loss": 0.3113, + "step": 53170 + }, + { + "epoch": 3.0, + "grad_norm": 4.49916323772985, + "learning_rate": 5.115895940599469e-11, + "loss": 0.3388, + "step": 53175 + }, + { + "epoch": 3.0, + "grad_norm": 4.726996100697955, + "learning_rate": 4.401326352432289e-11, + "loss": 0.3108, + "step": 53180 + }, + { + "epoch": 3.0, + "grad_norm": 4.976107560106654, + "learning_rate": 3.740483497993719e-11, + "loss": 0.3155, + "step": 53185 + }, + { + "epoch": 3.0, + "grad_norm": 4.470688031701695, + "learning_rate": 3.1333674482825205e-11, + "loss": 0.3463, + "step": 53190 + }, + { + "epoch": 3.0, + "grad_norm": 4.335144723373069, + "learning_rate": 2.579978268524297e-11, + "loss": 0.3014, + "step": 53195 + }, + { + "epoch": 3.0, + "grad_norm": 4.682211886983013, + "learning_rate": 2.080316018227002e-11, + "loss": 0.3189, + "step": 53200 + }, + { + "epoch": 3.0, + "grad_norm": 4.461975872545511, + "learning_rate": 1.6343807510699193e-11, + "loss": 0.3213, + "step": 53205 + }, + { + "epoch": 3.0, + "grad_norm": 4.09502696077276, + "learning_rate": 1.2421725149591723e-11, + "loss": 0.2776, + "step": 53210 + }, + { + "epoch": 3.0, + "grad_norm": 4.467293134379502, + "learning_rate": 9.036913520277246e-12, + "loss": 0.2935, + "step": 53215 + }, + { + "epoch": 3.0, + "grad_norm": 5.5623102076298645, + "learning_rate": 6.1893729869089144e-12, + "loss": 0.3194, + "step": 53220 + }, + { + "epoch": 3.0, + "grad_norm": 4.798586620085093, + "learning_rate": 3.879103854798061e-12, + "loss": 0.3053, + "step": 53225 + }, + { + "epoch": 3.0, + "grad_norm": 4.714632863457232, + "learning_rate": 2.106106372634642e-12, + "loss": 0.313, + "step": 53230 + }, + { + "epoch": 3.0, + "grad_norm": 4.561214232712145, + "learning_rate": 8.70380730821907e-13, + "loss": 0.332, + "step": 53235 + }, + { + "epoch": 3.0, + "grad_norm": 4.398089998615461, + "learning_rate": 1.7192706203150722e-13, + "loss": 0.2983, + "step": 53240 + }, + { + "epoch": 3.0, + "eval_loss": 1.1549195051193237, + "eval_runtime": 25.3927, + "eval_samples_per_second": 31.741, + "eval_steps_per_second": 3.978, + "step": 53244 + }, + { + "epoch": 3.0, + "step": 53244, + "total_flos": 1833005840596992.0, + "train_loss": 0.832810894061188, + "train_runtime": 128353.9377, + "train_samples_per_second": 6.637, + "train_steps_per_second": 0.415 + } + ], + "logging_steps": 5, + "max_steps": 53244, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 1833005840596992.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}